Add openpilot tests
tinygrad_repo/test/Dockerfile (new file, 12 lines)
@@ -0,0 +1,12 @@
FROM ubuntu:20.04

# Install python3.8, and pip3
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.8 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install python dependencies
COPY . ./tinygrad
WORKDIR tinygrad
RUN pip install -e .
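A quick smoke test of this image (a sketch, not part of the commit; the tinygrad-test tag is made up, and the build must run from the repository root since the Dockerfile does COPY . ./tinygrad):

  docker build -f test/Dockerfile -t tinygrad-test .
  docker run --rm tinygrad-test python3 -c "from tinygrad.tensor import Tensor; print((Tensor.ones(2,2)+1).numpy())"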
tinygrad_repo/test/__init__.py (new file, empty)
tinygrad_repo/test/external/dist/test_collectives.py (new file, vendored, 62 lines)
@@ -0,0 +1,62 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()

from extra.dist import collectives
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def allreduce_jit(t:Tensor, cache_id=None) -> Tensor:
  return collectives.allreduce(t, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2
SIZE_2 = 255 if not CI else 3

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.zeros(SIZE, SIZE) if rank != 0 else Tensor.ones(SIZE, SIZE)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test")
    assert np.allclose(np.ones((SIZE, SIZE)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  # reset jit
  allreduce_jit.cnt = 0
  allreduce_jit.input_replace = {}

  # test uneven chunk sizes
  for _ in range(3):
    # create a tensor to send
    t = Tensor.ones(SIZE_2, SIZE_2, SIZE_2) if rank == 0 else Tensor.zeros(SIZE_2, SIZE_2, SIZE_2)
    t2 = allreduce_jit(t.contiguous().realize(), cache_id="test2")
    assert np.allclose(np.ones((SIZE_2, SIZE_2, SIZE_2)), t2.numpy()), f"{t2.numpy()} wasn't ones"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    from tinygrad.runtime.ops_hip import HIP
    devices = [f"hip:{i}" for i in range(HIP.device_count)]
  else:
    from tinygrad.runtime.ops_gpu import CL
    devices = [f"gpu:{i}" for i in range(len(CL.devices))] if not CI else ["gpu:0", "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
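What the assertions encode: an allreduce leaves every rank holding the elementwise sum of all ranks' tensors; here rank 0 contributes ones and every other rank zeros, so both ranks must end up with all ones. Launching is just running the file (a sketch; device selection follows the HIP/CI handling above):

  python3 test/external/dist/test_collectives.py
  HIP=1 python3 test/external/dist/test_collectives.py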
tinygrad_repo/test/external/dist/test_world.py (new file, vendored, 68 lines)
@@ -0,0 +1,68 @@
from extra import dist
from tinygrad.jit import TinyJit
if __name__ == "__main__":
  dist.preinit()

from extra.dist import world
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import Tensor
import numpy as np

@TinyJit
def send_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.send(t, target_rank, cache_id=cache_id).realize()

@TinyJit
def recv_jit(t, target_rank, cache_id=None) -> Tensor:
  return world.recv(t, target_rank, cache_id=cache_id).realize()

SIZE = 2048 if not CI else 2

def run():
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.randn(SIZE, SIZE)

    # send to rank 1
    if rank == 0:
      send_jit(t, 1, cache_id="test")
    elif rank == 1:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 0, cache_id="test")

    # recv from rank 1
    if rank == 0:
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 1, cache_id="test2")
    elif rank == 1:
      send_jit(t2, 0, cache_id="test2")

    # check that the received tensor is the same as the sent tensor
    if rank == 0:
      assert np.allclose(t.numpy(), t2.numpy()), f"{t2.numpy()} wasn't equal to {t.numpy()}"

  print(f"rank {rank} passed")

if __name__ == "__main__":
  if getenv("HIP"):
    devices = ["hip:0", "hip:1"]
  else:
    devices = ["gpu:0", "gpu:1" if not CI else "gpu:0"]
  world_size = len(devices)

  dist.init_oob(world_size)

  processes = []
  for rank, device in enumerate(devices):
    processes.append(dist.spawn(rank, device, fn=run, args=()))
  for p in processes: p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0: exit(p.exitcode)
tinygrad_repo/test/external/external_copy_benchmark.py (new file, vendored, 27 lines)
@@ -0,0 +1,27 @@
import unittest
from tinygrad.helpers import prod
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import GlobalCounters
from tinygrad.jit import CacheCollector

class TestCopy(unittest.TestCase):
  def test_add1(self):
    pts = []
    for i in range(16384, 16384*256, 16384):
      t = Tensor.randn(i).realize()
      CacheCollector.start()
      t.assign(t+1).realize()
      fxn, args, _ = CacheCollector.finish()[0]
      GlobalCounters.reset()
      def run(): return fxn(args, force_wait=True)
      ct = min([run() for _ in range(10)])
      mb = prod(t.shape)*t.dtype.itemsize*2*1e-6
      print(f"{mb*1e3:.2f} kB, {ct*1e3:.2f} ms, {mb/ct:.2f} MB/s")
      pts.append((mb, mb/ct))
    from matplotlib import pyplot as plt
    plt.plot([x[0] for x in pts], [x[1] for x in pts])
    plt.show()

if __name__ == '__main__':
  unittest.main()
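On the arithmetic above: t.assign(t+1) both reads and writes every element, hence the factor of 2 in mb; at the smallest size that is 16384 floats x 4 bytes x 2, roughly 0.13 MB moved per kernel, and dividing by the best-of-10 wall time gives the plotted MB/s.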
tinygrad_repo/test/external/external_llama_eval.py (new file, vendored, 102 lines)
@@ -0,0 +1,102 @@
from lm_eval.base import BaseLM
from lm_eval import evaluator, tasks
import torch, json, argparse

from examples.llama import LLaMa
from tinygrad.tensor import Tensor
from tinygrad.ops import Device

class LLaMaAdaptor(BaseLM):
  def __init__(
    self,
    model_size="7B",
    model_gen=1,
    device="",
    quantize=False,
    batch_size=1,
    max_batch_size=1,
    do_sample=False,
    temperature=1.0,
    checkpoint_path="",
    tokenizer_path="",
  ):
    super().__init__()

    if batch_size is None:
      batch_size = 1
    self.do_sample = do_sample
    self.temperature = temperature
    self._device = device

    assert isinstance(model_gen, int)
    assert isinstance(model_size, str)
    assert isinstance(batch_size, int)
    assert isinstance(checkpoint_path, str)
    assert isinstance(tokenizer_path, str)

    self.llama = LLaMa.build(checkpoint_path, tokenizer_path, model_gen, model_size, quantize)

  @classmethod
  def create_from_arg_string(cls, arg_string, additional_config=None):
    kwargs = {el.split("=")[0]: el.split("=")[1] for el in arg_string.split(",")}
    return cls(**kwargs, **additional_config)

  @property
  def eot_token_id(self):
    # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
    return self.llama.tokenizer.eos_id()

  @property
  def max_length(self):
    return 1024

  @property
  def max_gen_toks(self):
    return 256

  @property
  def batch_size(self):
    return 1

  @property
  def device(self):
    return self._device

  def tok_encode(self, string: str):
    return [self.llama.tokenizer.bos_id()] + self.llama.tokenizer.encode(string)

  def tok_decode(self, tokens):
    return self.llama.tokenizer.decode(tokens)

  def _model_call(self, inps):
    Tensor.no_grad = True
    return torch.Tensor(self.llama.model(Tensor(inps.numpy()), 0).numpy())

  def greedy_until(self, requests):
    continuations = []
    for request in requests:
      prompt, until = request[0], request[1]['until']
      output = self.llama.greedy_until(prompt, until, max_length=128, temperature=0.0)
      continuations.append(output[len(prompt):])
    return continuations

  def _model_generate(self, context, max_length, eos_token_id):
    raise NotImplementedError()

if __name__ == '__main__':
  print(f"using {Device.DEFAULT} backend")

  parser = argparse.ArgumentParser(description='Run LLaMA evals in tinygrad', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--size', type=str, default="7B", help="Size of model to use [7B, 13B, 30B, 65B] for Gen 1, [7B, 13B] for Gen 2")
  parser.add_argument('--gen', type=int, default="1", help="Generation of the model to use [1, 2]")
  parser.add_argument('--quantize', action='store_true', help="Quantize the weights to int8 in memory")
  parser.add_argument('--eval', type=str, default="arc_easy", help="Run in evaluation mode")
  parser.add_argument('--limit', type=int, default=None, help="Limit tests in eval")
  parser.add_argument('--weights', type=str, default="./weights/LLaMa/", help="Location of the weights")
  parser.add_argument('--tokenizer', type=str, default="./weights/LLaMa/tokenizer.model", help="Location of the tokenizer")
  args = parser.parse_args()

  # run eval and exit
  adaptor = LLaMaAdaptor(model_gen=args.gen, model_size=args.size, quantize=args.quantize, checkpoint_path=args.weights, tokenizer_path=args.tokenizer, device="cpu")
  results = evaluator.evaluate(adaptor, tasks.get_task_dict(args.eval.split(",")), False, 0, args.limit)
  print(json.dumps(results, indent=2))
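A typical invocation, matching the argparse defaults above (a sketch; it assumes the lm-eval harness is installed and the weights and tokenizer exist at the given paths):

  python3 test/external/external_llama_eval.py --gen 1 --size 7B --eval arc_easy --limit 100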
tinygrad_repo/test/external/external_model_benchmark.py (new file, vendored, 128 lines)
@@ -0,0 +1,128 @@
import csv, pathlib, time, numpy as np
from os import getenv
import torch
torch.set_num_threads(1)
import onnx
from onnx.helper import tensor_dtype_to_np_dtype
import onnxruntime as ort
from onnx2torch import convert
from extra.utils import download_file
from extra.onnx import get_run_onnx
from tinygrad.helpers import OSX, DEBUG
from tinygrad.tensor import Tensor
from tinygrad.ops import Device

MODELS = {
  "resnet50": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
  "openpilot": "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx",
  "efficientnet": "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx",
  "shufflenet": "https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx",
  "commavq": "https://huggingface.co/commaai/commavq-gpt2m/resolve/main/gpt2m.onnx",

  # broken in torch MPS
  #"zfnet": "https://github.com/onnx/models/raw/main/vision/classification/zfnet-512/model/zfnet512-9.onnx",
  # TypeError: BatchNormalization() got an unexpected keyword argument 'is_test'
  #"densenet": "https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-3.onnx",
  # AssertionError: only onnx version >= 10 supported for slice
  #"bert": "https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-8.onnx",
  # really slow
  #"resnet18": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx",
}

CSV = {}
open_csv = None
torch.manual_seed(1)

def benchmark(mnm, nm, fxn):
  tms = []
  for _ in range(3):
    st = time.perf_counter_ns()
    ret = fxn()
    tms.append(time.perf_counter_ns() - st)
  print(f"{mnm:15s} {nm:25s} {min(tms)*1e-6:7.2f} ms")
  CSV[nm] = min(tms)*1e-6
  return min(tms), ret

#BASE = pathlib.Path(__file__).parents[2] / "weights" / "onnx"
BASE = pathlib.Path("/tmp/onnx")
def benchmark_model(m, validate_outs=False):
  global open_csv, CSV
  CSV = {"model": m}

  fn = BASE / MODELS[m].split("/")[-1]
  download_file(MODELS[m], fn)
  onnx_model = onnx.load(fn)
  output_names = [out.name for out in onnx_model.graph.output]
  excluded = {inp.name for inp in onnx_model.graph.initializer}
  input_shapes = {inp.name:tuple(x.dim_value if x.dim_value != 0 else 1 for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input if inp.name not in excluded}
  input_types = {inp.name: tensor_dtype_to_np_dtype(inp.type.tensor_type.elem_type) for inp in onnx_model.graph.input if inp.name not in excluded}
  #input_types = {k:v if v!=np.float16 else np.float32 for k,v in input_types.items()} # cast
  np_inputs = {k:torch.randn(shp).numpy().astype(input_types[k]) for k,shp in input_shapes.items()}
  assert len(input_shapes) < 30, f"too many input shapes {len(input_shapes)}"

  # print input names
  if DEBUG >= 2: print([inp.name for inp in onnx_model.graph.input if inp.name not in excluded])

  for device in ["METAL" if OSX else "GPU", "CLANG"]: # + (["CUDA"] if torch.cuda.is_available() else []):
    Device.DEFAULT = device
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})

    from tinygrad.jit import TinyJit
    tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
    for _ in range(3): {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}
    benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()})
    del inputs, tinygrad_model, tinygrad_jitted_model

  try:
    torch_model = convert(onnx_model)
    torch_inputs = [torch.tensor(x) for x in np_inputs.values()]
    benchmark(m, "torch_cpu", lambda: torch_model(*torch_inputs))

    torch_device = "mps" if OSX else "cuda"
    torch_mps_model = torch_model.to(torch_device)
    torch_mps_inputs = [x.to(torch_device) for x in torch_inputs]
    benchmark(m, f"torch_{torch_device}", lambda: torch_mps_model(*torch_mps_inputs))
  except Exception as e: print(f"{m:16s}onnx2torch {type(e).__name__:>25}")

  # bench onnxruntime
  ort_options = ort.SessionOptions()
  ort_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
  ort_options.log_severity_level = 3  # no warnings
  for backend in ["CPU", "CUDA" if not OSX else "CoreML"]: # https://onnxruntime.ai/docs/execution-providers/
    provider = backend+"ExecutionProvider"
    if provider not in ort.get_available_providers(): continue
    ort_sess = ort.InferenceSession(str(fn), ort_options, [provider])
    benchmark(m, f"onnxruntime_{backend.lower()}", lambda: ort_sess.run(output_names, np_inputs))
    del ort_sess

  if validate_outs:
    rtol, atol = 2e-3, 2e-3  # tolerance for fp16 models
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    tinygrad_out = tinygrad_model(inputs)

    ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
    onnx_out = ort_sess.run(output_names, np_inputs)
    onnx_out = dict(zip(output_names, onnx_out))

    assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
    print(f"{m:16s}outputs validated with rtol={rtol:.1e}, atol={atol:.1e}")

  if open_csv is None:
    open_csv = csv.DictWriter(open('onnx_inference_speed.csv', 'w', newline=''), fieldnames=list(CSV.keys()))
    open_csv.writeheader()
  open_csv.writerow(CSV)

def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5):
  assert len(tiny_out) == len(onnx_out) and tiny_out.keys() == onnx_out.keys()
  for k in tiny_out.keys():
    tiny_v, onnx_v = tiny_out[k], onnx_out[k]
    if tiny_v is None: assert tiny_v == onnx_v
    else: np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}")

if __name__ == "__main__":
  if getenv("MODEL", "") != "": benchmark_model(getenv("MODEL", ""), True)
  else:
    for m in MODELS: benchmark_model(m, True)
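The MODEL environment variable picks a single entry out of MODELS; left unset, every model is benchmarked and validated. For example (a sketch; openpilot is the supercombo model this commit is named for):

  MODEL=openpilot python3 test/external/external_model_benchmark.py

Timings land in onnx_inference_speed.csv, one row per model and one column per backend.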
tinygrad_repo/test/external/external_multi_gpu.py (new file, vendored, 70 lines)
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# cd disassemblers/ && git clone --recursive git@github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored
from tinygrad.helpers import Timing
from tinygrad.runtime.ops_gpu import CL

# TODO: support multidevice in cuda
device = 'gpu'

if __name__ == "__main__":
  sz = 1024*1024*256  # 1 GB
  #sz = 1024*64

  with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
    c0 = Tensor.ones(sz, device="cpu").realize()
    c1 = (Tensor.ones(sz, device="cpu")/2).realize()

  with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a0 = c0.to(f'{device}:0').realize()
    CL.synchronize()
  with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b1 = c1.to(f'{device}:1').realize()
    CL.synchronize()

  # cross copy. this is going through the CPU
  with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a1 = a0.to(f'{device}:1').realize()
    CL.synchronize()
  with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b0 = b1.to(f'{device}:0').realize()
    CL.synchronize()

  # sum
  with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab0 = (a0 + b0).realize()
    CL.synchronize()
  with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab1 = (a1 + b1).realize()
    CL.synchronize()

  # cross device sum (does this work?)
  # is this making a copy first? is that copy through the CPU?
  # the slowness comes from the *blocking* clprg call, is this pyopencl?
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx0 = (a0 + b1).realize()
    CL.synchronize()

  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx1 = (b1 + a0).realize()
    CL.synchronize()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc1 = ab1.numpy()

  # same
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # devices
  print(ab0)
  print(ab1)
  print(abx0)
  print(abx1)
tinygrad_repo/test/external/external_osx_profiling.py (new file, vendored, 41 lines)
@@ -0,0 +1,41 @@
from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
from tinygrad.helpers import dtypes
import time

N = 1000000
a = CLBuffer(N, dtypes.float32)
b = CLBuffer(N, dtypes.float32)
c = CLBuffer(N, dtypes.float32)

prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
  int idx = get_global_id(0);
  a[idx] = b[idx] + c[idx];
}""")
prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
t1 = time.monotonic_ns()
e1 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t2 = time.monotonic_ns()
time.sleep(3)
t3 = time.monotonic_ns()
e2 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t4 = time.monotonic_ns()

print(e1.profile.queued)
print(e1.profile.submit)
print(e1.profile.start)
print(e1.profile.end)

print(e1, e2)
print(t2-t1, e1.profile.end - e1.profile.start)
print(t4-t3, e2.profile.end - e2.profile.start)
print(t3-t2, e2.profile.queued-e1.profile.end)
print((t3-t2) / (e2.profile.start-e1.profile.end), "ratio")

print("ratio since boot", t1/e1.profile.start)

print(e1.profile.start)
print(e1.profile.end)
print(e2.profile.start)
print(e2.profile.end)
tinygrad_repo/test/external/external_test_allocator_on_models.py (new file, vendored, 125 lines)
@@ -0,0 +1,125 @@
#!/usr/bin/env python
import unittest, gc
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_state_dict
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad.ops import Device
from test.helpers import derandomize_model

from examples.llama import Transformer

ALLOCATED_DEV_BUFS = 0
class FakeDeviceBuffer:
  def __init__(self, sz, dt, device):
    self.id = 1
    self.size = sz
    self.dtype = dt
    self.device = device

    global ALLOCATED_DEV_BUFS
    ALLOCATED_DEV_BUFS += 1
class FakeAllocator(LRUAllocator):
  def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
  def _do_free(self, buf):
    buf.id -= 1
    assert buf.id == 0, f"Free should be called once, but {buf.id}"
  def __del__(self):  # Fake allocator should clear all buffers after each test.
    for v in self.cached_buffers.values():
      for buf, _ in v: self._free_buffer(buf)

FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
  def __init__(self, size, dtype, device='0'):
    global FAKE_GLOBAL_ALLOCATOR
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
    assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
class FakeProgram:
  def __init__(self, name:str, prg:str): pass
  def __call__(self, *bufs, global_size, local_size, wait=False): pass

def helper_test_correctness(gen, train):
  from tinygrad.runtime.ops_gpu import CL, CLAllocator
  old_alloc = CL.cl_allocator
  CL.cl_allocator = CLAllocator(0)
  no_alloc_result = train(*gen()).numpy()
  Device[Device.DEFAULT].synchronize()
  CL.cl_allocator = CLAllocator(512<<30)  # Test cache correctness, so cache as much as possible, 512gb
  for _ in range(4):
    GlobalCounters.reset()
    np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
    Device[Device.DEFAULT].synchronize()
  assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
  CL.cl_allocator = old_alloc

def __helper_test_alloc_count(gen, train):
  was_alloc = ALLOCATED_DEV_BUFS
  for _ in range(2):
    train(*gen())
  return ALLOCATED_DEV_BUFS - was_alloc

def helper_test_alloc_count(mm, gen, train):
  global FAKE_GLOBAL_ALLOCATOR
  backup_program = Device[Device.DEFAULT].runtime
  backup_buffer = Device[Device.DEFAULT].buffer
  Device[Device.DEFAULT].runtime = FakeProgram
  Device[Device.DEFAULT].buffer = FakeBuffer
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
  new_allocs = __helper_test_alloc_count(gen, train)
  Device[Device.DEFAULT].method_cache.clear()
  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
  old_allocs = __helper_test_alloc_count(gen, train)
  print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
  assert new_allocs < old_allocs, "Hmm, doesn't the cache work anymore?"
  Device[Device.DEFAULT].runtime = backup_program
  Device[Device.DEFAULT].buffer = backup_buffer
  FAKE_GLOBAL_ALLOCATOR = None

def check_gc():
  if Device.DEFAULT == "GPU":
    gc.collect()  # Need to collect Tensors.
    from extra.introspection import print_objects
    assert print_objects() == 0

class TestAllocators(unittest.TestCase):
  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def __test():
      model = Transformer(**args_tiny)
      derandomize_model(model)
      def test(t): return model(t, 0).realize()
      helper_test_correctness(lambda: (Tensor([[1,]]),), test)
    __test()
    Tensor.default_type = old_type
    check_gc()

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama_alloc_counts(self):
    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def test_alloc_count(t):
      model = Transformer(**args_tiny)
      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
      return model(t, 0).realize()
    helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
    check_gc()

  @unittest.skip("huge for CI")
  def test_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test)

if __name__ == "__main__":
  unittest.main()
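How the alloc-count check works: the same two training iterations run once under a FakeAllocator with a 16 GiB LRU cache (frees return buffers to the cache, so later requests are served without hitting _do_alloc) and once under a zero-sized cache (every request allocates fresh), and the assertion requires strictly fewer allocations in the cached run. Both LRU tests skip unless Device.DEFAULT is GPU, so they would be run as, e.g., GPU=1 python3 test/external/external_test_allocator_on_models.py.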
tinygrad_repo/test/external/external_test_embedding.py (new file, vendored, 8 lines)
@@ -0,0 +1,8 @@
from tinygrad.tensor import Tensor
from tinygrad.nn import Embedding

if __name__ == "__main__":
  vocab_size = 50257
  dim = 128
  test = Embedding(vocab_size, dim)
  ret = test(Tensor([[1,2,3]])).numpy()
tinygrad_repo/test/external/external_test_gpu_ast.py (new file, vendored, 208 lines)
@@ -0,0 +1,208 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps
from tinygrad.shape.shapetracker import ShapeTracker, View, ZeroView
from tinygrad.runtime.ops_gpu import GPUBuffer, CLProgram, CLCodegen
#from tinygrad.runtime.ops_metal import MetalBuffer as GPUBuffer, MetalProgram as CLProgram, MetalCodegen as CLCodegen
from tinygrad.helpers import getenv
from extra.lib_test_ast import test_ast

import platform
OSX = platform.system() == "Darwin"

def compile_and_test_ast(ast, local_size=None):
  k = CLCodegen(ast)
  prg = k.codegen().build(CLProgram)
  if local_size is not None: prg.local_size = local_size
  for i in range(5): prg(prg.lower(k.bufs))
  if getenv("TEST", 0): test_ast(k)

class TestAST(unittest.TestCase):
  def test_conv_zeroview_ast(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (2, 2, 2, 1), -3), ZeroView((1, 1, 1, 2), ((0, 1), (0, 1), (-1, 2), (-1, 3))), View((1, 1, 3, 4), (0, 0, 4, 1), 0)]))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 3, 4), views=[View((1, 1, 3, 4), (0, 0, 0, 0), 0)]))
    op1 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(UnaryOps.RELU, (op1,), None)
    compile_and_test_ast(ast)

  def test_cifar_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 64, 34, 34), (65536, 1024, 32, 1), -33), ZeroView((512, 64, 32, 32), ((0, 512), (0, 64), (-1, 33), (-1, 33))), View((512, 1, 128, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 64, 32, 32), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(512, 1, 128, 32, 32, 64, 3, 3), views=[View((512, 1, 128, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 64, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    ast = LazyOp(ReduceOps.SUM, (op0,), (512, 1, 128, 32, 32, 1, 1, 1))
    compile_and_test_ast(ast)

  def test_cifar_conv_backward(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 512, 10, 10), (64, 16384, 8, 1), -9), ZeroView((256, 512, 8, 8), ((0, 256), (0, 512), (-1, 9), (-1, 9))), View((256, 1, 512, 3, 3, 512, 8, 8), (51200, 51200, 0, 10, 1, 100, 10, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 256, 8, 8), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(256, 1, 512, 3, 3, 512, 8, 8), views=[View((256, 1, 512, 3, 3, 512, 8, 8), (0, 0, 64, 0, 0, 32768, 8, 1), 0)]), hostbuf=GPUBuffer(shape=(512, 512, 8, 8), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (256, 1, 512, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (256, 512, 3, 3))
    compile_and_test_ast(ast)

  def test_first_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 130, 258, 1, 12), (393216, 3072, 12, 12, 1), -3084), ZeroView((1, 128, 256, 1, 12), ((0, 1), (-1, 129), (-1, 257), (0, 1), (0, 12))), View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 6192, 24, 0, 0, 3096, 12, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 768, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 0, 0, 432, 4, 144, 16, 48, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 108, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_second_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 66, 130, 32, 1), (262144, 4096, 32, 1, 1), -4128), ZeroView((1, 64, 128, 32, 1), ((0, 1), (-1, 65), (-1, 129), (0, 32), (0, 1))), View((1, 64, 128, 8, 4, 1, 1, 3, 3), (266240, 4160, 32, 4, 1, 12480, 12480, 4160, 32), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 64, 128, 8, 4, 1, 1, 3, 3), (0, 0, 0, 36, 1, 0, 0, 12, 4), 0)]), hostbuf=GPUBuffer(shape=(8, 9, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    op6 = LazyOp(UnaryOps.RELU, (op5,), None)
    op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
    op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
    compile_and_test_ast(ast)

  def test_third_op_conv(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 4096, 32, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 0, 0, 128, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(4, 32, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 4, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 4, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(16,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op2,), (64, 512, 4))
    compile_and_test_ast(ast)

  # VALIDHACKS=1 IMAGE=2 DEBUG=4 PYTHONPATH="." GPU=1 OPT=2 python3 test/external_test_gpu_ast.py TestAST.test_reduce_op
  # 164 time 27.75 ms running re_S128_4 with [128] None count 4 runtime 1016.06 us 2.07 GFLOPS () -> (128, 1)
  # 169 time 22.51 ms running matmul with [4, 16, 128] [4, 16, 16] count 5 runtime 110.08 us 19.06 GFLOPS ('-DMATMUL',) -> (128, 1)
  def test_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 128, 4, 1, 1, 1, 1), (512, 512, 512, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    op3 = LazyOp(UnaryOps.RELU, (op2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  def test_alt_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
    compile_and_test_ast(ast)

  # re_S32_16_36_6 is fast
  def test_1x1_36_6(self): # 36 <- 6
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 1536, 24, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 0, 0, 0, 0, 96, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(36, 24, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(144,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
    op4 = LazyOp(UnaryOps.EXP, (op2,), None)
    op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
    op6 = LazyOp(BinaryOps.MAX, (op5,buf5,), None)
    op7 = LazyOp(BinaryOps.SUB, (op3,op6,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op7,), (32, 2304, 4))
    compile_and_test_ast(ast, None if OSX else (16, 16, 4))

  # re_S32_16_6_36 is slow
  def test_1x1_6_36(self): # 6 <- 36
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 9216, 144, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 2304, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 0, 0, 0, 0, 576, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 144, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 1536, 24, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
    op3 = LazyOp(BinaryOps.ADD, (op2,buf3,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op3,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 16, 4))

  # re_S32_16_6_24
  def test_1x1_6_24(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 6144, 96, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 1536, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 0, 0, 0, 0, 384, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 96, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
    #buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
    #op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 384, 4))
    compile_and_test_ast(ast, (6, 4, 8))

  def test_full_reduce_op(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (512, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
    op0 = LazyOp(BinaryOps.ADD, (buf0,buf1,), None)
    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([2.], dtype=np.float32)))
    op1 = LazyOp(BinaryOps.POW, (op0,buf2,), None)
    op2 = LazyOp(ReduceOps.SUM, (op1,), (1, 1))
    buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([0.5], dtype=np.float32)))
    op3 = LazyOp(BinaryOps.POW, (op2,buf3,), None)
    buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op4 = LazyOp(BinaryOps.SUB, (op3,buf4,), None)
    op5 = LazyOp(UnaryOps.RELU, (op4,), None)
    buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
    op6 = LazyOp(BinaryOps.ADD, (op5,buf5,), None)
    buf6 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([3.4e+38], dtype=np.float32)))
    op7 = LazyOp(BinaryOps.SUB, (op3,buf6,), None)
    op8 = LazyOp(UnaryOps.RELU, (op7,), None)
    op9 = LazyOp(BinaryOps.SUB, (op6,op8,), None)
    op10 = LazyOp(UnaryOps.RECIPROCAL, (op9,), None)
    ast = LazyOp(MovementOps.RESHAPE, (op10,), (1, 1))
    compile_and_test_ast(ast)

  def test_1239_reduce(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 64, 4), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 1024, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(1239, 256, 4), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1239, 4, 1, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (1, 1, 1, 1, 4956))
    compile_and_test_ast(ast)

  def test_enet_first_conv_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 3, 225, 225), (150528, 50176, 224, 1), 0), ZeroView((8, 3, 224, 224), ((0, 8), (0, 3), (0, 225), (0, 225))), View((8, 1, 32, 112, 112, 3, 3, 3), (151875, 151875, 0, 450, 2, 50625, 225, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 1, 32, 112, 112, 3, 3, 3), (0, 0, 27, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 3, 3), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (8, 1, 32, 112, 112, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (8, 32, 112, 112))
    compile_and_test_ast(ast)

  def test_enet_reduce_bs32(self):
    buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))
    buf1 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 1, 32, 3, 3, 32, 112, 112), (0, 12845056, 401408, 0, 0, 12544, 112, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 32, 1, 1, 32, 112, 112), force_create=True))
    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (3, 1, 32, 3, 3, 1, 1, 1))
    ast = LazyOp(MovementOps.RESHAPE, (op1,), (3, 32, 3, 3))
    compile_and_test_ast(ast)


if __name__ == '__main__':
  unittest.main()
tinygrad_repo/test/external/external_test_image.py (new file, vendored, 52 lines)
@@ -0,0 +1,52 @@
#!/usr/bin/env python
import os
import unittest
import numpy as np
if 'IMAGE' not in os.environ:
  os.environ['IMAGE'] = '2'
  os.environ['GPU'] = '1'
  os.environ['OPT'] = '2'
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
Tensor.no_grad = True

class TestImage(unittest.TestCase):
  def test_create_image(self):
    t = Tensor.ones(128, 128, 1)
    t = t.reshape(128, 32, 4) + 3
    t.realize()
    np.testing.assert_array_equal(t.numpy(), np.ones((128,32,4))*4)

  def test_sum_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t1.realize()
    t1 = t1.sum()
    t1.realize()
    assert t1.numpy() == 16*4*4*4, f"got {t1.numpy()}"

  def test_add_image(self):
    t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    t2 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 4
    t1.realize()
    t2.realize()
    t3 = t1 + t2
    t3.realize()
    np.testing.assert_array_equal(t3.numpy(), np.ones((16,4,4))*9)

  def test_padded_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    tiny_conv(tiny_dat).realize()

  def test_op_conv(self):
    bs, in_chans, out_chans = 1,12,32
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_dconv = Conv2d(out_chans, out_chans, 1, bias=None, padding=0)
    tiny_dat = Tensor.ones(bs, 12, 64, 128)
    p2 = tiny_conv(tiny_dat).relu()
    p2 = tiny_dconv(p2)
    p2.realize()

if __name__ == '__main__':
  unittest.main()
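Background for these tests: setting IMAGE=2 (plus GPU=1 and OPT=2) before tinygrad is imported enables the image-typed buffer path on the OpenCL backend, the codepath the openpilot model runs on; the reshapes to a trailing dimension of 4 above are what make the tensors eligible for image backing.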
tinygrad_repo/test/external/external_test_jit_on_models.py (new file, vendored, 45 lines)
@@ -0,0 +1,45 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.helpers import dtypes, CI
from tinygrad.ops import Device
from test.helpers import derandomize_model

from examples.llama import Transformer

def helper_test_jitted_correctness(gen, train, train_jit):
  nojit = train(*gen()).numpy()
  for _ in range(5): jit = train_jit(*gen()).numpy()
  np.testing.assert_allclose(nojit, jit, rtol=1e-3, atol=1e-5)

@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE, "needs JIT")
class TestJittedModels(unittest.TestCase):
  def test_jitted_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    derandomize_model(model)
    def test(t): return model(t, 0).realize()

    @TinyJit
    def test_jit(t): return model(t, 0).realize()
    helper_test_jitted_correctness(lambda: (Tensor([[1,]]),), test, test_jit)
    Tensor.default_type = old_type

  @unittest.skipUnless(not CI, "huge for CI")
  def test_jitted_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()

    @TinyJit
    def test_jit(t, t2): return model(t, 801, t2).realize()
    helper_test_jitted_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test, test_jit)

if __name__ == "__main__":
  unittest.main()
tinygrad_repo/test/external/external_test_onnx_backend.py (new file, vendored, 208 lines)
@@ -0,0 +1,208 @@
import unittest
from onnx.backend.base import Backend, BackendRep
import onnx.backend.test
import numpy as np
from tinygrad.tensor import Tensor
from typing import Any, Tuple
from tinygrad.helpers import getenv, CI

# pip3 install tabulate
pytest_plugins = 'onnx.backend.test.report',

from extra.onnx import get_run_onnx

class TinygradModel(BackendRep):
  def __init__(self, run_onnx, input_names):
    super().__init__()
    self.fxn = run_onnx
    self.input_names = input_names

  def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
    real_inputs = {k:v for k,v in zip(self.input_names, inputs)}
    ret = self.fxn(real_inputs, debug=True)
    return tuple(x.numpy() if isinstance(x, Tensor) else [i.numpy() for i in x] if isinstance(x, list) else np.array(x) for x in ret.values())

class TinygradBackend(Backend):
  @classmethod
  def prepare(cls, model, device):
    input_all = [x.name for x in model.graph.input]
    input_initializer = [x.name for x in model.graph.initializer]
    net_feed_input = [x for x in input_all if x not in input_initializer]
    print("prepare", cls, device, net_feed_input)
    run_onnx = get_run_onnx(model)
    return TinygradModel(run_onnx, net_feed_input)

  @classmethod
  def supports_device(cls, device: str) -> bool:
    return device == "CPU"

backend_test = onnx.backend.test.BackendTest(TinygradBackend, __name__)

# no support for reduce with multiply (needs llop)
backend_test.exclude('test_reduce_prod_*')

# TODO figure out why it's returning wrong values, geohotstan's uneducated guess is it's due to imprecision from float64 (double) -> float32
# see Type Constraints: https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Adam.html#type-constraints
backend_test.exclude('test_adam_multiple_cpu')
backend_test.exclude('test_nesterov_momentum_cpu')

# we only support float32
backend_test.exclude('uint8')
backend_test.exclude('uint16')
backend_test.exclude('uint32')
backend_test.exclude('uint64')
backend_test.exclude('int8')
backend_test.exclude('int16')
backend_test.exclude('float64')
backend_test.exclude('string')

backend_test.exclude('test_pow_types_int*')
backend_test.exclude('test_cast_*')
backend_test.exclude('test_castlike_*')
backend_test.exclude('test_convinteger_*')
backend_test.exclude('test_matmulinteger_*')

backend_test.exclude('test_reduce_log_sum_exp*')  # dependent on actual float64 implementation for backends
backend_test.exclude('test_operator_add*')  # dependent on float64 math. Without it values default to 0 or inf

# we don't support indexes
# backend_test.exclude('test_argmax_*') # Needs more work: select_last_index
# backend_test.exclude('test_argmin_*') # Needs more work: select_last_index
backend_test.exclude('test_nonzero_*')

# no support for mod
backend_test.exclude('test_mod_*')

# no boolean ops (2d, 3d, 4d)
backend_test.exclude('test_bitshift_*')

# no scatternd gathernd
backend_test.exclude('test_gathernd_*')
backend_test.exclude('test_scatternd_*')

# no quantize
backend_test.exclude('test_dynamicquantizelinear_*')
backend_test.exclude('test_qlinearmatmul_*')
backend_test.exclude('test_qlinearconv_*')
backend_test.exclude('test_quantizelinear_*')

# no rnn
backend_test.exclude('test_gru_*')
backend_test.exclude('test_rnn_*')
backend_test.exclude('test_lstm_*')
backend_test.exclude('test_simple_rnn_*')

# no control flow
backend_test.exclude('test_if_*')
backend_test.exclude('test_loop*')
backend_test.exclude('test_range_float_type_positive_delta_expanded_cpu')  # requires loop

# unsupported (strange) ops
backend_test.exclude('test_bitwise_*')
backend_test.exclude('test_blackmanwindow_*')
backend_test.exclude('test_bernoulli_*')
backend_test.exclude('test_cumsum_*')
backend_test.exclude('test_det_*')

backend_test.exclude('test_tril_zero_cpu')  # TODO: zero array support
backend_test.exclude('test_triu_zero_cpu')  # TODO: zero array support

backend_test.exclude('test_col2im_*')
backend_test.exclude('test_hammingwindow_*')
backend_test.exclude('test_hannwindow_*')
backend_test.exclude('test_hardmax_*')
backend_test.exclude('test_gridsample_*')
backend_test.exclude('test_dft_*')
backend_test.exclude('test_einsum_*')
backend_test.exclude('test_strnorm_*')
backend_test.exclude('test_unique_*')
backend_test.exclude('test_sequence_*')
backend_test.exclude('test_nonmaxsuppression_*')
backend_test.exclude('test_reversesequence_*')
backend_test.exclude('test_roialign_*')
backend_test.exclude('test_top_k_*')
backend_test.exclude('test_tfidfvectorizer_*')
backend_test.exclude('test_stft_*')
backend_test.exclude('test_melweightmatrix_*')

# more strange ops
backend_test.exclude('test_basic_deform_conv_*')
backend_test.exclude('test_deform_conv_*')
backend_test.exclude('test_lppool_*')
backend_test.exclude('test_depthtospace_*')
backend_test.exclude('test_spacetodepth_*')
backend_test.exclude('test_scan*')
backend_test.exclude('test_split_to_sequence_*')
backend_test.exclude('test_resize_downsample_scales_cubic_*')  # unsure how to implement cubic
backend_test.exclude('test_resize_downsample_sizes_cubic_*')  # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_scales_cubic_*')  # unsure how to implement cubic
backend_test.exclude('test_resize_upsample_sizes_cubic_*')  # unsure how to implement cubic

# rest of the failing tests
backend_test.exclude('test_averagepool_2d_dilations_cpu')  # dilations != 1 not supported for avgpool
backend_test.exclude('test_convtranspose_autopad_same_cpu')  # TODO geohotstan has no idea how this is done, autopad requires output_shape but output_shape requires pads from autopad
backend_test.exclude('test_optional_has_element_empty_optional_input_cpu')  # Attempts to create Tensor from None
backend_test.exclude('test_range_int32_type_negative_delta_expanded_cpu')  # AttributeProto.GRAPH not implemented
backend_test.exclude('test_reshape_allowzero_reordered_cpu')  # reshaping to 0 shape
backend_test.exclude('test_resize_downsample_scales_linear_antialias_cpu')  # antialias not implemented
backend_test.exclude('test_resize_downsample_sizes_linear_antialias_cpu')  # antialias not implemented
backend_test.exclude('test_resize_tf_crop_and_resize_cpu')  # unsure about fill value after clip
backend_test.exclude('test_operator_addconstant_cpu')  # bad data type

# issue 1556 https://github.com/tinygrad/tinygrad/issues/1556
backend_test.exclude('test_isinf_cpu')
backend_test.exclude('test_isinf_negative_cpu')
backend_test.exclude('test_isinf_positive_cpu')
backend_test.exclude('test_isnan_cpu')

# issue 1791 fast math messes with these https://github.com/tinygrad/tinygrad/issues/1791
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')

# issue 2067 potentially also a fastmath issue https://github.com/tinygrad/tinygrad/issues/2067
|
||||
if getenv('METAL'):
|
||||
backend_test.exclude('test_maxpool_2d_pads_cpu')
|
||||
backend_test.exclude('test_maxpool_2d_same_lower_cpu')
|
||||
|
||||
# Don't know how to treat special TensorProto like TensorProto.FLOAT8E4M3FN
|
||||
if getenv("CPU") or getenv("TORCH"):
|
||||
backend_test.exclude('test_dequantizelinear_axis_cpu')
|
||||
backend_test.exclude('test_dequantizelinear_cpu')
|
||||
|
||||
# compiled backends cannot reshape to and from 0
|
||||
if getenv('LLVM') or getenv('GPU') or getenv('CLANG') or getenv('METAL') or getenv('CUDA'):
|
||||
backend_test.exclude('test_slice_start_out_of_bounds_cpu')
|
||||
backend_test.exclude('test_constantofshape_int_shape_zero_cpu')
|
||||
|
||||
if getenv('GPU') or getenv('METAL'):
|
||||
backend_test.exclude('test_mish_cpu') # weird inaccuracy
|
||||
backend_test.exclude('test_mish_expanded_cpu') # weird inaccuracy
|
||||
backend_test.exclude('test_eyelike_with_dtype_cpu') # backend does not support dtype: Double
|
||||
|
||||
# Segfaults in CI
|
||||
if (getenv('LLVM') or getenv('CUDA')) and CI:
|
||||
backend_test.exclude('test_max_float16_cpu')
|
||||
backend_test.exclude('test_min_float16_cpu')
|
||||
|
||||
# disable model tests for now since they are slow
|
||||
if not getenv("MODELTESTS"):
|
||||
for x in backend_test.test_suite:
|
||||
if 'OnnxBackendRealModelTest' in str(type(x)):
|
||||
backend_test.exclude(str(x).split(" ")[0])
|
||||
else:
|
||||
# model tests all pass!
|
||||
backend_test.include('test_resnet50')
|
||||
backend_test.include('test_inception_v1')
|
||||
backend_test.include('test_inception_v2')
|
||||
backend_test.include('test_densenet121')
|
||||
backend_test.include('test_shufflenet')
|
||||
backend_test.include('test_squeezenet')
|
||||
backend_test.include('test_bvlc_alexnet')
|
||||
backend_test.include('test_zfnet512')
|
||||
backend_test.include('test_vgg19')
|
||||
|
||||
globals().update(backend_test.enable_report().test_cases)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
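# editor's note (illustrative, not part of the commit): exclude() appears to treat its
# argument as a regex matched against generated case names -- an assumption based on how
# both full names ('test_isnan_cpu') and bare substrings ('float64') are used above --
# so one pattern can drop a whole dtype across every operator, e.g.:
#   backend_test.exclude('uint(8|16|32|64)')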
392
tinygrad_repo/test/external/external_test_opt.py
vendored
Normal file
@@ -0,0 +1,392 @@
#!/usr/bin/env python
import os

import torch
if "OPT" not in os.environ:
  os.environ["OPT"] = "2"

import gc
import numpy as np

import unittest
from tinygrad.tensor import Tensor, Device
from tinygrad import nn
from tinygrad.helpers import getenv
from tinygrad.nn import optim
from tinygrad.helpers import GlobalCounters
from tinygrad.lazy import PUSH_PERMUTES
from tinygrad.jit import CacheCollector

class CLCache:
  def __init__(self, allowed=None, strict=False, preclear=True): self.allowed, self.strict, self.preclear = allowed, strict, preclear
  def __enter__(self):
    if self.preclear:
      gc.collect()
      for x in [x for x in gc.get_objects() if isinstance(x, Tensor)]:
        x.realize()
      GlobalCounters.reset()
    CacheCollector.start()
    print("cache: entering")
  def __exit__(self, type, value, traceback):
    cache = CacheCollector.finish()
    print(f"cache: exiting with size {len(cache)}", f"allowed {self.allowed}" if self.allowed is not None else "")
    if self.allowed is not None:
      assert len(cache) <= self.allowed and (not self.strict or len(cache) == self.allowed), f"used too many kernels! {len(cache)} > {self.allowed}"
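# editor's note -- a minimal sketch of the CLCache contract used throughout this file
# (illustrative only; the kernel budget here is an assumption for the example):
#   with CLCache(allowed=1):
#     d = Tensor.ones(4,4) * 2 + 1   # elementwise chain should fuse into one kernel
#     d.realize()                    # only realized work is counted; __exit__ asserts the budget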
from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.resnet import ResNet18
from models.vit import ViT
from tinygrad.nn.state import get_parameters

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestInferenceMinKernels(unittest.TestCase):
  def setUp(self):
    Tensor.training = False

  @unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
  def test_convnext(self):
    model = ConvNeXt()
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(129):
      model(img).realize()

  def test_enet(self):
    model = EfficientNet(getenv("ENET_NUM", 0), has_se=False)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(51):
      model.forward(img).realize()

  def test_enet_se(self):
    model = EfficientNet(getenv("ENET_NUM", 0), has_se=True)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    # TODO: this seems very high
    with CLCache(115):
      model.forward(img).realize()

  def test_resnet(self):
    model = ResNet18()
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(26):
      model.forward(img).realize()

  def test_vit(self):
    model = ViT(embed_dim=192, num_heads=3)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    img = Tensor.randn(1, 3, 224, 224)
    with CLCache(222): # NOTE: this is way too high
      out = model.forward(img)
      assert len(CacheCollector.cache) == 0, "ViT prerealized?"
      out.realize()

  def test_llama(self):
    from examples.llama import Transformer
    args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
    model = Transformer(**args_tiny)
    for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
    with CLCache(85):
      model(Tensor([[1,2,3,4]]), 0).realize()

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptBinOp(unittest.TestCase):
  def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      c = f1(a, b)
      if f2 is not None: d = f2(a, b)
      c.realize()
      if f2 is not None: d.realize()
      assert len(CacheCollector.cache) == allowed, "binop was rerun!"
    if f2 is not None: np.testing.assert_allclose(c.numpy().ravel(), d.numpy().ravel(), rtol=1e-3, atol=1e-5)

  def test_no_binop_rerun(self): return self._test_no_binop_rerun(lambda a,b: a*b, lambda a,b: (a*b).reshape(16, 16, 1))
  def test_no_binop_rerun_alt(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(16, 16, 1), lambda a,b: a*b)
  def test_no_binop_rerun_reduce_broadcast(self): return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=2)
  @unittest.skip("this test started failing with the new change, based movementop issue")
  def test_no_binop_rerun_transposed(self): return self._test_no_binop_rerun(lambda a,b: (a.T*b.T).T, lambda a,b: a*b)
  def test_no_binop_rerun_mid_reshape(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(256)+a.reshape(256))

  # currently non-working tests
  #def test_no_binop_rerun_preshape(self): return self._test_no_binop_rerun(lambda a,b: a.reshape(16, 16, 1)*b.reshape(16, 16, 1), lambda a,b: a*b)
  #def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
  #def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptReduceLoop(unittest.TestCase):
  @unittest.skip("this is broken")
  def test_loop_left(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      t = a.sum(0)
      b = t.reshape(16,1).expand(16,16).sum(0)
      c = (t+b)
      c.realize()
      assert len(CacheCollector.cache) == 2, "loop left fusion broken"

  def test_loop_right(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      t = a.sum(0)
      b = t.reshape(16,1).expand(16,16).sum(0)
      c = (b+t)
      c.realize()
      assert len(CacheCollector.cache) == 2, "loop right fusion broken"

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptWChild(unittest.TestCase):
  def test_unrealized_child(self):
    a = Tensor.randn(16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      c = (a*b).sum()
      d = c+1
      e = c+2  # never realized; this extra child of c should block fusing c into d
      d.realize()
      assert len(CacheCollector.cache) == 2, "don't fuse if you have children"

@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOpt(unittest.TestCase):
  def test_muladd(self):
    a,b,c = [Tensor.ones(2,2) for _ in range(3)]
    with CLCache():
      d = a * b + c
      d.realize()
      assert len(CacheCollector.cache) == 1, "optimizer didn't fold muladd"
    np.testing.assert_allclose(d.numpy(), np.ones((2,2))*2, rtol=1e-5)

  def test_fold_reduce_elementwise(self):
    img = Tensor.ones(32)
    addme = Tensor.ones(1)
    with CLCache():
      ret = img.sum() + addme
      ret.realize()
      assert len(CacheCollector.cache) == 1, "optimizer didn't fold reduce/elementwise"
    assert ret.numpy()[0] == 33

  def test_fold_batchnorm(self):
    with Tensor.train():
      img = Tensor.ones(1,32,4,4)
      bn = nn.BatchNorm2d(32, track_running_stats=False)
      with CLCache():
        img_bn = bn(img).realize()
        print(img_bn)
        assert len(CacheCollector.cache) == 3, f"optimizer didn't fold batchnorm, got {len(CacheCollector.cache)}"

  def test_fold_conv_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,4,4)
      c1 = nn.Conv2d(3,32,3)
      opt = optim.SGD(get_parameters(c1))
      with CLCache():
        opt.zero_grad()
        c1(img).relu().sum().backward()
        opt.step()
        # TODO: this should be 4, but the sum output child stays around
        # with pushing_permutes it can be 3
        # TODO: broken with optim fixes
        assert len(CacheCollector.cache) in [4,5,6], f"optimizer didn't fold conv-backward SGD, got {len(CacheCollector.cache)}"

  def test_fold_2convs_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,64,64)
      c1 = nn.Conv2d(3,16,3,bias=False)
      c2 = nn.Conv2d(16,32,3,bias=False)
      opt = optim.SGD(get_parameters([c1, c2]))
      with CLCache(allowed=9):
        opt.zero_grad()
        c2(c1(img).relu()).relu().sum().backward()
        opt.step()

  def test_fold_4convs_sgd(self):
    with Tensor.train():
      img = Tensor.ones(2,3,64,64)
      c1 = nn.Conv2d(3,4,3,bias=False)
      c2 = nn.Conv2d(4,8,3,bias=False)
      c3 = nn.Conv2d(8,16,3,bias=False)
      c4 = nn.Conv2d(16,32,3,bias=False)
      opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
      with CLCache(allowed=19):
        opt.zero_grad()
        c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
        opt.step()

  def test_fold_conv_batchnorm_sgd(self):
    with Tensor.train():
      img = Tensor.ones(1,3,4,4)
      c1 = nn.Conv2d(3,32,3)
      bn = nn.BatchNorm2d(32, track_running_stats=False)
      opt = optim.SGD(get_parameters([c1, bn]))
      with CLCache(allowed=18): # this is too high
        img_bn = bn(c1(img)).elu().sum()
        opt.zero_grad()
        img_bn.backward()
        opt.step()

  def test_fold_conv_batchnorm_notrain(self):
    img = Tensor.ones(1,3,8,8)
    c1 = nn.Conv2d(3,32,3)
    bn = nn.BatchNorm2d(32, track_running_stats=False)
    # precache the bn
    img_conv = bn(c1(img)).relu().realize()
    with CLCache():
      img_conv = bn(c1(img)).relu().realize()
      assert len(CacheCollector.cache) == 1, f"optimizer didn't fold conv-batchnorm at test time, got {len(CacheCollector.cache)}"

  def test_fold_conv_batchnorm(self):
    with Tensor.train():
      img = Tensor.ones(1,3,8,8)
      c1 = nn.Conv2d(3,32,3)
      bn = nn.BatchNorm2d(32, track_running_stats=False)
      with CLCache():
        img_conv = bn(c1(img)).relu().realize()
        print(img_conv)
        assert len(CacheCollector.cache) == 4, f"optimizer didn't fold conv-batchnorm, got {len(CacheCollector.cache)}"

  def test_fold_conv_elu(self):
    img = Tensor.ones(1,4,8,8)
    c1 = nn.Conv2d(4, 4, kernel_size=3)
    c2 = nn.Conv2d(4, 4, kernel_size=3)
    with CLCache():
      img_conv = img.sequential([c1, Tensor.elu, c2, Tensor.elu]).realize()
      print(img_conv)
      assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/elu"

  def test_fold_conv_relu(self):
    img = Tensor.ones(1,4,8,8)
    c1 = nn.Conv2d(4, 4, kernel_size=3)
    c2 = nn.Conv2d(4, 4, kernel_size=3)
    with CLCache():
      img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
      print(img_conv)
      assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"

  def test_fold_conv_relu_nobias(self):
    img = Tensor.ones(1,4,8,8)
    c1 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
    c2 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
    with CLCache():
      img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
      print(img_conv)
      assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"

  def test_permute_was_pushed(self):
    a = Tensor.randn(16, 16, 16)
    with CLCache():
      c = a.sum(2)
      d = c.permute(1,0).contiguous()
      d.realize()
      cache_len = len(CacheCollector.cache)
    np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
    if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"

  def test_permute_was_pushed_through_contract_reshape(self):
    a = Tensor.randn(4, 4, 4, 4, 4)
    with CLCache():
      c = a.sum(-1)
      d = c.reshape(16,16).permute(1,0).contiguous()
      d.realize()
      cache_len = len(CacheCollector.cache)
    np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,16).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
    if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"

  def test_permute_was_pushed_through_contractw1s_reshape(self):
    a = Tensor.randn(4, 4, 4, 4, 4)
    with CLCache():
      c = a.sum(-1)
      d = c.reshape(16,1,16).permute(2,1,0).contiguous()
      d.realize()
      cache_len = len(CacheCollector.cache)
    np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,1,16).transpose(2,1,0), d.numpy(), rtol=1e-3, atol=1e-5)
    if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"

  # TODO: push permute through expansion reshape
  @unittest.skip("expansion can't push expand permute yet")
  @unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
  def test_permute_was_pushed_through_expand_reshape(self):
    a = Tensor.randn(16, 16, 16)
    with CLCache():
      c = a.sum(2)
      d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
      d.realize()
      cache_len = len(CacheCollector.cache)
    np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0).reshape(4,4,4,4), d.numpy(), rtol=1e-3, atol=1e-5)
    if PUSH_PERMUTES: assert cache_len == 1, "permute wasn't pushed!"

  @unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
  def test_no_reduceop_rerun(self):
    a = Tensor.randn(16, 16, 16)
    with CLCache():
      c = a.sum(2)
      d = a.sum(2).permute(1,0)
      c.realize()
      d.realize()
      cache_len = len(CacheCollector.cache)
    np.testing.assert_allclose(c.numpy().transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
    assert cache_len == 1, "reduceop was rerun!"

  @unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
  def test_no_reduceop_rerun_alt(self):
    a = Tensor.randn(16, 16, 16)
    with CLCache():
      c = a.sum(2).permute(1,0)
      d = a.sum(2)
      c.realize()
      d.realize()
      cache_len = len(CacheCollector.cache)
    np.testing.assert_allclose(c.numpy(), d.numpy().transpose(1,0), rtol=1e-3, atol=1e-5)
    assert cache_len == 1, "reduceop was rerun!"

  def test_fold_with_contiguous(self):
    a = Tensor.randn(16, 16, 16)
    b = Tensor.randn(16, 16)
    with CLCache():
      c = (a.sum(2).contiguous() + b).contiguous()
      c.realize()
      cache_len = len(CacheCollector.cache)
    assert cache_len == 1, "contiguous wasn't folded"

  def _test_fold_expand_reduce_helper(self, n, m, axis, allowed):
    b = torch.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
    with CLCache(allowed=allowed):
      a = Tensor.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
      a.realize()
      cache_len = len(CacheCollector.cache)
    np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
    return cache_len

  def test_expand_reduce_is_folded_on_same_axis(self):
    # NOTE: the early `return` inside this loop in the original cut the sweep short
    # after the first (axis, n) pair; all combinations are checked now.
    for axis in [0, 1]:
      for n in [4, 8, 16]:
        b = torch.ones(n, n).sum(axis).reshape(n, 1).expand(n, n).sum(axis)
        with CLCache(allowed=2):
          a = Tensor.ones(n, n).sum(axis).reshape(n, 1).expand(n, n).sum(axis)
          a.realize()
        np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)

  def test_expand_reduce_is_not_folded_on_different_axes(self):
    axis1, axis2 = 0, 1
    for n in [4, 8, 16]:
      b = torch.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
      with CLCache(allowed=3):
        a = Tensor.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
        a.realize()
      np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)

if __name__ == '__main__':
  unittest.main()
75
tinygrad_repo/test/external/external_test_optim.py
vendored
Normal file
@@ -0,0 +1,75 @@
#!/usr/bin/env python
import unittest
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import LAMB

np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)

class TinyNet:
  def __init__(self):
    self.x = Tensor(x_init.copy(), requires_grad=True)
    self.W = Tensor(W_init.copy(), requires_grad=True)
    self.m = Tensor(m_init.copy())

  def forward(self):
    out = self.x.matmul(self.W).relu()
    out = out.log_softmax(1)
    out = out.mul(self.m).add(self.m).sum()
    return out

class TinyNetTF:
  def __init__(self):
    self.x = tf.Variable(x_init.copy(), trainable=True)
    self.W = tf.Variable(W_init.copy(), trainable=True)
    self.m = tf.constant(m_init.copy())

  def forward(self):
    out = tf.matmul(self.x, self.W)
    out = tf.nn.relu(out)
    out = tf.nn.log_softmax(out, axis=1)
    out = tf.multiply(out, self.m) + self.m
    out = tf.reduce_sum(out)
    return out

def step(optim, steps=1, kwargs={}):
  net = TinyNet()
  optim = optim([net.x, net.W], **kwargs)
  for _ in range(steps):
    out = net.forward()
    optim.zero_grad()
    out.backward()
    optim.step()
  return net.x.detach().numpy(), net.W.detach().numpy()

def step_tf(optim, steps=1, kwargs={}):
  net = TinyNetTF()
  optim = optim(**kwargs)
  for _ in range(steps):
    with tf.GradientTape() as tape:
      out = net.forward()
    grads = tape.gradient(out, [net.x, net.W])
    optim.apply_gradients(zip(grads, [net.x, net.W]))
  return net.x.numpy(), net.W.numpy()
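# editor's note: because TinyNet and TinyNetTF start from the same x_init/W_init, the two
# trajectories are directly comparable; a sketch of the check the class below automates:
#   x_tg, W_tg = step(LAMB, kwargs={'lr': 0.001})
#   x_tf, W_tf = step_tf(tfa.optimizers.LAMB, kwargs={'lr': 0.001})
#   np.testing.assert_allclose(x_tg, x_tf, atol=1e-5, rtol=0)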
class ExternalTestOptim(unittest.TestCase):
  def _test_optim(self, tinygrad_optim, tensorflow_optim, steps, opts, atol, rtol):
    for x,y in zip(step(tinygrad_optim, steps, kwargs=opts),
                   step_tf(tensorflow_optim, steps, kwargs=opts)):
      np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)

  def _test_lamb(self, steps, opts, atol, rtol): self._test_optim(LAMB, tfa.optimizers.LAMB, steps, opts, atol, rtol)

  def test_lamb(self): self._test_lamb(1, {'lr': 0.001}, 1e-5, 0)
  def test_lamb_high_lr(self): self._test_lamb(1, {'lr': 10}, 1e-5, 1e-5)

  def test_multistep_lamb(self): self._test_lamb(10, {'lr': 0.001}, 1e-5, 0)
  def test_multistep_lamb_high_lr(self): self._test_lamb(10, {'lr': 10}, 1e-5, 3e-4)

if __name__ == '__main__':
  unittest.main()
57
tinygrad_repo/test/external/external_test_speed_llama.py
vendored
Normal file
@@ -0,0 +1,57 @@
# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
import unittest, time
import numpy as np
from examples.llama import Transformer, MODEL_PARAMS
from test.test_net_speed import start_profile, stop_profile
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.ops import Compiled
from tinygrad.helpers import dtypes, prod
from tinygrad.runtime.lib import RawBuffer

class FakeProgram:
  def __init__(self, name:str, prg:str): pass
  def __call__(self, *bufs, global_size, local_size, wait=False): pass

class RawFakeBuffer(RawBuffer):
  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
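# editor's note: with FakeProgram and RawFakeBuffer swapped in below, kernels are compiled
# and "launched" as no-ops, so the timings isolate python-side codegen and scheduling
# overhead from any real device execution.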
class TestLLaMASpeed(unittest.TestCase):
  @unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
  def test_llama_compile(self):
    backup_program = Device[Device.DEFAULT].runtime
    backup_buffer = Device[Device.DEFAULT].buffer
    Device[Device.DEFAULT].runtime = FakeProgram
    Device[Device.DEFAULT].buffer = RawFakeBuffer

    print("testing llama python run time")
    model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
    print("built model")
    # assign fake tensors to the values
    for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
    print("assigned empty tensors, doing warmup")

    def run_llama(st, empty_method_cache=True):
      if empty_method_cache: Device[Device.DEFAULT].method_cache.clear()
      tms = [time.perf_counter()]
      for i in range(10):
        model(Tensor([[2]]), i).realize()
        tms.append(time.perf_counter())
      timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
      print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))

    run_llama("codegen")
    run_llama("methodcache", False)

    pr = start_profile()
    run_llama("profile")
    stop_profile(pr, sort='time', frac=0.1)

    Device[Device.DEFAULT].runtime = backup_program
    Device[Device.DEFAULT].buffer = backup_buffer

if __name__ == '__main__':
  unittest.main()
44
tinygrad_repo/test/external/external_test_uops_graphing.py
vendored
Normal file
@@ -0,0 +1,44 @@
#!/usr/bin/env python
import unittest
from tinygrad.tensor import Tensor
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.renderer.opencl import OpenCLRenderer
from tinygrad.graph import graph_uops
from tinygrad.nn import Conv2d

class TestUopsGraph(unittest.TestCase):
  def test_matmul(self):
    N = 1024
    a = Tensor.rand(N,N)
    b = Tensor.rand(N,N)
    si = (a@b).lazydata.schedule()[-1]
    lin = Linearizer(si.ast)
    lin.hand_coded_optimizations()
    print(lin.colored_shape())
    uops = lin.linearize().uops
    graph_uops(uops)
    for u in uops: print(u)
    print(OpenCLRenderer("matmul", uops)[0])

  def test_reduce(self):
    a = Tensor.rand(1024*1024)
    si = a.sum().lazydata.schedule()[-1]
    lin = Linearizer(si.ast)
    lin.hand_coded_optimizations()
    uops = lin.linearize().uops
    graph_uops(uops)
    #print(OpenCLRenderer("reduce", uops)[0])

  def test_conv(self):
    x = Tensor.rand(1,3,16,16)
    c = Conv2d(3, 16, (3,3))
    si = c(x).elu().lazydata.schedule()[-1]
    lin = Linearizer(si.ast)
    lin.hand_coded_optimizations()
    uops = lin.linearize().uops
    graph_uops(uops)
    print(lin.colored_shape())
    print(OpenCLRenderer("conv", uops)[0])

if __name__ == '__main__':
  unittest.main()
36
tinygrad_repo/test/external/external_test_yolo.py
vendored
Normal file
@@ -0,0 +1,36 @@
import io
import unittest
from pathlib import Path

import cv2
import requests # type: ignore
import numpy as np

from tinygrad.tensor import Tensor
from examples.yolov3 import Darknet, infer, show_labels
from extra.utils import fetch

chicken_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/Chicken.jpg'))
car_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/car.jpg'))

class TestYOLO(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.model = Darknet(fetch("https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"))
    print("Loading weights file (237MB). This might take a while…")
    cls.model.load_weights("https://pjreddie.com/media/files/yolov3.weights")

  @classmethod
  def tearDownClass(cls):
    del cls.model

  def test_chicken(self):
    labels = show_labels(infer(self.model, chicken_img), confidence=0.56)
    self.assertEqual(labels, ["bird"])

  def test_car(self):
    labels = show_labels(infer(self.model, car_img))
    self.assertEqual(labels, ["car"])

if __name__ == '__main__':
  unittest.main()
76
tinygrad_repo/test/external/external_test_yolov8.py
vendored
Normal file
@@ -0,0 +1,76 @@
import numpy as np
from extra.utils import fetch, download_file, get_child
from examples.yolov8 import YOLOv8, get_variant_multiples, preprocess, postprocess, label_predictions
from pathlib import Path
import unittest
import io, cv2, os
import onnxruntime as ort
import ultralytics
from tinygrad.nn.state import safe_load, load_state_dict

class TestYOLOv8(unittest.TestCase):
  def test_all_load_weights(self):
    for variant in ['n', 's', 'm', 'l', 'x']:
      weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
      download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{variant}.safetensors', weights_location)

      depth, width, ratio = get_variant_multiples(variant)
      TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
      state_dict = safe_load(weights_location)
      load_state_dict(TinyYolov8, state_dict)
      print(f'successfully loaded weights for yolov8{variant}')

  def test_predictions(self):
    test_image_urls = ['https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg', 'https://www.aljazeera.com/wp-content/uploads/2022/10/2022-04-28T192650Z_1186456067_UP1EI4S1I0P14_RTRMADP_3_SOCCER-ENGLAND-MUN-CHE-REPORT.jpg']
    variant = 'n'
    weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'
    depth, width, ratio = get_variant_multiples(variant)
    TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
    state_dict = safe_load(weights_location)
    load_state_dict(TinyYolov8, state_dict)

    for i in range(len(test_image_urls)):
      img_stream = io.BytesIO(fetch(test_image_urls[i]))
      img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)
      test_image = preprocess([img])
      predictions = TinyYolov8(test_image)
      post_predictions = postprocess(preds=predictions, img=test_image, orig_imgs=[img])
      labels = label_predictions(post_predictions)
      assert labels == {5: 1, 0: 4, 11: 1} if i == 0 else labels == {0: 13, 29: 1, 32: 1}

  def test_forward_pass_torch_onnx(self):
    variant = 'n'
    weights_location_onnx = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.onnx'
    weights_location_pt = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.pt'
    weights_location = Path(__file__).parents[2] / "weights" / f'yolov8{variant}.safetensors'

    download_file(f'https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8{variant}.pt', weights_location_pt)
    # the ultralytics export prints a lot of unnecessary things
    if not weights_location_onnx.is_file():
      model = ultralytics.YOLO(model=weights_location_pt, task='Detect')
      model.export(format="onnx", imgsz=[640, 480])

    depth, width, ratio = get_variant_multiples(variant)
    TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
    state_dict = safe_load(weights_location)
    load_state_dict(TinyYolov8, state_dict)

    image_location = [np.frombuffer(io.BytesIO(fetch('https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg')).read(), np.uint8)]
    orig_image = [cv2.imdecode(image_location[0], 1)]

    input_image = preprocess(orig_image)

    onnx_session = ort.InferenceSession(weights_location_onnx)
    onnx_input_name = onnx_session.get_inputs()[0].name
    onnx_output_name = onnx_session.get_outputs()[0].name
    onnx_output = onnx_session.run([onnx_output_name], {onnx_input_name: input_image.numpy()})

    tiny_output = TinyYolov8(input_image)

    # rtol is currently 0.025 because our predictions differ from torch's by 1-2%: the
    # maxpool layers in the SPPF module (line 280) use zero padding rather than torch's
    # -infinity. The difference is not visible in the rendered detections.
    np.testing.assert_allclose(onnx_output[0], tiny_output.numpy(), atol=5e-4, rtol=0.025)

if __name__ == '__main__':
  unittest.main()
61
tinygrad_repo/test/external/fuzz_shapetracker.py
vendored
Normal file
@@ -0,0 +1,61 @@
import random
from tinygrad.helpers import DEBUG
from test.unit.test_shapetracker import CheckingShapeTracker
random.seed(42)

def do_permute(st):
  perm = list(range(0, len(st.shape)))
  random.shuffle(perm)
  perm = tuple(perm)
  if DEBUG >= 1: print("st.permute(", perm, ")")
  st.permute(perm)

def do_pad(st):
  c = random.randint(0, len(st.shape)-1)
  pad = tuple((random.randint(0,2), random.randint(0,2)) if i==c else (0,0) for i in range(len(st.shape)))
  if DEBUG >= 1: print("st.pad(", pad, ")")
  st.pad(pad)

def do_reshape_split_one(st):
  c = random.randint(0, len(st.shape)-1)
  poss = [n for n in [1,2,3,4,5] if st.shape[c]%n == 0]
  spl = random.choice(poss)
  shp = st.shape[0:c] + (st.shape[c]//spl, spl) + st.shape[c+1:]
  if DEBUG >= 1: print("st.reshape(", shp, ")")
  st.reshape(shp)

def do_reshape_combine_two(st):
  if len(st.shape) < 2: return
  c = random.randint(0, len(st.shape)-2)
  shp = st.shape[:c] + (st.shape[c] * st.shape[c+1], ) + st.shape[c+2:]
  if DEBUG >= 1: print("st.reshape(", shp, ")")
  st.reshape(shp)

def do_shrink(st):
  c = random.randint(0, len(st.shape)-1)
  while 1:
    shrink = tuple((random.randint(0,s), random.randint(0,s)) if i == c else (0,s) for i,s in enumerate(st.shape))
    if all(x<y for (x,y) in shrink): break
  if DEBUG >= 1: print("st.shrink(", shrink, ")")
  st.shrink(shrink)

def do_stride(st):
  c = random.randint(0, len(st.shape)-1)
  stride = tuple(random.choice([-2,-1,2]) if i==c else 1 for i in range(len(st.shape)))
  if DEBUG >= 1: print("st.stride(", stride, ")")
  st.stride(stride)

def do_expand(st):
  c = [i for i,s in enumerate(st.shape) if s==1]
  if len(c) == 0: return
  c = random.choice(c)
  expand = tuple(random.choice([2,3,4]) if i==c else s for i,s in enumerate(st.shape))
  if DEBUG >= 1: print("st.expand(", expand, ")")
  st.expand(expand)
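# editor's note (an assumption about CheckingShapeTracker from test/unit/test_shapetracker.py):
# it mirrors every movement op on a concrete numpy array, so assert_same() can compare the
# symbolic view against ground truth after each random op sequence.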
if __name__ == "__main__":
  ops = [do_permute, do_pad, do_shrink, do_reshape_split_one, do_reshape_combine_two, do_stride, do_expand]
  for _ in range(200):
    st = CheckingShapeTracker((random.randint(2, 10), random.randint(2, 10), random.randint(2, 10)))
    for i in range(8): random.choice(ops)(st)
    st.assert_same()
69
tinygrad_repo/test/external/fuzz_symbolic.py
vendored
Normal file
@@ -0,0 +1,69 @@
import itertools
import random
from tinygrad.helpers import DEBUG
from tinygrad.shape.symbolic import Variable
random.seed(42)

def add_v(expr, rng=None):
  if rng is None: rng = random.randint(0,2)
  return expr + v[rng], rng

def div(expr, rng=None):
  if rng is None: rng = random.randint(1,9)
  return expr // rng, rng

def mul(expr, rng=None):
  if rng is None: rng = random.randint(-4,4)
  return expr * rng, rng

def mod(expr, rng=None):
  if rng is None: rng = random.randint(1,9)
  return expr % rng, rng

def add_num(expr, rng=None):
  if rng is None: rng = random.randint(-4,4)
  return expr + rng, rng

def lt(expr, rng=None):
  if rng is None: rng = random.randint(-4,4)
  return expr < rng, rng

def ge(expr, rng=None):
  if rng is None: rng = random.randint(-4,4)
  return expr >= rng, rng

def le(expr, rng=None):
  if rng is None: rng = random.randint(-4,4)
  return expr <= rng, rng

def gt(expr, rng=None):
  if rng is None: rng = random.randint(-4,4)
  return expr > rng, rng
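# editor's note -- a minimal worked example of the property this fuzzer checks
# (illustrative, not part of the commit): symbolic simplification must agree with
# plain integer math for every concrete assignment, e.g.
#   expr = (Variable("v1", 0, 8) + 3) // 2   # simplified symbolically by tinygrad
#   v1 = 5                                   # bind a concrete value for eval()
#   assert eval(expr.render()) == (v1 + 3) // 2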
if __name__ == "__main__":
  ops = [add_v, div, mul, add_num, mod]
  for _ in range(1000):
    upper_bounds = [*list(range(1, 10)), 16, 32, 64, 128, 256]
    u1 = Variable("v1", 0, random.choice(upper_bounds))
    u2 = Variable("v2", 0, random.choice(upper_bounds))
    u3 = Variable("v3", 0, random.choice(upper_bounds))
    v = [u1,u2,u3]
    tape = [random.choice(ops) for _ in range(random.randint(2, 30))]
    # 10% of the time, add one of lt, le, gt, ge
    if random.random() < 0.1: tape.append(random.choice([lt, le, gt, ge]))
    expr = Variable.num(0)
    rngs = []
    for t in tape:
      expr, rng = t(expr)
      if DEBUG >= 1: print(t.__name__, rng)
      rngs.append(rng)
    if DEBUG >= 1: print(expr)
    space = list(itertools.product(range(u1.min, u1.max+1), range(u2.min, u2.max+1), range(u3.min, u3.max+1)))
    volume = len(space)
    for (v1, v2, v3) in random.sample(space, min(100, volume)):
      v = [v1,v2,v3]
      rn = 0
      for t,r in zip(tape, rngs): rn, _ = t(rn, r)
      num = eval(expr.render())
      assert num == rn, f"mismatched {expr.render()} at {v1=} {v2=} {v3=} = {num} != {rn}"
      if DEBUG >= 1: print(f"matched {expr.render()} at {v1=} {v2=} {v3=} = {num} == {rn}")
61
tinygrad_repo/test/external/graph_batchnorm.py
vendored
Normal file
@@ -0,0 +1,61 @@
import unittest
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d, optim

def model_step(lm):
  with Tensor.train():
    x = Tensor.ones(8,12,128,256, requires_grad=False)
    optimizer = optim.SGD(get_parameters(lm), lr=0.001)
    loss = lm.forward(x).sum()
    optimizer.zero_grad()
    loss.backward()
    del x,loss
    optimizer.step()
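# editor's note: model_step drives one forward/backward/optimizer step on dummy data; the
# tests below only check that conv/batchnorm training graphs build and execute without error.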
class TestBatchnorm(unittest.TestCase):
  def test_conv(self):
    class LilModel:
      def __init__(self):
        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
      def forward(self, x):
        return self.c(x).relu()
    lm = LilModel()
    model_step(lm)

  def test_two_conv(self):
    class LilModel:
      def __init__(self):
        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
        self.c2 = Conv2d(32, 32, 3, padding=1, bias=False)
      def forward(self, x):
        return self.c2(self.c(x)).relu()
    lm = LilModel()
    model_step(lm)

  def test_two_conv_bn(self):
    class LilModel:
      def __init__(self):
        self.c = Conv2d(12, 24, 3, padding=1, bias=False)
        self.bn = BatchNorm2d(24, track_running_stats=False)
        self.c2 = Conv2d(24, 32, 3, padding=1, bias=False)
        self.bn2 = BatchNorm2d(32, track_running_stats=False)
      def forward(self, x):
        x = self.bn(self.c(x)).relu()
        return self.bn2(self.c2(x)).relu()
    lm = LilModel()
    model_step(lm)

  def test_conv_bn(self):
    class LilModel:
      def __init__(self):
        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
        self.bn = BatchNorm2d(32, track_running_stats=False)
      def forward(self, x):
        return self.bn(self.c(x)).relu()
    lm = LilModel()
    model_step(lm)

if __name__ == '__main__':
  unittest.main()
74
tinygrad_repo/test/external/test_example.py
vendored
Normal file
@@ -0,0 +1,74 @@
import unittest
import numpy as np
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv, CI

def multidevice_test(fxn):
  exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
  def ret(self):
    for device in Device._buffers:
      if device in ["DISK", "SHM", "FAKE"]: continue
      if not CI: print(device)
      if device in exclude_devices:
        if not CI: print(f"WARNING: {device} test is excluded")
        continue
      with self.subTest(device=device):
        try:
          Device[device]
        except Exception:
          if not CI: print(f"WARNING: {device} test isn't running")
          continue
        fxn(self, device)
  return ret
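# editor's note: the decorator fans one test body out across every registered device via
# subTest, skipping devices listed in EXCLUDE_DEVICES and devices that fail to initialize.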
class TestExample(unittest.TestCase):
  @multidevice_test
  def test_convert_to_cpu(self, device):
    a = Tensor([[1,2],[3,4]], device=device)
    assert a.numpy().shape == (2,2)
    b = a.cpu()
    assert b.numpy().shape == (2,2)

  @multidevice_test
  def test_2_plus_3(self, device):
    a = Tensor([2], device=device)
    b = Tensor([3], device=device)
    result = a + b
    print(f"{a.numpy()} + {b.numpy()} = {result.numpy()}")
    assert result.numpy()[0] == 5.

  @multidevice_test
  def test_example_readme(self, device):
    x = Tensor.eye(3, device=device, requires_grad=True)
    y = Tensor([[2.0,0,-2.0]], device=device, requires_grad=True)
    z = y.matmul(x).sum()
    z.backward()

    x.grad.numpy()  # dz/dx
    y.grad.numpy()  # dz/dy

    assert x.grad.device == device
    assert y.grad.device == device

  @multidevice_test
  def test_example_matmul(self, device):
    try:
      Device[device]
    except Exception:
      print(f"WARNING: {device} test isn't running")
      return

    x = Tensor.eye(64, device=device, requires_grad=True)
    y = Tensor.eye(64, device=device, requires_grad=True)
    z = y.matmul(x).sum()
    z.backward()

    x.grad.numpy()  # dz/dx
    y.grad.numpy()  # dz/dy

    assert x.grad.device == device
    assert y.grad.device == device

if __name__ == '__main__':
  unittest.main()
50
tinygrad_repo/test/extra/test_export_model.py
Normal file
@@ -0,0 +1,50 @@
import unittest
from extra.export_model import export_model, EXPORT_SUPPORTED_DEVICE
from tinygrad.tensor import Tensor, Device
import json

class MockMultiInputModel:
  def forward(self, x1, x2, x3):
    return x1 + x2 + x3

class MockMultiOutputModel:
  def __call__(self, x1):
    return x1 + 2.0, x1.pad(((0, 0), (0, 1))) + 1.0

# TODO: move compile_efficientnet tests here
@unittest.skipUnless(Device.DEFAULT in EXPORT_SUPPORTED_DEVICE, f"Model export is not supported on {Device.DEFAULT}")
class TestModelExport(unittest.TestCase):
  def test_multi_input_model_export(self):
    model = MockMultiInputModel()
    inputs = [Tensor.rand(2,2), Tensor.rand(2,2), Tensor.rand(2,2)]
    prg, inp_sizes, _, _ = export_model(model, "", *inputs)
    prg = json.loads(prg)

    assert len(inputs) == len(prg["inputs"]) == len(inp_sizes), f"Model and exported inputs don't match: mdl={len(inputs)}, prg={len(prg['inputs'])}, inp_sizes={len(inp_sizes)}"

    for i in range(len(inputs)):
      assert f"input{i}" in inp_sizes, f"input{i} not captured in inp_sizes"
      assert f"input{i}" in prg["buffers"], f"input{i} not captured in exported buffers"

    for i, exported_input in enumerate(prg["inputs"]):
      assert inputs[i].dtype.name == exported_input["dtype"], f"Model and exported input dtype don't match: mdl={inputs[i].dtype.name}, prg={exported_input['dtype']}"

  def test_multi_output_model_export(self):
    model = MockMultiOutputModel()
    input = Tensor.rand(2,2)
    outputs = model(input)
    prg, _, out_sizes, _ = export_model(model, "", input)
    prg = json.loads(prg)

    assert len(outputs) == len(prg["outputs"]) == len(out_sizes), f"Model and exported outputs don't match: mdl={len(outputs)}, prg={len(prg['outputs'])}, out_sizes={len(out_sizes)}"

    for i in range(len(outputs)):
      assert f"output{i}" in out_sizes, f"output{i} not captured in out_sizes"
      assert f"output{i}" in prg["buffers"], f"output{i} not captured in exported buffers"

    for i, exported_output in enumerate(prg["outputs"]):
      assert outputs[i].dtype.name == exported_output["dtype"], f"Model and exported output dtype don't match: mdl={outputs[i].dtype.name}, prg={exported_output['dtype']}"

if __name__ == '__main__':
  unittest.main()
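# editor's note -- a sketch of the exported program structure the asserts above rely on
# (only the fields actually asserted are grounded; anything beyond them is an assumption):
#   prg, inp_sizes, out_sizes, _ = export_model(model, "", *inputs)
#   spec = json.loads(prg)
#   spec["inputs"][i]["dtype"]    # dtype name per model input (same shape for "outputs")
#   spec["buffers"]["input0"]     # buffer entries keyed input{i} / output{i}
#   inp_sizes["input0"]           # size bookkeeping uses the same keys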
57
tinygrad_repo/test/extra/test_extra_helpers.py
Normal file
@@ -0,0 +1,57 @@
#!/usr/bin/env python
import os, cloudpickle, tempfile, unittest, subprocess
from extra.helpers import enable_early_exec, cross_process, _CloudpickleFunctionWrapper

def normalize_line_endings(s): return s.replace(b'\r\n', b'\n')

class TestEarlyExec(unittest.TestCase):
  def setUp(self) -> None:
    self.early_exec = enable_early_exec()

  def early_exec_py_file(self, file_content, exec_args):
    with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp:
      temp.write(file_content)
      temp_path = temp.name
    try:
      output = self.early_exec((["python3", temp_path] + exec_args, None))
      return output
    finally:
      os.remove(temp_path)

  def test_enable_early_exec(self):
    output = self.early_exec_py_file(b'print("Hello, world!")', [])
    self.assertEqual(b"Hello, world!\n", normalize_line_endings(output))

  def test_enable_early_exec_with_arg(self):
    output = self.early_exec_py_file(b'import sys\nprint("Hello, " + sys.argv[1] + "!")', ["world"])
    self.assertEqual(b"Hello, world!\n", normalize_line_endings(output))

  def test_enable_early_exec_process_exception(self):
    with self.assertRaises(subprocess.CalledProcessError):
      self.early_exec_py_file(b'raise Exception("Test exception")', [])

  def test_enable_early_exec_type_exception(self):
    with self.assertRaises(TypeError):
      self.early_exec((["python3"], "print('Hello, world!')"))

class TestCrossProcess(unittest.TestCase):

  def test_cross_process(self):
    def _iterate():
      for i in range(10): yield i
    results = list(cross_process(_iterate))
    self.assertEqual(list(range(10)), results)

  def test_cross_process_exception(self):
    def _iterate():
      for i in range(10):
        if i == 5: raise ValueError("Test exception")
        yield i
    with self.assertRaises(ValueError): list(cross_process(_iterate))

  def test_CloudpickleFunctionWrapper(self):
    def add(x, y): return x + y
    self.assertEqual(7, cloudpickle.loads(cloudpickle.dumps(_CloudpickleFunctionWrapper(add)))(3, 4))

if __name__ == '__main__':
  unittest.main()
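# editor's note, summarizing what the TestCrossProcess cases above exercise: cross_process
# pickles the given generator with cloudpickle, runs it in a child process, streams the
# yielded values back in order, and surfaces exceptions to the parent process.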
107
tinygrad_repo/test/extra/test_lr_scheduler.py
Normal file
@@ -0,0 +1,107 @@
import numpy as np
import torch
import unittest
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_parameters
from tinygrad.nn.optim import Adam
from extra.lr_scheduler import MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR, OneCycleLR
from extra.training import train, evaluate
from extra.datasets import fetch_mnist
import pytest

pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu]

np.random.seed(1337)
Tensor.manual_seed(1337)

X_train, Y_train, X_test, Y_test = fetch_mnist()

class TinyBobNet:
  def __init__(self):
    self.l1 = Tensor.scaled_uniform(784, 128)
    self.l2 = Tensor.scaled_uniform(128, 10)

  def parameters(self):
    return get_parameters(self)

  def forward(self, x):
    return x.dot(self.l1).relu().dot(self.l2).log_softmax()

def lr_scheduler_training(sched_fn=None, args=None):
  model = TinyBobNet()
  optim = Adam(model.parameters(), lr=0.01)
  if sched_fn is not None: sched = sched_fn(optim, **args)
  for _ in range(25):
    train(model, X_train, Y_train, optim, 100)
    if sched_fn is not None:
      if isinstance(sched, ReduceLROnPlateau):
        sched.step(evaluate(model, X_test, Y_test))
      else:
        sched.step()
  return evaluate(model, X_test, Y_test)

def current_lr(optim): return optim.param_groups[0]['lr'] if hasattr(optim, 'param_groups') else optim.lr
def get_lrs(optim, sched, epochs, steps=1, accs=None):
  lr = current_lr(optim)
  if not isinstance(lr, float): lr = lr.numpy()[0]
  lrs = [lr]
  for e in range(epochs):
    for _ in range(steps):
      optim.step()
      sched.step() if accs is None else sched.step(accs[e])
    lr = current_lr(optim)
    if not isinstance(lr, float): lr = lr.numpy()[0]
    lrs.append(lr)
  return lrs
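# editor's note -- the comparison pattern in brief (illustrative, not part of the commit):
# both optimizers start at lr=0.01, get_lrs records the lr after every scheduler step, and
# the tinygrad trajectory must track torch's within tolerance, e.g. for MultiStepLR:
#   optim = Adam([], lr=0.01)
#   sched = MultiStepLR(optim, milestones=[1, 2, 7])
#   np.testing.assert_allclose(get_lrs(optim, sched, 10), torch_lrs, atol=1e-6, rtol=1e-6)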
class TestLrScheduler(unittest.TestCase):
  def _test_lr_scheduler(self, tinygrad_sched, torch_sched, epochs, opts, atol, rtol):
    accs = opts.pop('accs', None)
    tinygrad_optim, torch_optim = Adam([], lr=0.01), torch.optim.Adam([torch.tensor([0.], requires_grad=True)], lr=0.01)
    tinygrad_sched, torch_sched = tinygrad_sched(tinygrad_optim, **opts), torch_sched(torch_optim, **opts)

    tinygrad_lrs = get_lrs(tinygrad_optim, tinygrad_sched, epochs, accs=accs)
    torch_lrs = get_lrs(torch_optim, torch_sched, epochs, accs=accs)

    np.testing.assert_allclose(tinygrad_lrs, torch_lrs, atol=atol, rtol=rtol)

  def _test_multisteplr(self, epochs, opts, atol, rtol):
    self._test_lr_scheduler(MultiStepLR, torch.optim.lr_scheduler.MultiStepLR, epochs, opts, atol, rtol)
  def _test_reducelronplateau(self, epochs, opts, atol, rtol):
    opts['accs'] = np.random.randn(epochs)
    self._test_lr_scheduler(ReduceLROnPlateau, torch.optim.lr_scheduler.ReduceLROnPlateau, epochs, opts, atol, rtol)
  def _test_cosineannealinglr(self, epochs, opts, atol, rtol):
    opts['T_max'] = epochs
    self._test_lr_scheduler(CosineAnnealingLR, torch.optim.lr_scheduler.CosineAnnealingLR, epochs, opts, atol, rtol)
  def _test_onecyclelr(self, epochs, opts, atol, rtol):
    opts['total_steps'] = epochs
    self._test_lr_scheduler(OneCycleLR, torch.optim.lr_scheduler.OneCycleLR, epochs, opts, atol, rtol)

  def test_multisteplr(self): self._test_multisteplr(10, {'milestones': [1, 2, 7]}, 1e-6, 1e-6)
  def test_multisteplr_gamma(self): self._test_multisteplr(10, {'milestones': [1, 2, 7], 'gamma': 0.1337}, 1e-6, 1e-6)

  def test_reducelronplateau(self): self._test_reducelronplateau(100, {}, 1e-6, 1e-6)
  def test_reducelronplateau_max(self): self._test_reducelronplateau(100, {'mode': 'max'}, 1e-6, 1e-6)
  def test_reducelronplateau_factor(self): self._test_reducelronplateau(100, {'factor': 0.1337}, 1e-6, 1e-6)
  def test_reducelronplateau_patience(self): self._test_reducelronplateau(100, {'patience': 3}, 1e-6, 1e-6)
  def test_reducelronplateau_threshold(self): self._test_reducelronplateau(100, {'threshold': 1e-6}, 1e-6, 1e-6)
  def test_reducelronplateau_threshold_mode(self): self._test_reducelronplateau(100, {'threshold_mode': 'abs'}, 1e-6, 1e-6)

  def test_cosineannealinglr(self): self._test_cosineannealinglr(100, {}, 1e-6, 1e-6)
  def test_cosineannealinglr_eta_min(self): self._test_cosineannealinglr(100, {'eta_min': 0.001}, 1e-6, 1e-6)

  def test_onecyclelr(self): self._test_onecyclelr(1000, {'pct_start': 0.3, 'anneal_strategy': 'linear',
                                                          'cycle_momentum': False, 'div_factor': 25.0,
                                                          'final_div_factor': 10000.0, 'max_lr': 1e-5}, 1e-6, 1e-6)

  @unittest.skip("slow")
  def test_training(self):
    without = lr_scheduler_training()
    sched_fns = [MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR, OneCycleLR]
    argss = [{'milestones': [5, 7, 10, 15], 'gamma': 0.5}, {'factor': 0.5, 'patience': 2}, {'T_max': 25, 'eta_min': 0.001},
             {'pct_start': 0.3, 'anneal_strategy': 'linear', 'cycle_momentum': False, 'div_factor': 25.0, 'final_div_factor': 10000.0, 'max_lr': 1e-5, 'total_steps': 25}]
    for sched_fn, args in zip(sched_fns, argss):
      with_sched = lr_scheduler_training(sched_fn, args)
      assert with_sched > without

if __name__ == '__main__':
  unittest.main()
106
tinygrad_repo/test/extra/test_utils.py
Normal file
@@ -0,0 +1,106 @@
#!/usr/bin/env python
import io, unittest
import os
import tempfile
from unittest.mock import patch, MagicMock

import torch
import numpy as np
from tinygrad.helpers import CI
from extra.utils import fetch, temp, download_file
from tinygrad.nn.state import torch_load
from PIL import Image

@unittest.skipIf(CI, "no internet tests in CI")
class TestFetch(unittest.TestCase):
  def test_fetch_bad_http(self):
    self.assertRaises(AssertionError, fetch, 'http://httpstat.us/500')
    self.assertRaises(AssertionError, fetch, 'http://httpstat.us/404')
    self.assertRaises(AssertionError, fetch, 'http://httpstat.us/400')

  def test_fetch_small(self):
    assert len(fetch('https://google.com')) > 0

  def test_fetch_img(self):
    img = fetch("https://media.istockphoto.com/photos/hen-picture-id831791190")
    pimg = Image.open(io.BytesIO(img))
    assert pimg.size == (705, 1024)

class TestFetchRelative(unittest.TestCase):
  def setUp(self):
    self.working_dir = os.getcwd()
    self.tempdir = tempfile.TemporaryDirectory()
    os.chdir(self.tempdir.name)
    with open('test_file.txt', 'x') as f:
      f.write("12345")

  def tearDown(self):
    os.chdir(self.working_dir)
    self.tempdir.cleanup()

  # test ./
  def test_fetch_relative_dotslash(self):
    self.assertEqual(b'12345', fetch("./test_file.txt"))

  # test ../
  def test_fetch_relative_dotdotslash(self):
    os.mkdir('test_file_path')
    os.chdir('test_file_path')
    self.assertEqual(b'12345', fetch("../test_file.txt"))

class TestDownloadFile(unittest.TestCase):
  def setUp(self):
    from pathlib import Path
    self.test_file = Path(temp("test_download_file/test_file.txt"))

  def tearDown(self):
    os.remove(self.test_file)
    os.removedirs(self.test_file.parent)

  @patch('requests.get')
  def test_download_file_with_mkdir(self, mock_requests):
    mock_response = MagicMock()
    mock_response.iter_content.return_value = [b'1234', b'5678']
    mock_response.status_code = 200
    mock_response.headers = {'content-length': '8'}
    mock_requests.return_value = mock_response
    self.assertFalse(self.test_file.parent.exists())
    download_file("https://www.mock.com/fake.txt", self.test_file, skip_if_exists=False)
    self.assertTrue(self.test_file.parent.exists())
    self.assertTrue(self.test_file.is_file())
    self.assertEqual('12345678', self.test_file.read_text())

class TestUtils(unittest.TestCase):
  def test_fake_torch_load_zipped(self): self._test_fake_torch_load_zipped()
  def test_fake_torch_load_zipped_float16(self): self._test_fake_torch_load_zipped(isfloat16=True)
  def _test_fake_torch_load_zipped(self, isfloat16=False):
    class LayerWithOffset(torch.nn.Module):
      def __init__(self):
        super(LayerWithOffset, self).__init__()
        d = torch.randn(16)
        self.param1 = torch.nn.Parameter(
          d.as_strided([2, 2], [1, 2], storage_offset=5)
        )
        self.param2 = torch.nn.Parameter(
          d.as_strided([2, 2], [1, 2], storage_offset=4)
        )

    model = torch.nn.Sequential(
      torch.nn.Linear(4, 8),
      torch.nn.Linear(8, 3),
      LayerWithOffset()
    )
    if isfloat16: model = model.half()

    path = temp(f"test_load_{isfloat16}.pt")
    torch.save(model.state_dict(), path)
    model2 = torch_load(path)

    for name, a in model.state_dict().items():
      b = model2[name]
      a, b = a.numpy(), b.numpy()
      assert a.shape == b.shape
      assert a.dtype == b.dtype
      assert np.array_equal(a, b)

if __name__ == '__main__':
  unittest.main()
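The mocked test above assumes download_file streams the response in chunks and creates parent directories; a minimal sketch of a helper with that behavior (hypothetical, for illustration only — not the actual extra.utils implementation):

import requests
from pathlib import Path

def download_file_sketch(url: str, fp: Path, skip_if_exists: bool = True, chunk_size: int = 16384):
  # hypothetical sketch matching the behavior mocked in TestDownloadFile
  if skip_if_exists and fp.is_file(): return
  fp.parent.mkdir(parents=True, exist_ok=True)  # the "with mkdir" part of the test
  r = requests.get(url, stream=True)
  assert r.status_code == 200
  with open(fp, "wb") as f:
    for chunk in r.iter_content(chunk_size=chunk_size):
      f.write(chunk)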
15
tinygrad_repo/test/helpers.py
Normal file
@@ -0,0 +1,15 @@
from tinygrad.ops import LazyOp, LoadOps
from tinygrad.nn.state import get_parameters

# for speed
def derandomize(x):
  if isinstance(x, LazyOp):
    new_op = LoadOps.EMPTY if x.op == LoadOps.RAND else x.op
    return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), x.arg)
  x.op = derandomize(x.op)
  return x

def derandomize_model(model):
  for p in get_parameters(model):
    p.lazydata = derandomize(p.lazydata)
    p.realize()
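A hypothetical usage sketch (the snippet itself is illustrative, not part of the commit): derandomize_model swaps RAND loads for EMPTY in a model's lazy graphs so realizing the parameters is cheap, e.g. before benchmarking, which is how test_real_world.py below uses it:

from models.resnet import ResNet18
from test.helpers import derandomize_model

model = ResNet18()
derandomize_model(model)  # RAND loads become EMPTY, so realize() skips random init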
BIN
tinygrad_repo/test/models/efficientnet/Chicken.jpg
Normal file
Binary file not shown. Size: 108 KiB
BIN
tinygrad_repo/test/models/efficientnet/car.jpg
Normal file
Binary file not shown. Size: 7.9 KiB
File diff suppressed because it is too large
57
tinygrad_repo/test/models/test_bert.py
Normal file
@@ -0,0 +1,57 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
import torch

def get_question_samp(bsz, seq_len, vocab_size, seed):
  np.random.seed(seed)
  in_ids = np.random.randint(vocab_size, size=(bsz, seq_len))
  mask = np.random.choice([True, False], size=(bsz, seq_len))
  seg_ids = np.random.randint(1, size=(bsz, seq_len))
  return in_ids, mask, seg_ids

def set_equal_weights(mdl, torch_mdl):
  from tinygrad.nn.state import get_state_dict
  state, torch_state = get_state_dict(mdl), torch_mdl.state_dict()
  assert len(state) == len(torch_state)
  for k, v in state.items():
    assert k in torch_state
    torch_state[k].copy_(torch.from_numpy(v.numpy()))
  torch_mdl.eval()

class TestBert(unittest.TestCase):
  def test_questions(self):
    from models.bert import BertForQuestionAnswering
    from transformers import BertForQuestionAnswering as TorchBertForQuestionAnswering
    from transformers import BertConfig

    # small
    config = {
      'vocab_size':24, 'hidden_size':2, 'num_hidden_layers':2, 'num_attention_heads':2,
      'intermediate_size':32, 'hidden_dropout_prob':0.1, 'attention_probs_dropout_prob':0.1,
      'max_position_embeddings':512, 'type_vocab_size':2
    }

    # Create in tinygrad
    Tensor.manual_seed(1337)
    mdl = BertForQuestionAnswering(**config)

    # Create in torch
    with torch.no_grad():
      torch_mdl = TorchBertForQuestionAnswering(BertConfig(**config))

    set_equal_weights(mdl, torch_mdl)

    seeds = (1337, 3141)
    bsz, seq_len = 1, 16
    for _, seed in enumerate(seeds):
      in_ids, mask, seg_ids = get_question_samp(bsz, seq_len, config['vocab_size'], seed)
      out = mdl(Tensor(in_ids), Tensor(mask), Tensor(seg_ids))
      torch_out = torch_mdl.forward(torch.from_numpy(in_ids).long(), torch.from_numpy(mask), torch.from_numpy(seg_ids).long())[:2]
      torch_out = torch.cat(torch_out).unsqueeze(2)
      np.testing.assert_allclose(out.numpy(), torch_out.detach().numpy(), atol=5e-4, rtol=5e-4)

if __name__ == '__main__':
  unittest.main()
115
tinygrad_repo/test/models/test_efficientnet.py
Normal file
@@ -0,0 +1,115 @@
import ast
import pathlib
import sys
import unittest

import numpy as np
from PIL import Image

from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor
from models.efficientnet import EfficientNet
from models.vit import ViT
from models.resnet import ResNet50

def _load_labels():
  labels_filename = pathlib.Path(__file__).parent / 'efficientnet/imagenet1000_clsidx_to_labels.txt'
  return ast.literal_eval(labels_filename.read_text())

_LABELS = _load_labels()

def preprocess(img, new=False):
  # preprocess image
  aspect_ratio = img.size[0] / img.size[1]
  img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))

  img = np.array(img)
  y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
  img = img[y0:y0+224, x0:x0+224]

  # low level preprocess
  if new:
    img = img.astype(np.float32)
    img -= [127.0, 127.0, 127.0]
    img /= [128.0, 128.0, 128.0]
    img = img[None]
  else:
    img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
    img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
    img /= 255.0
    img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
    img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
  return img
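
# quick sanity check of preprocess (added note, not in the original file):
#   x = preprocess(chicken_img)   # chicken_img is defined below
#   assert x.shape == (1, 3, 224, 224) and x.dtype == np.float32
# i.e. resize the short side to 224, center-crop, then normalize per branch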

def _infer(model: EfficientNet, img, bs=1):
  Tensor.training = False
  img = preprocess(img)
  # run the net
  if bs > 1: img = img.repeat(bs, axis=0)
  out = model.forward(Tensor(img)).cpu()
  return _LABELS[np.argmax(out.numpy()[0])]

chicken_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/Chicken.jpg')
car_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/car.jpg')

class TestEfficientNet(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.model = EfficientNet(number=getenv("NUM"))
    cls.model.load_from_pretrained()

  @classmethod
  def tearDownClass(cls):
    del cls.model

  def test_chicken(self):
    label = _infer(self.model, chicken_img)
    self.assertEqual(label, "hen")

  def test_chicken_bigbatch(self):
    label = _infer(self.model, chicken_img, 2)
    self.assertEqual(label, "hen")

  def test_car(self):
    label = _infer(self.model, car_img)
    self.assertEqual(label, "sports car, sport car")

class TestViT(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.model = ViT()
    cls.model.load_from_pretrained()

  @classmethod
  def tearDownClass(cls):
    del cls.model

  def test_chicken(self):
    label = _infer(self.model, chicken_img)
    self.assertEqual(label, "cock")

  def test_car(self):
    label = _infer(self.model, car_img)
    self.assertEqual(label, "racer, race car, racing car")

class TestResNet(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.model = ResNet50()
    cls.model.load_from_pretrained()

  @classmethod
  def tearDownClass(cls):
    del cls.model

  def test_chicken(self):
    label = _infer(self.model, chicken_img)
    self.assertEqual(label, "hen")

  def test_car(self):
    label = _infer(self.model, car_img)
    self.assertEqual(label, "sports car, sport car")

if __name__ == '__main__':
  unittest.main()
165
tinygrad_repo/test/models/test_end2end.py
Normal file
@@ -0,0 +1,165 @@
import torch
from torch import nn
import unittest
import numpy as np
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
from tinygrad.tensor import Tensor
from extra.datasets import fetch_mnist
from tinygrad.helpers import CI

def compare_tiny_torch(model, model_torch, X, Y):
  with Tensor.train():
    model_torch.train()
    model_state_dict = get_state_dict(model)
    for k,v in model_torch.named_parameters():
      if not CI: print(f"initting {k} from torch")
      model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()

    optimizer = optim.SGD(get_parameters(model), lr=0.001)
    optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.001)

    Xt = torch.Tensor(X.numpy())
    np.testing.assert_allclose(X.numpy(), Xt.detach().numpy())

    out = model(X)
    loss = (out * Y).mean()
    if not CI: print(loss.realize().numpy())

    out_torch = model_torch(torch.Tensor(X.numpy()))
    loss_torch = (out_torch * torch.Tensor(Y.numpy())).mean()
    if not CI: print(loss_torch.detach().numpy())

    # assert losses match
    np.testing.assert_allclose(loss.realize().numpy(), loss_torch.detach().numpy(), atol=1e-4)

    # zero and backward
    optimizer.zero_grad()
    loss.backward()
    optimizer_torch.zero_grad()
    loss_torch.backward()

    for k,v in list(model_torch.named_parameters())[::-1]:
      g = model_state_dict[k].grad.numpy()
      gt = v.grad.detach().numpy()
      if not CI: print("testing grads", k)
      np.testing.assert_allclose(g, gt, atol=1e-3, err_msg=f'grad mismatch {k}')

    # take the steps
    optimizer.step()
    optimizer_torch.step()

    # assert weights match (approximately, hence the atol)
    for k,v in model_torch.named_parameters():
      if not CI: print("testing weight", k)
      np.testing.assert_allclose(model_state_dict[k].numpy(), v.detach().numpy(), atol=1e-3, err_msg=f'weight mismatch {k}')

def get_mnist_data():
  X_train, Y_train, X_test, Y_test = fetch_mnist()
  BS = 32
  num_classes = 10
  X = Tensor(X_test[0:BS].astype(np.float32))
  Y = np.zeros((BS, num_classes), np.float32)
  Y[range(BS),Y_test[0:BS]] = -1.0*num_classes
  return X, Tensor(Y)
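
# note (added): with log_softmax outputs and the (out * Y).mean() loss in
# compare_tiny_torch, the -num_classes labels make the loss exactly NLL on the
# true class: averaging over BS*num_classes entries cancels the num_classes
# scaling, so (logp * Y).mean() == -(1/BS) * sum_b logp[b, y_b]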

class TestEnd2End(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    cls.X, cls.Y = get_mnist_data()

  def setUp(self):
    torch.manual_seed(123)

  def test_linear_mnist(self):
    class LinTiny:
      def __init__(self, has_batchnorm=False):
        self.l1 = Linear(784, 128)
        self.l2 = Linear(128, 10)
        self.bn1 = BatchNorm2d(128) if has_batchnorm else lambda x: x
      def __call__(self, x):
        return self.l2(self.l1(x)).relu().log_softmax(-1)
    class LinTorch(nn.Module):
      def __init__(self, has_batchnorm=False):
        super().__init__()
        self.l1 = nn.Linear(784, 128)
        self.l2 = nn.Linear(128, 10)
      def forward(self, x):
        return self.l2(self.l1(x)).relu().log_softmax(-1)
    compare_tiny_torch(LinTiny(), LinTorch(), self.X, self.Y)

  def test_bn_mnist(self):
    class LinTiny:
      def __init__(self):
        self.l1 = Linear(784, 128)
        self.l2 = Linear(128, 10)
        self.bn1 = BatchNorm2d(128)
      def __call__(self, x):
        return self.l2(self.bn1(self.l1(x).reshape(x.shape[0], -1, 1, 1)).reshape(x.shape[0], -1).relu()).log_softmax(-1)
    class LinTorch(nn.Module):
      def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(784, 128)
        self.l2 = nn.Linear(128, 10)
        self.bn1 = nn.BatchNorm2d(128)
      def forward(self, x):
        return self.l2(self.bn1(self.l1(x).reshape(x.shape[0], -1, 1, 1)).reshape(x.shape[0], -1).relu()).log_softmax(-1)
    compare_tiny_torch(LinTiny(), LinTorch(), self.X, self.Y)

  def test_bn_alone(self):
    np.random.seed(1337)
    X = Tensor(np.random.randn(32, 10, 1, 1).astype(np.float32))
    Y = Tensor(np.random.randn(32, 10, 1, 1).astype(np.float32))
    compare_tiny_torch(BatchNorm2d(10), nn.BatchNorm2d(10), X, Y)

  def test_bn_linear(self):
    BS, K = 2, 1
    eps = 0
    X = Tensor([1,0]).reshape(BS, K, 1, 1)
    Y = Tensor([-1,0]).reshape(BS, K, 1, 1)
    class LinTiny:
      def __init__(self):
        self.l1 = Conv2d(K, K, 1, bias=False)
        self.bn1 = BatchNorm2d(K, affine=False, track_running_stats=False, eps=eps)
      def __call__(self, x): return self.bn1(self.l1(x))
    class LinTorch(nn.Module):
      def __init__(self):
        super().__init__()
        self.l1 = nn.Conv2d(K, K, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(K, affine=False, track_running_stats=False, eps=eps)
      def forward(self, x): return self.bn1(self.l1(x))
    model_torch = LinTorch()
    with torch.no_grad():
      model_torch.l1.weight[:] = 1.
    compare_tiny_torch(LinTiny(), model_torch, X, Y)

  def test_conv_mnist(self):
    class LinTiny:
      def __init__(self, has_batchnorm=False):
        self.c1 = Conv2d(1, 8, 3, stride=2)
        self.c2 = Conv2d(8, 16, 3, stride=2)
        self.l1 = Linear(16*6*6, 10)
        if has_batchnorm:
          self.bn1, self.bn2 = BatchNorm2d(8), BatchNorm2d(16)
        else:
          self.bn1, self.bn2 = lambda x: x, lambda x: x
      def __call__(self, x):
        return self.l1(self.bn2(self.c2(self.bn1(self.c1(x)).relu())).relu().reshape(x.shape[0], -1)).log_softmax(-1)
    class LinTorch(nn.Module):
      def __init__(self, has_batchnorm=False):
        super().__init__()
        self.c1 = nn.Conv2d(1, 8, 3, stride=2)
        self.c2 = nn.Conv2d(8, 16, 3, stride=2)
        self.l1 = nn.Linear(16*6*6, 10)
        if has_batchnorm:
          self.bn1, self.bn2 = nn.BatchNorm2d(8), nn.BatchNorm2d(16)
        else:
          self.bn1, self.bn2 = lambda x: x, lambda x: x
      def forward(self, x):
        return self.l1(self.bn2(self.c2(self.bn1(self.c1(x)).relu())).relu().reshape(x.shape[0], -1)).log_softmax(-1)
    for has_batchnorm in [False, True]:
      with self.subTest(has_batchnorm=has_batchnorm):
        compare_tiny_torch(LinTiny(has_batchnorm), LinTorch(has_batchnorm), self.X.reshape((-1, 1, 28, 28)), self.Y)

if __name__ == "__main__":
  unittest.main()
116
tinygrad_repo/test/models/test_mnist.py
Normal file
@@ -0,0 +1,116 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import optim, BatchNorm2d
from extra.training import train, evaluate
from extra.datasets import fetch_mnist
import pytest

pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

# load the mnist dataset
X_train, Y_train, X_test, Y_test = fetch_mnist()

# create a model
class TinyBobNet:
  def __init__(self):
    self.l1 = Tensor.scaled_uniform(784, 128)
    self.l2 = Tensor.scaled_uniform(128, 10)

  def parameters(self):
    return get_parameters(self)

  def forward(self, x):
    return x.dot(self.l1).relu().dot(self.l2).log_softmax()

# create a model with a conv layer
class TinyConvNet:
  def __init__(self, has_batchnorm=False):
    # https://keras.io/examples/vision/mnist_convnet/
    conv = 3
    #inter_chan, out_chan = 32, 64
    inter_chan, out_chan = 8, 16  # for speed
    self.c1 = Tensor.scaled_uniform(inter_chan,1,conv,conv)
    self.c2 = Tensor.scaled_uniform(out_chan,inter_chan,conv,conv)
    self.l1 = Tensor.scaled_uniform(out_chan*5*5, 10)
    if has_batchnorm:
      self.bn1 = BatchNorm2d(inter_chan)
      self.bn2 = BatchNorm2d(out_chan)
    else:
      self.bn1, self.bn2 = lambda x: x, lambda x: x

  def parameters(self):
    return get_parameters(self)

  def forward(self, x:Tensor):
    x = x.reshape(shape=(-1, 1, 28, 28))  # hacks
    x = self.bn1(x.conv2d(self.c1)).relu().max_pool2d()
    x = self.bn2(x.conv2d(self.c2)).relu().max_pool2d()
    x = x.reshape(shape=[x.shape[0], -1])
    return x.dot(self.l1).log_softmax()
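    # shape check (added note): 28 -> conv 3x3 -> 26 -> 2x2 pool -> 13
    #   -> conv 3x3 -> 11 -> 2x2 pool -> 5, hence out_chan*5*5 inputs to l1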

class TestMNIST(unittest.TestCase):
  def test_sgd_onestep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=1)
    for p in model.parameters(): p.realize()

  def test_sgd_threestep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=3)

  def test_sgd_sixstep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=6, noloss=True)

  def test_adam_onestep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=1)
    for p in model.parameters(): p.realize()

  def test_adam_threestep(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=3)

  def test_conv_onestep(self):
    np.random.seed(1337)
    model = TinyConvNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, BS=69, steps=1, noloss=True)
    for p in model.parameters(): p.realize()

  def test_conv(self):
    np.random.seed(1337)
    model = TinyConvNet()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, steps=100)
    assert evaluate(model, X_test, Y_test) > 0.93  # torch gets 0.9415 sometimes

  def test_conv_with_bn(self):
    np.random.seed(1337)
    model = TinyConvNet(has_batchnorm=True)
    optimizer = optim.AdamW(model.parameters(), lr=0.003)
    train(model, X_train, Y_train, optimizer, steps=200)
    assert evaluate(model, X_test, Y_test) > 0.94

  def test_sgd(self):
    np.random.seed(1337)
    model = TinyBobNet()
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    train(model, X_train, Y_train, optimizer, steps=600)
    assert evaluate(model, X_test, Y_test) > 0.94  # CPU gets 0.9494 sometimes

if __name__ == '__main__':
  unittest.main()
143
tinygrad_repo/test/models/test_onnx.py
Normal file
@@ -0,0 +1,143 @@
#!/usr/bin/env python
import os
import time
import io
import unittest
import numpy as np
import onnx
from extra.utils import fetch, temp
from extra.onnx import get_run_onnx
from tinygrad.tensor import Tensor
from tinygrad.helpers import CI
import pytest

pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

def run_onnx_torch(onnx_model, inputs):
  import torch
  from onnx2torch import convert
  torch_model = convert(onnx_model).float()
  with torch.no_grad():
    torch_out = torch_model(*[torch.tensor(x) for x in inputs.values()])
  return torch_out

OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx"

np.random.seed(1337)

class TestOnnxModel(unittest.TestCase):
  def test_benchmark_openpilot_model(self):
    dat = fetch(OPENPILOT_MODEL)
    onnx_model = onnx.load(io.BytesIO(dat))
    run_onnx = get_run_onnx(onnx_model)
    def get_inputs():
      np_inputs = {
        "input_imgs": np.random.randn(*(1, 12, 128, 256)),
        "big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
        "desire": np.zeros((1, 100, 8)),
        "traffic_convention": np.array([[1., 0.]]),
        "nav_features": np.zeros((1, 256)),
        "features_buffer": np.zeros((1, 99, 128)),
      }
      inputs = {k:Tensor(v.astype(np.float32), requires_grad=False) for k,v in np_inputs.items()}
      return inputs

    for _ in range(7):
      inputs = get_inputs()
      st = time.monotonic()
      tinygrad_out = run_onnx(inputs)['outputs']
      mt = time.monotonic()
      tinygrad_out.realize()
      mt2 = time.monotonic()
      tinygrad_out = tinygrad_out.numpy()
      et = time.monotonic()
      if not CI: print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")

    if not CI:
      import cProfile
      import pstats
      inputs = get_inputs()
      pr = cProfile.Profile(timer=time.perf_counter_ns, timeunit=1e-6)
      pr.enable()
    tinygrad_out = run_onnx(inputs)['outputs']
    tinygrad_out.realize()
    tinygrad_out = tinygrad_out.numpy()
    if not CI:
      pr.disable()
      stats = pstats.Stats(pr)
      stats.dump_stats(temp("net.prof"))
      os.system(f"flameprof {temp('net.prof')} > {temp('prof.svg')}")
      ps = stats.sort_stats(pstats.SortKey.TIME)
      ps.print_stats(30)

  def test_openpilot_model(self):
    dat = fetch(OPENPILOT_MODEL)
    onnx_model = onnx.load(io.BytesIO(dat))
    run_onnx = get_run_onnx(onnx_model)
    print("got run_onnx")
    inputs = {
      "input_imgs": np.random.randn(*(1, 12, 128, 256)),
      "big_input_imgs": np.random.randn(*(1, 12, 128, 256)),
      "desire": np.zeros((1, 100, 8)),
      "traffic_convention": np.array([[1., 0.]]),
      "nav_features": np.zeros((1, 256)),
      "features_buffer": np.zeros((1, 99, 128)),
    }
    inputs = {k:v.astype(np.float32) for k,v in inputs.items()}

    st = time.monotonic()
    print("****** run onnx ******")
    tinygrad_out = run_onnx(inputs)['outputs']
    mt = time.monotonic()
    print("****** realize ******")
    tinygrad_out.realize()
    mt2 = time.monotonic()
    tinygrad_out = tinygrad_out.numpy()
    et = time.monotonic()
    print(f"ran openpilot model in {(et-st)*1000.0:.2f} ms, waited {(mt2-mt)*1000.0:.2f} ms for realize, {(et-mt2)*1000.0:.2f} ms for GPU queue")

    Tensor.no_grad = True
    torch_out = run_onnx_torch(onnx_model, inputs).numpy()
    Tensor.no_grad = False
    print(tinygrad_out, torch_out)
    np.testing.assert_allclose(torch_out, tinygrad_out, atol=1e-4, rtol=1e-2)

  def test_efficientnet(self):
    dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx")
    input_name, input_new = "images:0", True
    self._test_model(dat, input_name, input_new)

  def test_shufflenet(self):
    dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx")
    print(f"shufflenet downloaded : {len(dat)/1e6:.2f} MB")
    input_name, input_new = "gpu_0/data_0", False
    self._test_model(dat, input_name, input_new)

  @unittest.skip("test is very slow")
  def test_resnet(self):
    # NOTE: many onnx models can't be run right now due to max pool with strides != kernel_size
    dat = fetch("https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx")
    print(f"resnet downloaded : {len(dat)/1e6:.2f} MB")
    input_name, input_new = "data", False
    self._test_model(dat, input_name, input_new)

  def _test_model(self, dat, input_name, input_new, debug=False):
    onnx_model = onnx.load(io.BytesIO(dat))
    print("onnx loaded")
    from test.models.test_efficientnet import chicken_img, car_img, preprocess, _LABELS
    run_onnx = get_run_onnx(onnx_model)

    def run(img):
      inputs = {input_name: preprocess(img, new=input_new)}
      tinygrad_out = list(run_onnx(inputs, debug=debug).values())[0].numpy()
      return tinygrad_out.argmax()

    cls = run(chicken_img)
    print(cls, _LABELS[cls])
    assert _LABELS[cls] == "hen" or _LABELS[cls] == "cock"
    cls = run(car_img)
    print(cls, _LABELS[cls])
    assert "car" in _LABELS[cls] or _LABELS[cls] == "convertible"

if __name__ == "__main__":
  unittest.main()
100
tinygrad_repo/test/models/test_real_world.py
Normal file
@@ -0,0 +1,100 @@
import unittest, time
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn import optim
from tinygrad.nn.state import get_parameters
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
from tinygrad.ops import Device, GlobalCounters
from tinygrad.helpers import CI, dtypes, getenv, prod
from test.helpers import derandomize_model

from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
from examples.hlb_cifar10 import SpeedyResNet
from examples.llama import Transformer as LLaMaTransformer, MODEL_PARAMS as LLAMA_MODEL_PARAMS
from examples.stable_diffusion import UNetModel

def helper_test(nm, gen, train, max_memory_allowed, max_kernels_allowed, all_jitted=False):
  tms = []
  for _ in range(4):
    GlobalCounters.reset()
    GlobalCounters.mem_used = 0
    Device[Device.DEFAULT].synchronize()
    st = time.perf_counter_ns()
    train(*gen())
    Device[Device.DEFAULT].synchronize()
    tms.append(time.perf_counter_ns() - st)

  kernels_used = len(train.jit_cache) if hasattr(train, "jit_cache") else None
  print(f"{nm}: used {GlobalCounters.mem_used/1e9:.2f} GB and {kernels_used} kernels in {min(tms)/1e6:.2f} ms")
  assert GlobalCounters.mem_used/1e9 < max_memory_allowed, f"{nm} used more than {max_memory_allowed:.2f} GB"
  assert not kernels_used or kernels_used <= max_kernels_allowed, f"{nm} used more than {max_kernels_allowed} kernels"
  if all_jitted:
    assert kernels_used > 0 and kernels_used == GlobalCounters.kernel_count, f"only {kernels_used} out of {GlobalCounters.kernel_count} were jitted"

class TestRealWorld(unittest.TestCase):
  def setUp(self):
    self.old_type = Tensor.default_type
    np.random.seed(2002)

  def tearDown(self):
    Tensor.default_type = self.old_type

  @unittest.skipIf(CI, "too big for CI")
  def test_stable_diffusion(self):
    model = UNetModel()
    derandomize_model(model)
    @TinyJit
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64), Tensor.randn(1, 77, 768)), test, 18.0, 967)

  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["LLVM"], "needs JIT, too long on CI LLVM")
  def test_llama(self):
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    model = LLaMaTransformer(**(args_tiny if CI else LLAMA_MODEL_PARAMS["1"]["7B"]["args"]))
    derandomize_model(model)
    @TinyJit
    def test(t): return model(t, 0).realize()
    # NOTE: only test one pass, not testing the dynamic shape autoregressive part
    helper_test("test_llama", lambda: (Tensor([[1,]]),), test, 0.22 if CI else 13.5, 126 if CI else 486, all_jitted=True)

  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM"] or not CI), "needs JIT, too long on CI LLVM")
  def test_gpt2(self):
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-5, "vocab_size": 1000}
    model = GPT2Transformer(**(args_tiny if CI else GPT2_MODEL_PARAMS["gpt2-medium"]))
    derandomize_model(model)
    @TinyJit
    def test(t): return model(t, 0).realize()
    helper_test("test_gpt2", lambda: (Tensor([[1,]]),), test, 0.21 if CI else 0.9, 129 if CI else 369, all_jitted=True)

  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM", "CLANG"] or not CI), "needs JIT, too long on CI LLVM and CLANG")
  def test_train_cifar(self):
    # TODO: with default device
    #old_default = Device.DEFAULT
    #Device.DEFAULT = "FAKE"
    #Device['fake'].codegen = Device[old_default].codegen

    with Tensor.train():
      model = SpeedyResNet(Tensor.ones((12,3,2,2)))
      optimizer = optim.SGD(get_parameters(model), lr=0.01, momentum=0.8, nesterov=True, weight_decay=0.15)

      BS = 32 if CI else 512

      @TinyJit
      def train(X):
        out = model(X)
        loss = out.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

      helper_test("train_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), train, (1.0/48)*BS, 154)  # it's 154 on metal

    # reset device
    #Device.DEFAULT = old_default

if __name__ == '__main__':
  unittest.main()
47
tinygrad_repo/test/models/test_rnnt.py
Normal file
@@ -0,0 +1,47 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from models.rnnt import LSTM
import torch

class TestRNNT(unittest.TestCase):
  def test_lstm(self):
    BS, SQ, IS, HS, L = 2, 20, 40, 128, 2

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.LSTM(IS, HS, L)

    # create in tinygrad
    layer = LSTM(IS, HS, L, 0.0)

    # copy weights
    with torch.no_grad():
      layer.cells[0].weights_ih.assign(Tensor(torch_layer.weight_ih_l0.numpy()))
      layer.cells[0].weights_hh.assign(Tensor(torch_layer.weight_hh_l0.numpy()))
      layer.cells[0].bias_ih.assign(Tensor(torch_layer.bias_ih_l0.numpy()))
      layer.cells[0].bias_hh.assign(Tensor(torch_layer.bias_hh_l0.numpy()))
      layer.cells[1].weights_ih.assign(Tensor(torch_layer.weight_ih_l1.numpy()))
      layer.cells[1].weights_hh.assign(Tensor(torch_layer.weight_hh_l1.numpy()))
      layer.cells[1].bias_ih.assign(Tensor(torch_layer.bias_ih_l1.numpy()))
      layer.cells[1].bias_hh.assign(Tensor(torch_layer.bias_hh_l1.numpy()))

    # test initial hidden
    for _ in range(3):
      x = Tensor.randn(SQ, BS, IS)
      z, hc = layer(x, None)
      torch_x = torch.tensor(x.numpy())
      torch_z, torch_hc = torch_layer(torch_x)
      np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

    # test passing hidden
    for _ in range(3):
      x = Tensor.randn(SQ, BS, IS)
      z, hc = layer(x, hc)
      torch_x = torch.tensor(x.numpy())
      torch_z, torch_hc = torch_layer(torch_x, torch_hc)
      np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

if __name__ == '__main__':
  unittest.main()
83
tinygrad_repo/test/models/test_train.py
Normal file
@@ -0,0 +1,83 @@
import unittest
import time
import numpy as np
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Device
from tinygrad.helpers import getenv
from extra.training import train
from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.transformer import Transformer
from models.vit import ViT
from models.resnet import ResNet18
import pytest

pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

BS = getenv("BS", 2)

def train_one_step(model, X, Y):
  params = get_parameters(model)
  pcount = 0
  for p in params:
    pcount += np.prod(p.shape)
  optimizer = optim.SGD(params, lr=0.001)
  print("stepping %r with %.1fM params bs %d" % (type(model), pcount/1e6, BS))
  st = time.time()
  train(model, X, Y, optimizer, steps=1, BS=BS)
  et = time.time() - st
  print("done in %.2f ms" % (et*1000.))

def check_gc():
  if Device.DEFAULT == "GPU":
    from extra.introspection import print_objects
    assert print_objects() == 0

class TestTrain(unittest.TestCase):
  def test_convnext(self):
    model = ConvNeXt(depths=[1], dims=[16])
    X = np.zeros((BS,3,224,224), dtype=np.float32)
    Y = np.zeros((BS), dtype=np.int32)
    train_one_step(model, X, Y)
    check_gc()

  def test_efficientnet(self):
    model = EfficientNet(0)
    X = np.zeros((BS,3,224,224), dtype=np.float32)
    Y = np.zeros((BS), dtype=np.int32)
    train_one_step(model, X, Y)
    check_gc()

  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "too many buffers for webgpu")
  def test_vit(self):
    model = ViT()
    X = np.zeros((BS,3,224,224), dtype=np.float32)
    Y = np.zeros((BS,), dtype=np.int32)
    train_one_step(model, X, Y)
    check_gc()

  def test_transformer(self):
    # this should be small GPT-2, but the param count is wrong
    # (real ff_dim is 768*4)
    model = Transformer(syms=10, maxlen=6, layers=12, embed_dim=768, num_heads=12, ff_dim=768//4)
    X = np.zeros((BS,6), dtype=np.float32)
    Y = np.zeros((BS,6), dtype=np.int32)
    train_one_step(model, X, Y)
    check_gc()

  def test_resnet(self):
    X = np.zeros((BS, 3, 224, 224), dtype=np.float32)
    Y = np.zeros((BS), dtype=np.int32)
    for resnet_v in [ResNet18]:
      model = resnet_v()
      model.load_from_pretrained()
      train_one_step(model, X, Y)
      check_gc()

  def test_bert(self):
    # TODO: write this
    pass

if __name__ == '__main__':
  unittest.main()
25
tinygrad_repo/test/models/test_waifu2x.py
Normal file
@@ -0,0 +1,25 @@
#!/usr/bin/env python
import pathlib
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device

class TestVGG7(unittest.TestCase):
  def test_vgg7(self):
    from examples.vgg7_helpers.waifu2x import Vgg7, image_load

    # Create in tinygrad
    Tensor.manual_seed(1337)
    mdl = Vgg7()
    mdl.load_from_pretrained()

    # Scale up an image
    test_x = image_load(pathlib.Path(__file__).parent / 'waifu2x/input.png')
    test_y = image_load(pathlib.Path(__file__).parent / 'waifu2x/output.png')
    scaled = mdl.forward_tiled(test_x, 156)
    scaled = np.fmax(0, np.fmin(1, scaled))
    np.testing.assert_allclose(scaled, test_y, atol=5e-3, rtol=5e-3)

if __name__ == '__main__':
  unittest.main()
25
tinygrad_repo/test/models/test_whisper.py
Normal file
@@ -0,0 +1,25 @@
import unittest
import pathlib
from tinygrad.ops import Device
from examples.whisper import init_whisper, transcribe_file

@unittest.skipUnless(Device.DEFAULT == "METAL", "Some non-metal backends spend too long trying to allocate a 20GB array")
class TestWhisper(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    model, enc = init_whisper("tiny.en")
    cls.model = model
    cls.enc = enc

  @classmethod
  def tearDownClass(cls):
    del cls.model
    del cls.enc

  def test_transcribe_file(self):
    # Audio generated with this command on macOS:
    # say "Could you please let me out of the box?" --file-format=WAVE --data-format=LEUI8@16000 -o test
    # We use the WAVE format because it's easier to decode in CI test environments
    filename = str(pathlib.Path(__file__).parent / "whisper/test.wav")
    transcription = transcribe_file(self.model, self.enc, filename)
    self.assertEqual("<|startoftranscript|><|notimestamps|> Could you please let me out of the box?<|endoftext|>", transcription)
BIN
tinygrad_repo/test/models/waifu2x/input.png
Normal file
Binary file not shown. Size: 7.1 KiB
BIN
tinygrad_repo/test/models/waifu2x/output.png
Normal file
Binary file not shown. Size: 15 KiB
BIN
tinygrad_repo/test/models/whisper/test.wav
Normal file
Binary file not shown.
136
tinygrad_repo/test/test_allocators.py
Normal file
@@ -0,0 +1,136 @@
#!/usr/bin/env python
import unittest
import numpy as np
from weakref import ref
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad.ops import Device
from tinygrad.tensor import Tensor

def check_gc():
  if Device.DEFAULT == "GPU":
    from extra.introspection import print_objects
    assert print_objects() == 0

class FakeDeviceBuffer:
  def __init__(self, sz, dt, device):
    self.id = 1
    self.size = sz
    self.dtype = dt
    self.device = device
  def __del__(self):
    assert self.id == 0, "_do_free() should have been called before deletion"

class FakeAllocator(LRUAllocator):
  def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
  def _do_free(self, buf):
    buf.id -= 1
    assert buf.id == 0, f"free should be called exactly once, but buffer id is {buf.id}"
  def __del__(self):  # Fake allocator should clear all buffers after each test.
    for v in self.cached_buffers.values():
      for buf, _ in v: self._free_buffer(buf)

FAKE_GLOBAL_ALLOCATOR = None
class FakeBuffer(RawBuffer):
  def __init__(self, size, dtype, device='0'):
    global FAKE_GLOBAL_ALLOCATOR
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
    assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires a 100% match of dtype and size."
  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)

def alloc(allocator, size, dtype, **kwargs):
  global FAKE_GLOBAL_ALLOCATOR
  FAKE_GLOBAL_ALLOCATOR = allocator
  buf = FakeBuffer(size, dtype, **kwargs)
  assert buf.dtype == dtype and buf.size == size
  FAKE_GLOBAL_ALLOCATOR = None
  return buf

def alloc_free_trace(allocator, size, dtype, **kwargs):
  buf = alloc(allocator, size, dtype, **kwargs)
  return ref(buf._buf)

def cmp_trace_and_buf(buf, trace_ref): return trace_ref and trace_ref() == buf._buf
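
# illustrative toy (added note, not part of the original file): the reuse
# behavior exercised below, sketched independently of tinygrad. Freed buffers
# are cached per (size, dtype, device) key and handed back on the next
# matching allocation:
#
#   from collections import defaultdict
#   cache = defaultdict(list)
#   def toy_alloc(key): return cache[key].pop() if cache[key] else object()
#   def toy_free(key, buf): cache[key].append(buf)
#   b = toy_alloc((16, "float32", "0")); toy_free((16, "float32", "0"), b)
#   assert toy_alloc((16, "float32", "0")) is b  # reused, as the tests expect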

class TestAllocators(unittest.TestCase):
  def test_lru_allocator_reusage(self):
    mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
    def test():
      lru_allocator = FakeAllocator(2048)
      traced_buf = alloc_free_trace(lru_allocator, 16, dtypes.float32)
      assert GlobalCounters.mem_cached - mc == 16*dtypes.float32.itemsize, "Buffer should be cached"
      for _ in range(32):
        def __test():
          buf = alloc(lru_allocator, 16, dtypes.float32)
          assert cmp_trace_and_buf(buf, traced_buf), "Buffer should be reused"
        __test()

      usedbuf = alloc(lru_allocator, 16, dtypes.float32)
      for _ in range(32):
        def __test():
          buf = alloc(lru_allocator, 16, dtypes.float32)
          assert usedbuf != buf, "Nobody should get the used buffer"
        __test()
      assert GlobalCounters.mem_used - mu == 16*dtypes.float32.itemsize, "Only usedbuf is still allocated."
    test()
    check_gc()

  def test_lru_allocator_cache_free(self):
    mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
    def test():
      lru_allocator = FakeAllocator(128)
      refs = []
      for _ in range(32):
        refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32))
      for sz in range(1, 32):
        alloc_free_trace(lru_allocator, sz, dtypes.float32)
        assert GlobalCounters.mem_used + GlobalCounters.mem_cached - mc - mu <= 128, "Should not allocate on device more than allowed (128)"
      for r in refs: assert r() is None, "All refs should be dead, since buffers were cleared from cache"
    test()
    check_gc()

  def test_lru_allocator_multidevice(self):
    def test():
      lru_allocator = FakeAllocator(256)
      refs = []
      for i in range(8):
        refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32, device=str(i)))
      for i in range(64):
        def __test():
          dev = str(i % 8)
          buf = alloc(lru_allocator, 16, dtypes.float32, device=dev)
          assert cmp_trace_and_buf(buf, refs[i%8]), "Buffer should be reused"
        __test()
      for r in refs: assert r() is not None, "All refs should be cached"
    test()
    check_gc()

  @unittest.skip("failing in CI")
  def test_gpu_copyout(self):
    def test():
      from tinygrad.runtime.ops_gpu import CL

      # Allocation to init the allocator.
      tx = Tensor.rand(1)
      tx.realize()
      free_space = CL.cl_allocator.free_space[tx.lazydata.realized._device]

      # Spawning 128mb objects to fill half of free_space
      will_allocate = free_space // 3
      trash_allocation_size = free_space // 2

      def sp():
        trash_buffer = Tensor.rand(trash_allocation_size // 4)
        trash_buffer.realize()
      sp()

      xx = Tensor.rand(will_allocate // 4)
      _ = xx.numpy()
    test()
    check_gc()

if __name__ == "__main__":
  unittest.main()
67
tinygrad_repo/test/test_assign.py
Normal file
@@ -0,0 +1,67 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.helpers import dtypes

N = 200  # has to be bigger than the cache to fail

class TestAssign(unittest.TestCase):
  def test_simple_assignment(self):
    a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
    b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
    a.realize()
    b.realize()
    ba1 = a.lazydata.realized
    bb1 = b.lazydata.realized
    a += b
    a.realize()
    ba2 = a.lazydata.realized
    assert ba1 == ba2 and ba1 != bb1
    np.testing.assert_allclose(a.numpy(), (np.arange(N*N)*2).reshape((N,N)))

  @unittest.skipIf(Device.DEFAULT == "CPU" or Device.DEFAULT == "TORCH", "questionable tests")
  def test_permuted_assignment(self):
    a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
    b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
    a.realize()
    b.realize()
    ba1 = a.lazydata.realized
    bb1 = b.lazydata.realized
    a = a.permute(1,0)
    a += b
    a.realize()
    ba2 = a.lazydata.realized
    assert ba1 != ba2 and ba1 != bb1
    np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

  def test_post_permuted_assignment(self):
    a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
    b = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
    a.realize()
    b.realize()
    #GlobalCounters.cache = []
    ba1 = a.lazydata.realized
    bb1 = b.lazydata.realized
    a.assign(a.permute(1,0) + b)  # this should not work!
    a.realize()
    ba2 = a.lazydata.realized
    # NOTE: don't test that it's assigned
    #assert ba1 == ba2 and ba1 != bb1
    np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))

  # TODO: is there a way to sneak in a permute such that it returns the wrong answer?

  def test_cast_assignment(self):
    a = Tensor(np.arange(N*N, dtype=np.float32)).reshape(N,N)
    a.realize()
    oba1 = a.lazydata.output_buffer
    a.assign(a.cast(dtypes.int32).realize())
    a.realize()
    oba2 = a.lazydata.output_buffer
    assert oba1 is None and oba2 is None
    np.testing.assert_allclose(a.numpy(), np.arange(N*N,dtype=np.int32).reshape((N,N)))

if __name__ == "__main__":
  unittest.main()
147
tinygrad_repo/test/test_conv.py
Normal file
@@ -0,0 +1,147 @@
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
import pytest

pytestmark = [pytest.mark.exclude_cuda]

class TestConv(unittest.TestCase):
  def test_simple(self):
    x = Tensor.ones(1,12,128,256).contiguous().realize()
    w = Tensor.ones(32,12,3,3).contiguous().realize()
    ret = x.conv2d(w, stride=(2,2), padding=(1,1)).numpy()
    # it's not 108 around the padding
    assert (ret[:, :, 1:-1, 1:-1] == 108).all()
    assert ret[0,0,0,0] == 48
    assert ret[0,0,0,1] == 72
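    # why these numbers (added note): with all-ones input and weights, each
    # output element just counts the in-bounds taps: 12 channels * 3*3 window
    # = 108 in the interior, 12 * 2*2 = 48 at a corner, and 12 * 2*3 = 72
    # along an edge (stride 2 with padding 1 clips the border windows)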

  def test_simple_rand(self):
    x = Tensor.rand(1,12,128,256)
    w = Tensor.rand(32,12,3,3)
    ret = x.conv2d(w, stride=(2,2), padding=(1,1)).numpy()

  def test_many_simple(self):
    x = Tensor(np.arange(8*2*8).reshape(1,8,2,8).astype(np.float32))
    #w = Tensor(np.arange(8*8*1*1).reshape(8,8,1,1).astype(np.float32))
    w = Tensor.eye(8).reshape((8,8,1,1))
    ret = x.conv2d(w, stride=(1,2), padding=(0,0)).numpy()
    print(ret)

  def test_lazycache(self):
    Tensor.no_grad = True
    x = Tensor.rand(1, 32)
    y = Tensor.rand(32)
    out = x + y.reshape((1,32,1)).reshape((1,32)) + y.reshape((1,32,1)).reshape((1,32))
    out.numpy()
    Tensor.no_grad = False

  def test_simple_biased(self):
    C = 8
    x = Tensor.rand(1,C,5,5)
    w = Tensor.eye(C).reshape((C,C,1,1))
    b = Tensor(np.arange(C).astype(np.float32))
    ret = Tensor.conv2d(x,w,b).relu().conv2d(w,b)

    print(ret.numpy())

  def test_two_binops_no_rerun(self):
    Tensor.no_grad = True
    x = Tensor.randn(1,12,128,256)
    w = Tensor.randn(32,12,3,3)
    out = x.conv2d(w, stride=(2,2), padding=(1,1))
    r1, r2 = out.relu(), (out-1)
    np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
    np.testing.assert_allclose(r2.numpy(), out.numpy() - 1)
    Tensor.no_grad = False

  def test_two_overlapping_binops_no_rerun(self):
    Tensor.no_grad = True
    x = Tensor.randn(1,12,128,256)
    w = Tensor.randn(32,12,3,3)
    out = x.conv2d(w, stride=(2,2), padding=(1,1))
    r1, r2 = out.relu(), out.elu()
    np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
    np.testing.assert_allclose(r2.numpy(), np.where(out.numpy() > 0, out.numpy(), (np.exp(out.numpy()) - 1)), atol=1e-5)
    Tensor.no_grad = False

  @unittest.skipIf(Device.DEFAULT != "TORCH", "Takes too long to compile for Compiled backends")
  def test_two_overlapping_binops_no_rerun_wino(self):
    Tensor.no_grad = True
    old_wino = Tensor.wino
    Tensor.wino = True
    x = Tensor.randn(1,4,16,16)
    w = Tensor.randn(6,4,3,3)
    out = x.conv2d(w, padding=(1,1))
    r1, r2 = out.relu(), out.elu()
    np.testing.assert_allclose(r1.numpy(), np.maximum(out.numpy(), 0))
    np.testing.assert_allclose(r2.numpy(), np.where(out.numpy() > 0, out.numpy(), (np.exp(out.numpy()) - 1)), atol=1e-5)
    Tensor.wino = old_wino
    Tensor.no_grad = False

  def test_first_three(self):
    Tensor.no_grad = True
    x = Tensor.rand(1,12,128,256)

    w = Tensor.rand(32,12,3,3)
    x = x.conv2d(w, stride=(2,2), padding=(1,1)).elu()

    w = Tensor.rand(32,1,3,3)
    x = x.conv2d(w, padding=(1,1), groups=32).elu()

    w = Tensor.rand(16,32,1,1)
    x = x.conv2d(w).elu()

    x = x.numpy()
    print(x.shape)
    Tensor.no_grad = False

  def test_elu(self):
    Tensor.no_grad = True
    x = Tensor.rand(1,12,128,256)

    w = Tensor.rand(32,12,3,3)
    x = x.conv2d(w, stride=(2,2), padding=(1,1))

    x = x.elu()

    w = Tensor.rand(32,1,3,3)
    x = x.conv2d(w, padding=(1,1), groups=32)
    out = x.numpy()
    Tensor.no_grad = False

  def test_reduce_relu(self):
    Tensor.no_grad = True
    x = Tensor.rand(1,12,128,256)
    x = x.sum(keepdim=True).relu()
    out = x.numpy()
    Tensor.no_grad = False

  def test_bias(self):
    Tensor.no_grad = True
    from tinygrad.nn import Conv2d
    x = Tensor.rand(1,12,128,256)
    c = Conv2d(12, 32, 3)
    x = c(x).relu()
    w = Tensor.uniform(32, 1, 3, 3)
    x = x.conv2d(w, groups=32)
    out = x.numpy()
    Tensor.no_grad = False

  def test_multiadd(self):
    w = Tensor.rand(32)
    x = Tensor.rand(32).relu()
    (w+x).numpy()

  def test_reorder(self):
    x = Tensor.rand(1,12,128,256)
    w = Tensor.rand(12,12,3,3)
    x = x.conv2d(w, padding=(1,1))
    print(x.shape)
    x = x.reshape((1, 12, 256, 128))
    x += 1
    x += 1
    x = x.reshape((1, 12, 128, 256))
    x.numpy()

if __name__ == '__main__':
  unittest.main()
27
tinygrad_repo/test/test_conv_shapetracker.py
Normal file
@@ -0,0 +1,27 @@
#!/usr/bin/env python
import unittest
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import Conv2d
from tinygrad.jit import CacheCollector
import pytest

pytestmark = pytest.mark.webgpu

#@unittest.skipUnless(Device.DEFAULT == "GPU", "Only GPU supports cache")
@unittest.skip("with JIT changes, you only get the raw buffer")
class TestConvShapetracker(unittest.TestCase):
  def test_conv_3x3_one_view(self):
    inp = Tensor.randn(1,16,10,10).realize()
    conv = Conv2d(16, 32, (3,3))
    conv(inp).realize()
    CacheCollector.start()
    conv(inp).realize()
    test = CacheCollector.finish()
    assert len(test) == 1, f"conv should only have one kernel {[x[0].name for x in test]}"
    print(test[0][0].prg)
    for arg in test[0][1]:
      print(arg.st)
      assert len(arg.st.views) == 1

if __name__ == '__main__':
  unittest.main()
107
tinygrad_repo/test/test_custom_function.py
Normal file
@@ -0,0 +1,107 @@
# this is an example of how you can write terrible DSP compute breaking ops like warpPerspective
# here we use a CUSTOM op to write atan2

import unittest
import numpy as np
from typing import Optional, Tuple
from tinygrad.helpers import prod, dtypes

# *** first, we implement the atan2 op at the lowest level ***
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
from tinygrad.lazy import LazyBuffer, create_lazybuffer
from tinygrad.ops import ASTRunner, Device
from tinygrad.shape.shapetracker import ShapeTracker
import pytest

pytestmark = pytest.mark.webgpu

# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
  assert a.device == "GPU" and b.device == "GPU", "gpu function requires GPUBuffers"
  assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
  ret.realized = Device[ret.device].buffer(prod(ret.shape), ret.dtype)
  ASTRunner("atan2_gpu", """
    __kernel void atan2_gpu(global float *c, global float *a, global float *b) {
      int idx = get_global_id(0);
      c[idx] = atan2(a[idx], b[idx]);
    }""", global_size=[prod(ret.shape)]).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret.realized, a.realized, b.realized])
  return ret.realized

def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
  return Device[ret.device].from_underlying(np.arctan2(a.realized._buf, b.realized._buf))

# *** second, we write the ATan2 mlop ***
# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
# In general, writing a backward function is also optional, but the backward pass won't work without one

from tinygrad.ops import LazyOp, LoadOps, BinaryOps, UnaryOps
from tinygrad.lazy import LazyBuffer
from tinygrad.tensor import Function

class ATan2(Function):
  def forward(self, a:LazyBuffer, b:LazyBuffer) -> LazyBuffer:
    assert prod(a.shape) == prod(b.shape) and a.device == b.device, "shape or device mismatch"
    self.a, self.b = a, b
    ast = LazyOp(LoadOps.CUSTOM, (a.contiguous(), b.contiguous()), {"GPU": atan2_gpu, "CPU": atan2_cpu}[a.device])
    return create_lazybuffer(a.device, ShapeTracker.from_shape(a.shape), LoadOps, ast, max(a.dtype, b.dtype))
  def backward(self, grad_output:LazyBuffer) -> Tuple[Optional[LazyBuffer], Optional[LazyBuffer]]:
    denom = (self.a.e(BinaryOps.MUL, self.a)).e(BinaryOps.ADD, self.b.e(BinaryOps.MUL, self.b))
    return grad_output.e(BinaryOps.MUL, self.b.e(BinaryOps.DIV, denom)) if self.needs_input_grad[0] else None, \
           grad_output.e(BinaryOps.MUL, self.a.const(0).e(BinaryOps.SUB, self.a).e(BinaryOps.DIV, denom)) if self.needs_input_grad[1] else None
|
||||
|
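
# sanity check on the backward above: the standard partials are
#   d/da atan2(a, b) =  b / (a^2 + b^2)
#   d/db atan2(a, b) = -a / (a^2 + b^2)
# which is exactly what backward computes: denom = a*a + b*b,
# grad_a = grad_output * b / denom, grad_b = grad_output * (0 - a) / denom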

# *** third, we use our lovely new mlop in some tests ***

from tinygrad.tensor import Tensor

@unittest.skipUnless(Device.DEFAULT in ["CPU", "GPU"], "atan2 is only implemented for CPU and GPU")
class TestCustomFunction(unittest.TestCase):
  def test_atan2_forward(self):
    # create some random Tensors, permute them just because we can
    a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
    b = Tensor.randn(4,4,requires_grad=True).permute(1,0)

    # run the forward pass. note: up until the .numpy(), it's all lazy
    c = ATan2.apply(a, b)
    print(c.numpy())

    # check the forward pass (in numpy)
    np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)

  # fun fact, this never actually calls forward, so it works in all the backends
  def test_atan2_backward(self):
    # have to go forward before we can go backward
    a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
    b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
    c = ATan2.apply(a, b)

    # run the backward pass
    c.mean().backward()
    assert a.grad is not None and b.grad is not None, "tinygrad didn't compute gradients"
    print(a.grad.numpy())
    print(b.grad.numpy())

    # check the backward pass (in torch)
    import torch
    ta, tb = torch.tensor(a.numpy(), requires_grad=True), torch.tensor(b.numpy(), requires_grad=True)
    tc = torch.atan2(ta, tb)
    tc.mean().backward()
    assert ta.grad is not None and tb.grad is not None, "torch didn't compute gradients"
    np.testing.assert_allclose(a.grad.numpy(), ta.grad.numpy(), atol=1e-5)
    np.testing.assert_allclose(b.grad.numpy(), tb.grad.numpy(), atol=1e-5)

  def test_atan2_jit(self):
    # custom ops even work in the JIT!
    from tinygrad.jit import TinyJit

    @TinyJit
    def jitted_atan2(a:Tensor, b:Tensor) -> Tensor:
      return ATan2.apply(a, b).realize()

    for _ in range(5):
      a = Tensor.randn(4,4,requires_grad=True).permute(1,0)
      b = Tensor.randn(4,4,requires_grad=True).permute(1,0)
      c = jitted_atan2(a, b)
      np.testing.assert_allclose(c.numpy(), np.arctan2(a.numpy(), b.numpy()), atol=1e-5)

if __name__ == "__main__":
  unittest.main()
182
tinygrad_repo/test/test_dtype.py
Normal file
@@ -0,0 +1,182 @@
import unittest
import numpy as np
from tinygrad.helpers import CI, DTYPES_DICT, getenv, DType, DEBUG, ImageDType, PtrDType
from tinygrad.ops import Device
from tinygrad.tensor import Tensor, dtypes
from typing import Any, List
from extra.utils import OSX, temp

def is_dtype_supported(dtype: DType):
  # for GPU, cl_khr_fp16 isn't supported (except now we don't need it!)
  # for LLVM, it segfaults because it can't link to the casting function
  if dtype == dtypes.half: return not (CI and Device.DEFAULT in ["GPU", "LLVM"]) and Device.DEFAULT != "WEBGPU" and getenv("CUDACPU") != 1
  if dtype == dtypes.bfloat16: return False  # numpy doesn't support bf16, tested separately in TestBFloat16DType
  if dtype == dtypes.float64: return Device.DEFAULT not in ["WEBGPU", "METAL"] and not OSX
  if dtype in [dtypes.int8, dtypes.uint8]: return Device.DEFAULT not in ["WEBGPU"]
  if dtype in [dtypes.int16, dtypes.uint16]: return Device.DEFAULT not in ["WEBGPU", "TORCH"]
  if dtype == dtypes.uint32: return Device.DEFAULT not in ["TORCH"]
  if dtype in [dtypes.int64, dtypes.uint64]: return Device.DEFAULT not in ["WEBGPU", "TORCH"]
  if dtype == dtypes.bool:
    # host-shareability is a requirement for storage buffers, but the 'bool' type is not host-shareable
    if Device.DEFAULT == "WEBGPU": return False
    # TODO: remove triton from here once internal casting is fixed. CAST of fp32s between 0-1 is broken in triton
    if getenv("TRITON") == 1: return False
  return True

def get_available_cast_dtypes(dtype: DType) -> List[DType]: return [v for k, v in DTYPES_DICT.items() if v != dtype and is_dtype_supported(v) and not k.startswith("_")]  # don't cast internal dtypes

def _test_to_np(a:Tensor, np_dtype, target):
  if DEBUG >= 2: print(a)
  na = a.numpy()
  if DEBUG >= 2: print(na, na.dtype, a.lazydata.realized)
  try:
    assert na.dtype == np_dtype
    np.testing.assert_allclose(na, target)
  except AssertionError as e:
    raise AssertionError(f"\ntensor {a.numpy()} does not match target {target} with np_dtype {np_dtype}") from e

def _assert_eq(tensor:Tensor, target_dtype:DType, target):
  if DEBUG >= 2: print(tensor.numpy())
  try:
    assert tensor.dtype == target_dtype
    np.testing.assert_allclose(tensor.numpy(), target)
  except AssertionError as e:
    raise AssertionError(f"\ntensor {tensor.numpy()} dtype {tensor.dtype} does not match target {target} with dtype {target_dtype}") from e

def _test_op(fxn, target_dtype:DType, target): _assert_eq(fxn(), target_dtype, target)
def _test_cast(a:Tensor, target_dtype:DType): _test_op(lambda: a.cast(target_dtype), target_dtype, a.numpy().astype(target_dtype.np).tolist())
def _test_bitcast(a:Tensor, target_dtype:DType, target): _test_op(lambda: a.bitcast(target_dtype), target_dtype, target)

class TestDType(unittest.TestCase):
  DTYPE: Any = None
  DATA: Any = None
  @classmethod
  def setUpClass(cls):
    if not is_dtype_supported(cls.DTYPE): raise unittest.SkipTest("dtype not supported")
    cls.DATA = np.random.randint(0, 100, size=10, dtype=cls.DTYPE.np).tolist() if dtypes.is_int(cls.DTYPE) else np.random.choice([True, False], size=10).tolist() if cls.DTYPE == dtypes.bool else np.random.uniform(0, 1, size=10).tolist()
  def setUp(self):
    if self.DTYPE is None: raise unittest.SkipTest("base class")

  def test_to_np(self): _test_to_np(Tensor(self.DATA, dtype=self.DTYPE), self.DTYPE.np, np.array(self.DATA, dtype=self.DTYPE.np))

  def test_casts_to(self): list(map(
    lambda dtype: _test_cast(Tensor(self.DATA, dtype=dtype), self.DTYPE),
    get_available_cast_dtypes(self.DTYPE)
  ))
  def test_casts_from(self): list(map(
    lambda dtype: _test_cast(Tensor(self.DATA, dtype=self.DTYPE), dtype),
    get_available_cast_dtypes(self.DTYPE)
  ))

  def test_upcast_ops(self): list(map(
    lambda dtype: _test_ops(a_dtype=self.DTYPE, b_dtype=dtype, target_dtype=dtype) if dtype.sz > self.DTYPE.sz else None,
    get_available_cast_dtypes(self.DTYPE)
  ))
  def test_upcast_to_ops(self): list(map(
    lambda dtype: _test_ops(a_dtype=dtype, b_dtype=self.DTYPE, target_dtype=self.DTYPE) if dtype.sz < self.DTYPE.sz else None,
    get_available_cast_dtypes(self.DTYPE)
  ))

def _test_ops(a_dtype:DType, b_dtype:DType, target_dtype:DType):
  if not is_dtype_supported(a_dtype) or not is_dtype_supported(b_dtype): raise unittest.SkipTest("dtype not supported")
  _assert_eq(Tensor([1,2,3,4], dtype=a_dtype)+Tensor([1,2,3,4], dtype=b_dtype), target_dtype, [2,4,6,8])
  _assert_eq(Tensor([1,2,3,4], dtype=a_dtype)*Tensor([1,2,3,4], dtype=b_dtype), target_dtype, [1,4,9,16])
  _assert_eq(Tensor([[1,2],[3,4]], dtype=a_dtype)@Tensor.eye(2, dtype=b_dtype), target_dtype, [[1,2],[3,4]])
  _assert_eq(Tensor([1,1,1,1], dtype=a_dtype)+Tensor.ones((4,4), dtype=b_dtype), target_dtype, 2*Tensor.ones(4,4).numpy())
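
# an example of what the upcast tests above encode: mixing dtypes takes the
# larger dtype, e.g. adding int8 and int32 data should yield int32, as in
#   _test_ops(a_dtype=dtypes.int8, b_dtype=dtypes.int32, target_dtype=dtypes.int32)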

class TestBFloat16DType(unittest.TestCase):
  def setUp(self):
    if not is_dtype_supported(dtypes.bfloat16): raise unittest.SkipTest("bfloat16 not supported")
  def test_bf16_to_float(self):
    with self.assertRaises(AssertionError):
      _test_cast(Tensor([100000], dtype=dtypes.bfloat16), dtypes.float32, [100000])

  def test_float_to_bf16(self):
    with self.assertRaises(AssertionError):
      _test_cast(Tensor([100000], dtype=dtypes.float32), dtypes.bfloat16, [100000])

  # torch.tensor([10000, -1, -1000, -10000, 20]).type(torch.bfloat16)

  def test_bf16(self):
    t = Tensor([10000, -1, -1000, -10000, 20]).cast(dtypes.bfloat16)
    t.realize()
    back = t.cast(dtypes.float32)
    assert tuple(back.numpy().tolist()) == (9984., -1, -1000, -9984, 20)

  def test_bf16_disk_write_read(self):
    t = Tensor([10000, -1, -1000, -10000, 20]).cast(dtypes.float32)
    t.to(f"disk:{temp('f32')}").realize()

    # hack to "cast" f32 -> bf16: keep only the top two bytes of each float
    dat = open(temp('f32'), "rb").read()
    adat = b''.join([dat[i+2:i+4] for i in range(0, len(dat), 4)])
    with open(temp('bf16'), "wb") as f: f.write(adat)

    t = Tensor.empty(5, dtype=dtypes.bfloat16, device=f"disk:{temp('bf16')}").llvm().realize()
    back = t.cast(dtypes.float32)
    assert tuple(back.numpy().tolist()) == (9984., -1, -1000, -9984, 20)

class TestHalfDtype(TestDType): DTYPE = dtypes.half

class TestFloatDType(TestDType): DTYPE = dtypes.float

class TestDoubleDtype(TestDType): DTYPE = dtypes.double

class TestInt8Dtype(TestDType):
  DTYPE = dtypes.int8
  @unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
  def test_int8_to_uint8_negative(self): _test_op(lambda: Tensor([-1, -2, -3, -4], dtype=dtypes.int8).cast(dtypes.uint8), dtypes.uint8, [255, 254, 253, 252])

class TestUint8Dtype(TestDType):
  DTYPE = dtypes.uint8
  @unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
  def test_uint8_to_int8_overflow(self): _test_op(lambda: Tensor([255, 254, 253, 252], dtype=dtypes.uint8).cast(dtypes.int8), dtypes.int8, [-1, -2, -3, -4])

@unittest.skipIf(Device.DEFAULT not in {"CPU", "TORCH"}, "only bitcast in CPU and TORCH")
class TestBitCast(unittest.TestCase):
  def test_float32_bitcast_to_int32(self): _test_bitcast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.int32, [1065353216, 1073741824, 1077936128, 1082130432])
  @unittest.skipIf(Device.DEFAULT == "TORCH", "no uint32 in torch")
  def test_float32_bitcast_to_uint32(self): _test_bitcast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.uint32, [1065353216, 1073741824, 1077936128, 1082130432])
  def test_int32_bitcast_to_float32(self): _test_bitcast(Tensor([1065353216, 1073741824, 1077936128, 1082130432], dtype=dtypes.int32), dtypes.float32, [1.0, 2.0, 3.0, 4.0])

  # NOTE: these are the same as normal casts
  def test_int8_bitcast_to_uint8(self): _test_bitcast(Tensor([-1, -2, -3, -4], dtype=dtypes.int8), dtypes.uint8, [255, 254, 253, 252])
  def test_uint8_bitcast_to_int8(self): _test_bitcast(Tensor([255, 254, 253, 252], dtype=dtypes.uint8), dtypes.int8, [-1, -2, -3, -4])
  @unittest.skipIf(Device.DEFAULT == "TORCH", "no uint64 in torch")
  def test_int64_bitcast_to_uint64(self): _test_bitcast(Tensor([-1, -2, -3, -4], dtype=dtypes.int64), dtypes.uint64, [18446744073709551615, 18446744073709551614, 18446744073709551613, 18446744073709551612])
  @unittest.skipIf(Device.DEFAULT == "TORCH", "no uint64 in torch")
  def test_uint64_bitcast_to_int64(self): _test_bitcast(Tensor([18446744073709551615, 18446744073709551614, 18446744073709551613, 18446744073709551612], dtype=dtypes.uint64), dtypes.int64, [-1, -2, -3, -4])

  def test_shape_change_bitcast(self):
    with self.assertRaises(AssertionError):
      _test_bitcast(Tensor([100000], dtype=dtypes.float32), dtypes.uint8, [100000])

class TestInt16Dtype(TestDType): DTYPE = dtypes.int16
class TestUint16Dtype(TestDType): DTYPE = dtypes.uint16

class TestInt32Dtype(TestDType): DTYPE = dtypes.int32
class TestUint32Dtype(TestDType): DTYPE = dtypes.uint32

class TestInt64Dtype(TestDType): DTYPE = dtypes.int64
class TestUint64Dtype(TestDType): DTYPE = dtypes.uint64

class TestBoolDtype(TestDType): DTYPE = dtypes.bool

class TestEqStrDType(unittest.TestCase):
  def test_image_ne(self):
    assert dtypes.float == dtypes.float32, "float doesn't match?"
    assert dtypes.imagef((1,2,4)) != dtypes.imageh((1,2,4)), "different image dtypes shouldn't match"
    assert dtypes.imageh((1,2,4)) != dtypes.imageh((1,4,2)), "different shapes shouldn't match"
    assert dtypes.imageh((1,2,4)) == dtypes.imageh((1,2,4)), "same shape matches"
    assert isinstance(dtypes.imageh((1,2,4)), ImageDType)
  def test_ptr_ne(self):
    # TODO: is this the wrong behavior?
    assert PtrDType(dtypes.float32) == dtypes.float32
    #assert PtrDType(dtypes.float32) == PtrDType(dtypes.float32)
    #assert PtrDType(dtypes.float32) != dtypes.float32
  def test_strs(self):
    self.assertEqual(str(dtypes.imagef((1,2,4))), "dtypes.imagef((1, 2, 4))")
    self.assertEqual(str(PtrDType(dtypes.float32)), "ptr.dtypes.float")

if __name__ == '__main__':
  unittest.main()
37
tinygrad_repo/test/test_gc.py
Normal file
@@ -0,0 +1,37 @@
#!/usr/bin/env python
import gc
import unittest
import numpy as np
from tinygrad.tensor import Tensor

def tensors_allocated():
  return sum([isinstance(x, Tensor) for x in gc.get_objects()])

class TestGC(unittest.TestCase):

  def test_gc(self):
    a = Tensor.zeros(4, 4, requires_grad=True)
    b = Tensor.zeros(4, 4, requires_grad=True)
    (a*b).mean().backward()
    assert tensors_allocated() > 0
    del a, b
    assert tensors_allocated() == 0

  def test_gc_complex(self):
    a = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
    b = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
    assert tensors_allocated() == 2
    (a*b).mean().backward()
    assert tensors_allocated() == 4  # a, b, and their two gradients
    del b
    assert tensors_allocated() == 2
    b = Tensor(np.zeros((4, 4), dtype=np.float32), requires_grad=True)
    print(tensors_allocated())
    (a*b).mean().backward()
    print(tensors_allocated())
    assert tensors_allocated() == 4
    del b
    assert tensors_allocated() == 2

if __name__ == '__main__':
  unittest.main()
194
tinygrad_repo/test/test_jit.py
Normal file
@@ -0,0 +1,194 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
import pytest

pytestmark = pytest.mark.webgpu

# NOTE: METAL fails, might be platform and optimization options dependent.
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["METAL", "WEBGPU"], f"no JIT on {Device.DEFAULT}")
class TestJit(unittest.TestCase):
  def test_simple_jit(self):
    @TinyJit
    def add(a, b): return (a+b).realize()
    for _ in range(5):
      a = Tensor.randn(10, 10)
      b = Tensor.randn(10, 10)
      c = add(a, b)
      np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
    assert len(add.jit_cache) == 1
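    # why jit_cache has length 1: a single elementwise add lowers to one kernel,
    # and after the warm-up calls TinyJit replays that cached kernel with fresh
    # input buffers instead of re-tracing (our reading of TinyJit's behavior;
    # the exact warm-up count is an implementation detail)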

  def test_jit_multiple_outputs(self):
    @TinyJit
    def f(a, b): return (a+b).realize(), (a-b).realize(), (a*b).realize()
    for _ in range(5):
      a = Tensor.randn(10, 10)
      b = Tensor.randn(10, 10)
      c, d, e = f(a, b)
      np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
      np.testing.assert_allclose(d.numpy(), a.numpy()-b.numpy(), atol=1e-4, rtol=1e-5)
      np.testing.assert_allclose(e.numpy(), a.numpy()*b.numpy(), atol=1e-4, rtol=1e-5)
    assert len(f.jit_cache) == 3

  def test_nothing_jitted(self):
    @TinyJit
    def add(a, b): return a+b
    with self.assertRaises(AssertionError):
      for _ in range(5):
        a = Tensor.randn(10, 10)
        b = Tensor.randn(10, 10)
        c = add(a, b)

  def test_jit_shape_mismatch(self):
    @TinyJit
    def add(a, b): return (a+b).realize()
    for _ in range(5):
      a = Tensor.randn(10, 10)
      b = Tensor.randn(10, 10)
      c = add(a, b)
    bad = Tensor.randn(20, 20)
    with self.assertRaises(AssertionError):
      add(a, bad)

  def test_jit_shape_views_mismatch(self):
    @TinyJit
    def add(a): return (a+1).realize()
    with self.assertRaises(AssertionError):
      for i in range(1,5):
        # a has an offset that the kernel doesn't know about
        a = Tensor.randn(10, 10).realize()[:, i:i+2]
        add(a)

  def test_jit_duplicate_fail(self):
    # the jit doesn't support duplicate arguments
    @TinyJit
    def add(a, b): return (a+b).realize()
    a = Tensor.randn(10, 10)
    with self.assertRaises(AssertionError):
      add(a, a)

  def test_kwargs_jit(self):
    @TinyJit
    def add_kwargs(first, second): return (first+second).realize()
    for _ in range(5):
      a = Tensor.randn(10, 10)
      b = Tensor.randn(10, 10)
      c = add_kwargs(first=a, second=b)
      np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
    assert len(add_kwargs.jit_cache) == 1

  def test_array_jit(self):
    @TinyJit
    def add_array(a, arr): return (a+arr[0]).realize()
    for i in range(5):
      a = Tensor.randn(10, 10)
      b = Tensor.randn(10, 10)
      a.realize(), b.realize()
      c = add_array(a, [b])
      if i >= 2:
        # should fail once jitted since the jit can't handle arrays
        np.testing.assert_allclose(np.any(np.not_equal(c.numpy(),a.numpy()+b.numpy())), True, atol=1e-4, rtol=1e-5)
      else:
        np.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
    assert len(add_array.jit_cache) == 1

  def test_method_jit(self):
    class Fun:
      def __init__(self):
        self.a = Tensor.randn(10, 10)
      @TinyJit
      def __call__(self, b:Tensor) -> Tensor:
        return (self.a+b).realize()
    fun = Fun()
    for _ in range(5):
      b = Tensor.randn(10, 10)
      c = fun(b)
      np.testing.assert_allclose(c.numpy(), fun.a.numpy()+b.numpy(), atol=1e-4, rtol=1e-5)
    assert len(fun.__call__.func.__self__.jit_cache) == 1

  def test_jit_size1_input(self):
    @TinyJit
    def f(a, b): return (a+b).realize()
    a = Tensor([1, 2, 3])
    for i in range(5):
      np.testing.assert_allclose(f(a, Tensor([i])).numpy(), (a+i).numpy(), atol=1e-4, rtol=1e-5)
    assert len(f.jit_cache) == 1

  def test_jit_output_non_tensor_fail(self):
    @TinyJit
    def f(a, b, i): return (a+b).realize(), i
    output1, output2 = [], []
    expect1, expect2 = [], []
    for i in range(5):
      a = Tensor.randn(10, 10)
      b = Tensor.randn(10, 10)
      o1, o2 = f(a, b, i)
      output1.append(o1.numpy().copy())
      output2.append(o2)
      expect1.append(a.numpy().copy()+b.numpy().copy())
      expect2.append(i)
    np.testing.assert_allclose(output1, expect1, atol=1e-4, rtol=1e-5)
    # the jit only works with Tensor outputs
    assert output2 != expect2
    assert len(f.jit_cache) == 1

  @unittest.skip("random isn't working in JIT")
  def test_jit_random_regen(self):
    def f(a, b):
      rn = Tensor.randn(*a.shape)
      return ((a+b)*rn).realize()
    a = Tensor.randn(10, 10)
    b = Tensor.randn(10, 10)

    Tensor._seed = 1234
    jf = TinyJit(f)
    res = set()
    for _ in range(5):
      o1 = jf(a, b)
      res.add(o1.numpy()[0][0])
    assert len(res) == 5, "All values should be different, rand works in jit."

    Tensor._seed = 1234
    jf2 = TinyJit(f)
    res2 = set()
    for _ in range(5):
      o1 = jf2(a, b)
      res2.add(o1.numpy()[0][0])
    assert len(res2) == 5, "All values should be different, rand works in jit."
    assert res == res2, "Jit rand is not reproducible with the same seed"

    Tensor._seed = 3421
    jf3 = TinyJit(f)
    res3 = set()
    for _ in range(5):
      o1 = jf3(a, b)
      res3.add(o1.numpy()[0][0])
    assert len(res3) == 5, "All values should be different, rand works in jit."
    assert res3 != res2, "Jit rand should differ with different seeds"

  def test_jit_realization_and_sampling(self):
    w = Tensor.eye(5)

    @TinyJit
    def foo(x): return w.dot(x).realize()

    arg = [
      Tensor([1,2,3,4,5]),
      Tensor([1,3,3,4,6]),
      Tensor([1,2,5,4,7]),
      Tensor([0,2,3,1,0]),
    ]

    Y = [foo(e).numpy() for e in arg]

    foo(Tensor([7,7,7,7,7]))
    want = [[1., 2., 3., 4., 5.],
            [1., 3., 3., 4., 6.],
            [1., 2., 5., 4., 7.],
            [0., 2., 3., 1., 0.]]
    np.testing.assert_allclose(want, Y)

if __name__ == '__main__':
  unittest.main()
54
tinygrad_repo/test/test_kernel_cache.py
Normal file
@@ -0,0 +1,54 @@
#!/usr/bin/env python
import unittest
import secrets
import string
from tinygrad.tensor import Tensor
from tinygrad.ops import Device
from tinygrad.helpers import diskcache

def generate_random_string(length=16):
  alphabet = string.ascii_letters + string.digits
  return ''.join(secrets.choice(alphabet) for _ in range(length))

compile_call_count = 0

@diskcache
def helper_test_compile(prg:str) -> bytes:
  global compile_call_count
  compile_call_count += 1
  return prg.encode()
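
# a minimal sketch of the idea behind a @diskcache-style decorator (illustrative
# only, not tinygrad's actual implementation): key the call by its arguments and
# return the stored result instead of calling the function again
#
#   def diskcache_sketch(fn):
#     table = {}  # imagine this dict persisted to disk
#     def wrapper(*args):
#       key = repr(args)
#       if key not in table: table[key] = fn(*args)
#       return table[key]
#     return wrapper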

class TestKernelCache(unittest.TestCase):
  def test_compile_cache(self):
    prg1 = generate_random_string(64) + "a"
    prg2 = generate_random_string(64) + "b"
    cold_compile_res = helper_test_compile(prg1)
    warm_compile_res = helper_test_compile(prg1)
    assert cold_compile_res == warm_compile_res == prg1.encode()
    assert compile_call_count == 1

    prg2_res = helper_test_compile(prg2)
    assert prg2_res == prg2.encode()
    assert compile_call_count == 2

  def test_kernel_cache_in_action(self):
    if Device.DEFAULT not in ["CLANG"]:
      self.skipTest("No custom kernel cache is implemented")

    a = Tensor.rand(4,4)
    b = Tensor.rand(4,4)
    x = a + b
    x.realize()

    orig_compile_func = Device['CLANG'].compiler
    Device['CLANG'].compiler = None  # making it not callable

    a1 = Tensor.rand(4,4)
    b1 = Tensor.rand(4,4)
    x1 = a1 + b1
    x1.realize()  # the same kernel should come from the cache

    Device['CLANG'].compiler = orig_compile_func

if __name__ == "__main__":
  unittest.main()
73
tinygrad_repo/test/test_lazybuffer.py
Normal file
@@ -0,0 +1,73 @@
#!/usr/bin/env python
import numpy as np
import unittest
from tinygrad.lazy import LazyBuffer
from tinygrad.ops import Device
from tinygrad.tensor import Tensor
from tinygrad.shape.symbolic import Variable
from tinygrad.jit import CacheCollector

class TestLazyBuffer(unittest.TestCase):
  def test_fromcpu_buffer_sharing(self):
    a = np.arange(8)
    assert LazyBuffer.fromCPU(a).realized._buf is a
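    # this checks zero-copy behavior: fromCPU wraps the numpy array as the
    # realized buffer rather than copying it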

  def test_fromcpu_shape_tracker(self):
    def helper(a: np.ndarray):
      print(a.shape, a.strides, a.flags.c_contiguous)
      b = LazyBuffer.fromCPU(a)
      #assert b.st.contiguous == a.flags.c_contiguous
      assert b.st.shape == a.shape
      np.testing.assert_equal(a, Tensor(b).numpy())

    for ndims in range(1, 4):
      a = np.random.randn(*(4,)*ndims).astype(np.float32)
      for stride in [-2, 1, 2]:
        for start in [0, 1]:
          helper(a[(slice(start, None, stride),)*ndims])

  def test_shuffle_pad_ops_cmpeq(self):
    y = Tensor([1]).cat(Tensor([1]) == 0).numpy()
    z = Tensor([1, 0]).numpy()
    np.testing.assert_allclose(y, z)

  def test_shuffle_pad_ops_div(self):
    y = Tensor([1]).cat(Tensor([1]).div(Tensor([2.0]))).numpy()
    z = Tensor([1, 0.5]).numpy()
    np.testing.assert_allclose(y, z)

  def test_shuffle_pad_ops_log(self):
    y = Tensor([1]).cat(Tensor([1]).log()).numpy()
    z = Tensor([1, 0]).numpy()
    np.testing.assert_allclose(y, z)

  def test_shuffle_pad_ops_exp(self):
    y = Tensor([1]).cat(Tensor([1]).exp()).numpy()
    z = Tensor([1, np.e]).numpy()
    np.testing.assert_allclose(y, z)

  @unittest.skipUnless(Device.DEFAULT in ["METAL", "CUDA", "GPU"], "only GPU backends support cache")
  def test_children_count(self):
    a = Tensor.ones(8,8,8)
    d1 = a.sum((0))
    d2 = a.sum((0)).reshape(32,2)
    assert len(d1.lazydata.op.src[0].children) == 1
    in1 = d1.reshape(16,4)
    d3 = in1.reshape(8,8)
    assert len(d3.lazydata.op.src[0].children) == 2

    CacheCollector.start()
    l = Tensor.ones(8,8)
    r = Tensor.ones(8,8)
    dd = d1 + l
    dd.realize()
    de = d3 + r
    de.realize()
    cache = CacheCollector.finish()
    assert len(cache) == 3
    assert cache[0][0].name.startswith("r_")  # the reduce should not be merged twice
    assert cache[1][0].name.startswith("E_")
    assert cache[2][0].name.startswith("E_")

if __name__ == "__main__":
  unittest.main()
21
tinygrad_repo/test/test_lazyop.py
Normal file
@@ -0,0 +1,21 @@
import unittest
from tinygrad.tensor import Tensor

# stuff needed to unpack a kernel
from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
from tinygrad.helpers import dtypes
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.shape.symbolic import Variable
inf, nan = float('inf'), float('nan')  # so eval(str(ast)) below can rebuild ASTs containing inf/nan constants

class TestLazyOp(unittest.TestCase):
  def test_lazyop_str(self):
    t = Tensor.rand(10) + Tensor.rand(10)
    s = t.lazydata.schedule()
    ast = s[-1].ast
    ast_remade = eval(str(ast))
    self.assertEqual(ast, ast_remade)

if __name__ == '__main__':
  unittest.main()
492
tinygrad_repo/test/test_linearizer.py
Normal file
@@ -0,0 +1,492 @@
import numpy as np
import unittest, os

from tinygrad.codegen.kernel import Opt, OptOps, tensor_cores
from tinygrad.codegen.linearizer import Linearizer, UOps
from tinygrad.ops import Compiled, Device, LoadOps
from tinygrad.tensor import Tensor
from tinygrad.jit import CacheCollector
from tinygrad.realize import run_schedule
from tinygrad.helpers import dtypes, prod

class TestLinearizer(unittest.TestCase):
  def test_arg_dedup(self):
    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Only Compiled supports cache")
    a, b = Tensor.randn(4), Tensor.randn(4)
    np_a, np_b = a.numpy(), b.numpy()
    CacheCollector.start()
    c = ((a.shrink(((0, 2),)) - a.shrink(((2, 4),))) - (b.shrink(((0, 2),)) - b.shrink(((2, 4),)))).realize()
    rawbufs = CacheCollector.finish()[0][1]
    assert len(rawbufs) == 3 and set(rawbufs[1:]) == {a.lazydata.realized, b.lazydata.realized}
    np_c = (np_a[:2] - np_a[2:]) - (np_b[:2] - np_b[2:])
    np.testing.assert_allclose(np_c, c.numpy(), atol=1e-4, rtol=1e-4)

  def test_load_dedup(self):
    # for different leaves in the AST, the same loads may occur.

    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Only Compiled uses linearizer")

    a = Tensor.randn(4).realize()
    # these are of size 3 to avoid float4 coalesce
    r = a[:-1] + a[1:]

    k = Linearizer(r.lazydata.schedule()[-1].ast)
    k.upcast()
    k.linearize()
    num_loads = len([uop for uop in k.uops if uop.uop == UOps.LOAD])
    assert num_loads <= 4, "more load uops than needed"
    assert num_loads >= 4, "unexpected number of uops, maybe this test needs updating?"

  def test_upcast_cse(self):
    # when upcasting, within a subtree, there may be common expressions.

    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Only Compiled uses linearizer")

    a, b = Tensor.randn(1).realize(), Tensor.randn(1).realize()
    r = a.expand([2]) + b.expand([2])

    k = Linearizer(r.lazydata.schedule()[-1].ast)
    k.upcast()
    k.linearize()
    num_ops = len([uop for uop in k.uops if uop.uop == UOps.ALU])
    assert num_ops <= 1, "more alu uops than needed"

  def test_zero_fold(self):
    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Only Compiled uses linearizer")

    a, b = Tensor.randn(1).realize(), Tensor.randn(1).realize()
    r = Tensor.stack([a, b])

    k = Linearizer(r.lazydata.schedule()[-1].ast)
    k.upcast()
    k.linearize()
    num_ops = len([uop for uop in k.uops if uop.uop == UOps.ALU])
    assert num_ops == 0, "more alu uops than needed"

  @unittest.skip("constant folding not supported yet")
  def test_constant_fold(self):
    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Only Compiled uses linearizer")

    a, b = Tensor(2), Tensor(3)
    r = a * b

    k = Linearizer(r.lazydata.schedule()[-1][0])
    k.linearize()
    num_ops = len([uop for uop in k.uops if uop.uop in [UOps.LOAD, UOps.ALU]])
    assert num_ops <= 0, "more load or alu uops than needed"

  def test_tensor_cores(self):
    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Only Compiled uses linearizer")
    if Device.DEFAULT not in tensor_cores:
      self.skipTest("No tensor cores for device")

    for tc in tensor_cores[Device.DEFAULT]:
      if tc.arch is not None and tc.arch != os.uname().machine: continue
      a, b = Tensor.rand(tc.dims[0], tc.dims[2], dtype=tc.dtype_in), Tensor.rand(tc.dims[2], tc.dims[1], dtype=tc.dtype_in)
      np_a, np_b = a.numpy(), b.numpy()
      if tc.dtype_out != tc.dtype_in:
        r = (a.reshape(tc.dims[0], 1, tc.dims[2]) * b.permute(1,0).reshape(1, tc.dims[1], tc.dims[2])).cast(tc.dtype_out).sum(axis=2)
      else:
        r = a @ b
      realized_ast, _ = helper_realized_ast(r)
      k = Linearizer(realized_ast)
      k.apply_tensor_cores(1)
      k.linearize()
      assert len([uop for uop in k.uops if uop.uop == UOps.WMMA]) == 1, "tensor core not triggered"
      np_c = np_a @ np_b
      np.testing.assert_allclose(np_c, r.numpy(), atol=5e-3, rtol=1e-4)

  def test_limit_dims_to_max_5d_global(self):
    t = Tensor.rand(3, 4, 5, 6, 7).pad(((1, 1), (1, 1), (1, 1), (1, 1), (1, 1))) + 1
    sched = [si for si in t.lazydata.schedule() if si.ast.op not in LoadOps]
    assert len(sched) == 1
    lin = Linearizer(sched[0].ast)
    assert lin.full_shape[:lin.global_dims] == (5, 6, 7, 8, 9)
    lin.limit_dims_to_max(global_max=[16, 16, 16], local_max=[16, 16, 16])

def helper_realized_ast(r:Tensor):
  s = r.lazydata.schedule()
  run_schedule(s[:-1])  # run all kernels except the last one
  # now all input LazyBuffers in s[-1] should be realized
  # allocate an output buffer
  output_buffer = Device[s[-1].out.device].buffer(prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args())
  return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]

class TestFloat4(unittest.TestCase):
  def setUp(self):
    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.supports_float4:
      self.skipTest("Device does not support float4")

  @staticmethod
  def count_float4(k):
    return (len([uop for uop in k.uops if uop.uop == UOps.LOAD and uop.dtype == dtypes._float4]),
            len([uop for uop in k.uops if uop.uop == UOps.STORE and len(uop.vin) == 3 and uop.vin[2].dtype == dtypes._float4]))
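
  # count_float4 returns (number of float4 LOADs, number of float4 STOREs);
  # a STORE uop carries the value being stored in vin[2], which is why the
  # store check looks at that dtype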

  # TODO: express the opts below as auto opts

  def test_float4_basic(self):
    a = Tensor.rand(2, 8).realize()
    b = Tensor.rand(2, 8).realize()
    c = a + b

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.hand_coded_optimizations()
    k.linearize()

    assert TestFloat4.count_float4(k) == (2, 1)

  def test_float4_multidim(self):
    a = Tensor.rand(2, 8).realize()
    b = Tensor.rand(2, 8).realize()
    c = a + b

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.shift_to(0, 4)  # float4 dimension
    k.shift_to(0, 2, insert_before=k.shape_len-1)
    k.upcast()
    k.upcast()
    k.local_dims += 1
    k.linearize()

    assert TestFloat4.count_float4(k) == (4, 2)

  def test_float4_unaligned_load(self):
    a = Tensor.rand(9).realize().shrink(((1, 9),))
    b = Tensor.rand(9).realize().shrink(((1, 9),))
    c = a + b

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.hand_coded_optimizations()  # implicitly triggers the float4 dim
    k.linearize()

    assert TestFloat4.count_float4(k) == (0, 1)

  def test_float4_multidim_unaligned_load(self):
    a = Tensor.rand(2, 9).realize().shrink(((0, 2), (1, 9),))
    b = Tensor.rand(2, 9).realize().shrink(((0, 2), (1, 9),))
    c = a + b

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.shift_to(len(k.full_unupcasted_shape)-1, 4)  # manually trigger the float4 dim
    k.upcast()
    k.shift_to(len(k.full_unupcasted_shape)-1, 2, insert_before=k.shape_len-1)
    k.upcast()
    k.local_dims += 1
    k.linearize()

    assert TestFloat4.count_float4(k) == (0, 2)

  def test_float4_sometimes_unaligned(self):
    a = Tensor.rand(1, 1, 8).realize()
    b = Tensor.rand(1, 1, 5).realize().shrink(((0, 1), (0, 1), (1, 5)))
    c = a.conv2d(b)
    # only the first and last conv dot products are aligned in a, and b is never aligned, so no
    # float4 should be emitted (the reduce axis of size 4 is the float4 axis here)

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.upcast()
    k.linearize()

    assert TestFloat4.count_float4(k) == (0, 0)

  def test_float4_multidim_sometimes_unaligned(self):
    a = Tensor.rand(1, 1, 7).realize()
    b = Tensor.rand(1, 1, 5).realize().shrink(((0, 1), (0, 1), (1, 5)))
    c = a.conv2d(b)
    # the first conv dot product is aligned in a. If we upcast the output and reduce
    # dimension, then we could do float4 for only that one set of loads, but we currently
    # don't.

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.upcast()
    k.upcast()
    k.linearize()

    assert TestFloat4.count_float4(k) == (0, 1)

  def test_float4_noncontiguous(self):
    a = Tensor.rand(4, 2).realize()
    b = Tensor.rand(4, 2).realize()
    c = a + b

    # we will upcast the top axis of size 4. it should not be coalesced into float4,
    # since the top axis is not contiguous.

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.shift_to(0, 4, top=True)  # top axes are float4 axes
    k.upcast()
    k.linearize()

    assert TestFloat4.count_float4(k) == (0, 0)

  def test_float4_expand(self):
    a = Tensor.rand(9).realize().shrink(((1, 9),))
    b = Tensor.rand(2).realize().reshape((2, 1)).expand((2,4)).reshape((8,))
    c = a + b

    # the expanded load of b and the unaligned load of a should not be coalesced
    # into float4; the contiguous store can be

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.shift_to(0, 4)  # float4 axis
    k.upcast()
    k.linearize()

    assert TestFloat4.count_float4(k) == (0, 1)

  def test_float4_heterogeneous(self):
    a = Tensor.rand(8).realize()
    b = Tensor.rand(9).realize().shrink(((1, 9),))
    c = a + b

    # should float4 the aligned load of a but not the unaligned load of b

    s = c.lazydata.schedule()[0]
    k = Linearizer(s.ast)
    k.shift_to(0, 4)  # float4 axis
    k.upcast()
    k.linearize()

    assert TestFloat4.count_float4(k) == (1, 1)

class TestHandCodedOpts(unittest.TestCase):
  def setUp(self):
    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Device does not use linearizer")

  def test_masked_upcast(self):
    layer_1 = Tensor.cat(*[Tensor.rand(5) for _ in range(4)])
    layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.rand(6, 20))

    s = layer_2.lazydata.schedule()[-1]
    k = Linearizer(s.ast)
    k.hand_coded_optimizations()
    assert len(k.bufs) == 6  # make sure all ops are done in one kernel
    # masked upcast should upcast the masked axis of size 7
    # masked upcast should not upcast the large (20) last axis
    # float4/other hcopt shouldn't upcast the last axis, since we already have 7 upcast, and the last axis is not very contiguous
    assert k.upcasted == 1 and k.full_shape[-1] == 7

  def test_masked_upcast_wino(self):
    monster = Tensor.stack([Tensor.stack([Tensor.rand(16) for _ in range(6)]) for _ in range(6)])

    s = monster.lazydata.schedule()[-1]
    k = Linearizer(s.ast)
    k.hand_coded_optimizations()
    assert len(k.bufs) == 37  # make sure all ops are done in one kernel
    # should upcast the two Tensor.stacks
    assert k.upcasted >= 2 and k.full_shape[k.shape_len-k.upcasted:k.shape_len].count(6) == 2

  def test_masked_upcast_wino_full(self):
    old_wino = Tensor.wino
    Tensor.wino = True
    x,w = Tensor.rand(1,4,9,9, requires_grad=True).realize(), Tensor.rand(4,4,3,3, requires_grad=True).realize()
    out = Tensor.conv2d(x,w, padding=1)
    upcasts = []
    # collect upcasts of tile transform kernels
    for i, si in enumerate(out.lazydata.schedule()):
      k = Linearizer(si.ast)
      k.hand_coded_optimizations()
      if k.reduceop is not None: continue  # not a tile transform kernel (there is a gemm reduce kernel)
      if len(k.bufs) < 100: continue  # not a tile transform kernel (there's a permute kernel at the end)
      upcasts.append(tuple(k.full_shape[k.shape_len - k.upcasted:k.shape_len]))
    assert len(upcasts) == 3  # 3 transformation matrices
    assert upcasts.count((6, 6)) == 2 and upcasts.count((4, 4)) == 1

    out.mean().backward()
    for si in x.grad.lazydata.schedule() + w.grad.lazydata.schedule():
      k = Linearizer(si.ast)
      k.hand_coded_optimizations()
      k.linearize()
      if len(k.bufs) < 20: continue  # not a tile transform kernel
      # heuristic number to make sure that at least some upcasts, but not too many, are being done
      assert 6 <= prod(k.full_shape[k.shape_len - k.upcasted:k.shape_len]) <= 49

    Tensor.wino = old_wino

  def test_masked_upcast_many(self):
    layer_1 = Tensor.cat(Tensor.rand(3, 4), Tensor.rand(4, 4))
    layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.rand(6, 7, 4))
    layer_3 = Tensor.cat(layer_2.unsqueeze(0), Tensor.rand(6, 7, 7, 4))

    s = layer_3.lazydata.schedule()[-1]
    k = Linearizer(s.ast)
    k.hand_coded_optimizations()
    assert len(k.bufs) == 5  # make sure all ops are done in one kernel
    # check that we don't do too many upcasts
    assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49
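
# each inner list passed to helper_linearizer_opt below is one sequence of
# transformations to try, e.g. [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2)]
# makes axis 0 a local dimension of size 2 and then groups the reduce by 2;
# the helper checks every variant against the unoptimized baseline output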

def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
  wanna_output = None
  realized_ast, real_bufs = helper_realized_ast(r)

  def check_opt(opts, create_k, to_prg):
    k = create_k()
    if apply_tc:
      k.apply_tensor_cores(1, opts)
    else:
      for opt in opts:
        k.apply_opt(opt)
    prg = to_prg(k)
    real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np))  # zero the output buffer to check that all values get filled
    prg.exec(real_bufs, force_wait=True)
    np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)

  # Get the baseline, which is not optimized at all.
  k = Linearizer(realized_ast)
  prg = Device[Device.DEFAULT].to_program(k)
  prg.exec(real_bufs, force_wait=True)
  wanna_output = real_bufs[0].toCPU().copy()

  # Check the correctness of the hand-coded optimizations.
  k = Linearizer(realized_ast)
  k.hand_coded_optimizations()
  prg = Device[Device.DEFAULT].to_program(k)
  real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np))  # zero the output buffer to check that all values get filled
  prg.exec(real_bufs, force_wait=True)
  np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
  for x in opts:  # check custom transformations if any
    check_opt(x, lambda: Linearizer(realized_ast), Device[Device.DEFAULT].to_program)

class TestLinearizerOpts(unittest.TestCase):
  def test_local_and_grouped_reduce(self):
    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
      self.skipTest("Only Compiled uses linearizer with locals and shared")

    N = 128
    Tensor.manual_seed(1882)
    a = Tensor.rand(4, 4, N, N)
    b = Tensor.rand(4, 4, N)
    r = (b.sqrt() + ((a+1).sum(axis=3).exp()))
    helper_linearizer_opt(r, [
      [Opt(OptOps.LOCAL, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 8)],
      [Opt(OptOps.LOCAL, 0, 16)],  # checking how it works with locals
      [Opt(OptOps.GROUPTOP, 0, 2)],
      [Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.GROUPTOP, 0, 64)],  # checking how it works with grouped reduce
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.GROUPTOP, 0, 16)],
      [Opt(OptOps.LOCAL, 0, 32), Opt(OptOps.GROUPTOP, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 64)],  # checking how it works with locals + grouped reduce
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UNROLL, 1, 4)],  # checking how it works with locals + grouped reduce + upcasts
    ])

  def test_upcasts(self):
    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Only Compiled uses linearizer")

    N = 16
    Tensor.manual_seed(1772)
    a = Tensor.rand(N, N)
    b = Tensor.rand(N, N)
    r = (a+b).sqrt() * ((a+1).exp())
    helper_linearizer_opt(r, [
      [Opt(OptOps.UPCAST, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 4)],
      [Opt(OptOps.UPCAST, 0, 8)],  # checking how it works with upcasts
    ])

  def test_full_upcast(self):
    if not isinstance(Device[Device.DEFAULT], Compiled):
      self.skipTest("Only Compiled uses linearizer")

    Tensor.manual_seed(1772)
    a = Tensor.rand(4)
    b = Tensor.rand(4)
    r = (a+b).sqrt() * ((a+1).exp())
    helper_linearizer_opt(r, [
      [Opt(OptOps.UPCAST, 0, 4)],  # checking how it works with upcasts
    ])

  def test_matmul(self):
    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
      self.skipTest("Only Compiled uses linearizer with locals and shared")

    N = 128
    Tensor.manual_seed(1552)
    a = Tensor.rand(N, N)
    b = Tensor.rand(N, N)
    r = a@b
    helper_linearizer_opt(r, [
      [Opt(OptOps.UPCAST, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)],  # checking how it works with upcasts
      [Opt(OptOps.LOCAL, 0, 2)],
      [Opt(OptOps.LOCAL, 1, 32)],
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4)],
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 32)],
      [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.LOCAL, 1, 8)],  # checking how it works with locals
      [Opt(OptOps.GROUPTOP, 0, 2)],
      [Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.GROUPTOP, 0, 32), Opt(OptOps.UNROLL, 0, 4)],  # checking how it works with grouped_reduce
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 4)],  # checking how it works with local + grouped_reduce
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 2)],  # checking all together
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 8)],  # full global upcast + local
    ])

  def test_double_reduce(self):
    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local or not Device[Device.DEFAULT].linearizer_opts.has_shared:
      self.skipTest("Only Compiled uses linearizer with locals and shared")

    N = 128
    Tensor.manual_seed(1552)
    a = Tensor.rand(8, N, 8, N)
    r = a.sum(axis=(1,3))
    helper_linearizer_opt(r, [
      # openCL / GPU=1 is 256 max threads
      [Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)],  # checking how it works with 1 grouped_reduce
      [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
      [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2)],
      [Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 64)],  # checking how it works with 2 grouped_reduces
      [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 0, 4)],
      [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 2, 4)],  # checking how it works with 2 grouped_reduces + upcasts
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4)],
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 1, 4)],  # checking how it works with 2 grouped_reduces + upcasts + locals
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4)],  # checking how it works with 2 grouped_reduces + upcasts + locals
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 0, 2)],  # no globals
    ])

  def test_tensor_core_opts(self):
    if not isinstance(Device[Device.DEFAULT], Compiled) or not Device[Device.DEFAULT].linearizer_opts.has_local:
      self.skipTest("Only Compiled uses linearizer with locals")
    if Device.DEFAULT not in tensor_cores:
      self.skipTest("No tensor cores for device")

    N = 128
    Tensor.manual_seed(1552)
    a = Tensor.rand(N, N)
    b = Tensor.rand(N, N)
    r = a@b
    helper_linearizer_opt(r, [
      [Opt(OptOps.UPCAST, 0, 4)],
      [Opt(OptOps.UPCAST, 1, 4)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)],  # check upcasts
      [Opt(OptOps.UNROLL, 0, 2)],  # check last unroll
      [Opt(OptOps.LASTLOCAL, 0, 4)],  # check last local
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2)],  # check combo of last unroll and last local
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.LASTLOCAL, 0, 2)],
      # [Opt(OptOps.GROUP, 0, 2)]  # doesn't work because group_for_reduce dims become early locals (conflicting with TC)
    ], apply_tc=True)

if __name__ == '__main__':
  unittest.main()
21
tinygrad_repo/test/test_linearizer_failures.py
Normal file
@@ -0,0 +1,21 @@
import unittest
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.ops import Device

# stuff needed to unpack a kernel
from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer
from tinygrad.helpers import dtypes
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.shape.symbolic import Variable
inf, nan = float('inf'), float('nan')

class TestLinearizerFailures(unittest.TestCase):
  @unittest.skip("this is currently failing")
  def test_failure_1(self):
    ast = LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 16), strides=(16, 1, 0), offset=0, mask=None, contiguous=False),)))),), arg=(32, 16, 1)), LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=2, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(0, 1, 0), offset=0, mask=None, contiguous=False),))))), arg=None), LazyOp(op=BufferOps.MEM, src=(), arg=MemBuffer(idx=1, dtype=dtypes.float, st=ShapeTracker(views=(View(shape=(32, 16, 1), strides=(16, 1, 0), offset=0, mask=None, contiguous=True),))))), arg=None)
    lin = Linearizer(ast)
    prg = Device[Device.DEFAULT].to_program(lin)

if __name__ == '__main__':
  unittest.main()
102
tinygrad_repo/test/test_net_speed.py
Normal file
@@ -0,0 +1,102 @@
#!/usr/bin/env python
import time
import cProfile
import pstats
import unittest
import torch
from tinygrad.tensor import Tensor, Device
import pytest

pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

def start_profile():
  pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
  pr.enable()
  return pr

def stop_profile(pr, sort='cumtime', frac=0.2):
  pr.disable()
  ps = pstats.Stats(pr)
  ps.strip_dirs()
  ps.sort_stats(sort)
  ps.print_stats(frac)
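
# typical usage of the two profiling helpers above:
#   pr = start_profile()
#   ... code being measured ...
#   stop_profile(pr, sort='time')  # prints the top 20% of entries by own time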

class TestConvSpeed(unittest.TestCase):

  def test_mnist(self):
    # https://keras.io/examples/vision/mnist_convnet/
    conv = 3
    inter_chan, out_chan = 32, 64

    # ****** torch baseline *******

    torch.backends.mkldnn.enabled = False

    c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
    c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
    l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)

    c2d = torch.nn.functional.conv2d
    mp = torch.nn.MaxPool2d((2,2))
    lsm = torch.nn.LogSoftmax(dim=1)

    cnt = 5
    fpt, bpt = 0.0, 0.0
    for i in range(cnt):
      et0 = time.time()
      x = torch.randn(128, 1, 28, 28, requires_grad=True)
      x = mp(c2d(x,c1).relu())
      x = mp(c2d(x,c2).relu())
      x = x.reshape(x.shape[0], -1)
      out = lsm(x.matmul(l1))
      out = out.mean()
      et1 = time.time()
      out.backward()
      et2 = time.time()
      fpt += (et1-et0)
      bpt += (et2-et1)

    fpt_baseline = (fpt*1000/cnt)
    bpt_baseline = (bpt*1000/cnt)
    print("torch forward pass: %.3f ms" % fpt_baseline)
    print("torch backward pass: %.3f ms" % bpt_baseline)

    # ****** tinygrad compare *******

    c1 = Tensor(c1.detach().numpy(), requires_grad=True)
    c2 = Tensor(c2.detach().numpy(), requires_grad=True)
    l1 = Tensor(l1.detach().numpy(), requires_grad=True)

    cnt = 5
    fpt, bpt = 0.0, 0.0
    for i in range(1+cnt):
      et0 = time.time()
      x = Tensor.randn(128, 1, 28, 28)
      x = x.conv2d(c1).relu().avg_pool2d()
      x = x.conv2d(c2).relu().max_pool2d()
      x = x.reshape(shape=(x.shape[0], -1))
      out = x.dot(l1).log_softmax()
      out = out.mean()
      out.realize()
      et1 = time.time()
      out.backward()
      [x.grad.realize() for x in [c1, c2, l1]]
      et2 = time.time()
      if i == 0:
        pr = start_profile()
      else:
        fpt += (et1-et0)
        bpt += (et2-et1)

    stop_profile(pr, sort='time')
    fpt = (fpt*1000/cnt)
    bpt = (bpt*1000/cnt)
    print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
    print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))

if __name__ == '__main__':
  unittest.main()
339
tinygrad_repo/test/test_nn.py
Normal file
@@ -0,0 +1,339 @@
#!/usr/bin/env python
import unittest
import numpy as np
from extra.utils import WINDOWS
from tinygrad.helpers import CI
from tinygrad.jit import TinyJit
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import BatchNorm2d, Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, GroupNorm, LayerNorm, LayerNorm2d, Embedding, InstanceNorm
import torch
import pytest

pytestmark = [pytest.mark.exclude_cuda]

class TestNN(unittest.TestCase):
  def test_sparse_cat_cross_entropy(self):
    input = torch.randn(3, 5)
    target = torch.empty(3, dtype=torch.long).random_(5)
    loss_fun = torch.nn.CrossEntropyLoss(reduction='mean')
    loss = loss_fun(input, target)

    input_tiny = Tensor(input.detach().numpy())
    target_tiny = Tensor(target.detach().numpy())
    loss_tiny = input_tiny.sparse_categorical_crossentropy(target_tiny)

    np.testing.assert_allclose(loss_tiny.numpy(), loss.detach().numpy(), atol=1e-5, rtol=1e-6)

  def test_batchnorm2d(self, training=False):
    szs = [4, 8, 16, 32]
    for sz in szs:
      # create in tinygrad
      Tensor.training = training
      bn = BatchNorm2d(sz, eps=1e-5, track_running_stats=training)
      bn.weight = Tensor.randn(sz)
      bn.bias = Tensor.randn(sz)
      bn.running_mean = Tensor.randn(sz)
      bn.running_var = Tensor.randn(sz)
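      # variance must be non-negative, so clamp the random init at zero before comparing with torch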
      bn.running_var.numpy()[bn.running_var.numpy() < 0] = 0

      # create in torch
      with torch.no_grad():
        tbn = torch.nn.BatchNorm2d(sz).eval()
        tbn.training = training
        tbn.weight[:] = torch.tensor(bn.weight.numpy())
        tbn.bias[:] = torch.tensor(bn.bias.numpy())
        tbn.running_mean[:] = torch.tensor(bn.running_mean.numpy())
        tbn.running_var[:] = torch.tensor(bn.running_var.numpy())

      np.testing.assert_allclose(bn.running_mean.numpy(), tbn.running_mean.detach().numpy(), rtol=1e-5, atol=1e-6)
      np.testing.assert_allclose(bn.running_var.numpy(), tbn.running_var.detach().numpy(), rtol=1e-5, atol=1e-6)

      # trial
      inn = Tensor.randn(2, sz, 3, 3)

      # in tinygrad
      outt = bn(inn)

      # in torch
      toutt = tbn(torch.tensor(inn.numpy()))

      # close
      np.testing.assert_allclose(outt.numpy(), toutt.detach().numpy(), rtol=5e-4, atol=1e-6)

      np.testing.assert_allclose(bn.running_mean.numpy(), tbn.running_mean.detach().numpy(), rtol=1e-5, atol=1e-6)

      np.testing.assert_allclose(bn.running_var.numpy(), tbn.running_var.detach().numpy(), rtol=1e-5, atol=1e-6)

  def test_batchnorm2d_training(self):
    self.test_batchnorm2d(True)

  def test_linear(self):
    def _test_linear(x):

      # create in tinygrad
      model = Linear(in_dim, out_dim)
      z = model(x)

      # create in torch
      with torch.no_grad():
        torch_layer = torch.nn.Linear(in_dim, out_dim).eval()
        torch_layer.weight[:] = torch.tensor(model.weight.numpy(), dtype=torch.float32)
        torch_layer.bias[:] = torch.tensor(model.bias.numpy(), dtype=torch.float32)
        torch_x = torch.tensor(x.numpy(), dtype=torch.float32)
        torch_z = torch_layer(torch_x)

      # test
      np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

    BS, T, in_dim, out_dim = 4, 2, 8, 16
    _test_linear(Tensor.randn(BS, in_dim))
    _test_linear(Tensor.randn(BS, T, in_dim))  # test with more dims

  def test_conv1d(self):
    BS, C1, W = 4, 16, 224//4
    C2, K, S, P = 64, 7, 2, 1

    # create in tinygrad
    layer = Conv1d(C1, C2, kernel_size=K, stride=S, padding=P)

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.Conv1d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.uniform(BS, C1, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

  def test_conv2d(self):
    BS, C1, H, W = 4, 16, 224//4, 224//4
    C2, K, S, P = 64, 7, 2, 1

    # create in tinygrad
    layer = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.Conv2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.uniform(BS, C1, H, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

  @unittest.skipIf(Device.DEFAULT != "TORCH", "Takes too long to compile for Compiled backends")
  def test_conv2d_winograd(self):
    BS, C1, H, W = 2, 8, 16, 16
    C2, K, S, P = 8, 3, 1, 1

    old_wino = Tensor.wino
    Tensor.wino = True

    # create in tinygrad
    layer = Conv2d(C1, C2, kernel_size=K, stride=S, padding=P)
    layer.weight.requires_grad = True
    layer.bias.requires_grad = True

    # create in torch
    torch_layer = torch.nn.Conv2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
    torch_layer.weight = torch.nn.Parameter(torch.tensor(layer.weight.numpy(), dtype=torch.float32))
    torch_layer.bias = torch.nn.Parameter(torch.tensor(layer.bias.numpy(), dtype=torch.float32))

    # test
    x = Tensor.uniform(BS, C1, H, W, requires_grad=True)
    z = layer(x)
    torch_x = torch.tensor(x.numpy(), requires_grad=True)
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

    m = z.mean()
    m.backward()
    gw = layer.weight.grad.realize()
    gb = layer.bias.grad.realize()
    gx = x.grad.realize()

    torch_z.mean().backward()
    np.testing.assert_allclose(gw.numpy(), torch_layer.weight.grad.numpy(), atol=5e-4, rtol=1e-5)
    np.testing.assert_allclose(gb.numpy(), torch_layer.bias.grad.numpy(), atol=5e-4, rtol=1e-5)
    np.testing.assert_allclose(gx.numpy(), torch_x.grad.numpy(), atol=5e-4, rtol=1e-5)

    Tensor.wino = old_wino

  @unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
  def test_conv_transpose1d(self):
    BS, C1, W = 4, 16, 224//4
    C2, K, S, P = 64, 7, 2, 1

    # create in tinygrad
    layer = ConvTranspose1d(C1, C2, kernel_size=K, stride=S, padding=P)

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.ConvTranspose1d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.uniform(BS, C1, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

  @unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
  def test_conv_transpose2d(self):
    BS, C1, H, W = 4, 16, 224//4, 224//4
    C2, K, S, P = 64, 7, 2, 1

    # create in tinygrad
    layer = ConvTranspose2d(C1, C2, kernel_size=K, stride=S, padding=P)

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.ConvTranspose2d(C1, C2, kernel_size=K, stride=S, padding=P).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.uniform(BS, C1, H, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

  def test_groupnorm(self):
    BS, H, W, C, G = 20, 10, 10, 6, 3

    # create in tinygrad
    layer = GroupNorm(G, C)

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.GroupNorm(G, C).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.randn(BS, C, H, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

  def test_layernorm(self):
    N, C, H, W = 20, 5, 10, 10

    # create in tinygrad
    layer = LayerNorm([H, W])

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.LayerNorm([H, W]).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.randn(N, C, H, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

  def test_layernorm_2d(self):
    N, C, H, W = 20, 5, 10, 10

    # create in tinygrad
    layer = LayerNorm2d(C)

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.LayerNorm([C]).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.randn(N, C, H, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x.permute(0,2,3,1)).permute(0,3,1,2)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

  def test_instancenorm_2d(self):
    N, C, H, W = 20, 5, 10, 10

    # create in tinygrad
    layer = InstanceNorm(C)

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.InstanceNorm2d(C, affine=True).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.randn(N, C, H, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

  def test_instancenorm_3d(self):
    N, C, D, H, W = 20, 5, 3, 10, 10

    # create in tinygrad
    layer = InstanceNorm(C)

    # create in torch
    with torch.no_grad():
      torch_layer = torch.nn.InstanceNorm3d(C, affine=True).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)
      torch_layer.bias[:] = torch.tensor(layer.bias.numpy(), dtype=torch.float32)

    # test
    x = Tensor.randn(N, C, D, H, W)
    z = layer(x)
    torch_x = torch.tensor(x.numpy())
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-3, rtol=5e-3)

  def test_embedding(self):
    B, T, C, VS = 4, 10, 20, 28

    # create in tinygrad
    layer = Embedding(VS, C)

    with torch.no_grad():
      torch_layer = torch.nn.Embedding(VS, C).eval()
      torch_layer.weight[:] = torch.tensor(layer.weight.numpy(), dtype=torch.float32)

    # test
    x = Tensor(np.random.randint(0, VS, (B, T)).astype(np.float32))
    z = layer(x)
    torch_x = torch.tensor(x.numpy().astype(np.int32))
    torch_z = torch_layer(torch_x)
    np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)

    # test with jit enabled
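    # (TinyJit captures the kernels on its first calls and replays them after; three rounds exercise both paths)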
    @TinyJit
    def layer_jit(x):
      return layer(x).realize()

    for _ in range(3):
      x = Tensor(np.random.randint(0, VS, (B, T)).astype(np.float32))
      z = layer_jit(x)
      torch_x = torch.tensor(x.numpy().astype(np.int32))
      torch_z = torch_layer(torch_x)
      np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)


if __name__ == '__main__':
  unittest.main()
1245
tinygrad_repo/test/test_ops.py
Normal file
File diff suppressed because it is too large
98
tinygrad_repo/test/test_optim.py
Normal file
@@ -0,0 +1,98 @@
import numpy as np
import torch
import unittest
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import Adam, SGD, AdamW
import pytest

pytestmark = pytest.mark.exclude_cuda

np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
W_init = np.random.randn(4,4).astype(np.float32)
m_init = np.random.randn(1,4).astype(np.float32)

class TinyNet:
  def __init__(self, tensor):
    self.x = tensor(x_init.copy(), requires_grad=True)
    self.W = tensor(W_init.copy(), requires_grad=True)
    self.m = tensor(m_init.copy())

  def forward(self):
    out = self.x.matmul(self.W).relu()
    # print(out.detach().numpy())
    out = out.log_softmax(1)
    out = out.mul(self.m).add(self.m).sum()
    return out

def step(tensor, optim, steps=1, kwargs={}):
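  # run a few updates of the same tiny net and return the final weights for cross-framework comparison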
  net = TinyNet(tensor)
  optim = optim([net.x, net.W], **kwargs)
  for _ in range(steps):
    out = net.forward()
    optim.zero_grad()
    out.backward()
    optim.step()
  return net.x.detach().numpy(), net.W.detach().numpy()

class TestOptim(unittest.TestCase):

  def _test_optim(self, tinygrad_optim, torch_optim, steps, opts, atol, rtol):
    for x,y in zip(step(Tensor, tinygrad_optim, steps, kwargs=opts),
                   step(torch.tensor, torch_optim, steps, kwargs=opts)):
      np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)

  def _test_sgd(self, steps, opts, atol, rtol): self._test_optim(SGD, torch.optim.SGD, steps, opts, atol, rtol)
  def _test_adam(self, steps, opts, atol, rtol): self._test_optim(Adam, torch.optim.Adam, steps, opts, atol, rtol)
  def _test_adamw(self, steps, opts, atol, rtol): self._test_optim(AdamW, torch.optim.AdamW, steps, opts, atol, rtol)

  def test_sgd(self): self._test_sgd(1, {'lr': 0.001}, 1e-6, 0)
  def test_sgd_high_lr(self): self._test_sgd(1, {'lr': 10}, 1e-6, 1e-5)
  def test_sgd_wd(self): self._test_sgd(1, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
  def test_sgd_high_lr_wd(self): self._test_sgd(1, {'lr': 10, 'weight_decay': 0.1}, 1e-6, 1e-5)

  def test_multistep_sgd(self): self._test_sgd(10, {'lr': 0.001}, 1e-6, 0)
  def test_multistep_sgd_high_lr(self): self._test_sgd(10, {'lr': 10}, 1e-6, 3e-4)
  def test_multistep_sgd_wd(self): self._test_sgd(10, {'lr': 0.001, 'weight_decay': 0.1}, 1e-6, 0)
  def test_multistep_sgd_high_lr_wd(self): self._test_sgd(10, {'lr': 9, 'weight_decay': 0.1}, 1e-6, 3e-4)

  def test_multistep_sgd_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9}, 1e-6, 0)
  def test_multistep_sgd_high_lr_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9}, 1e-5, 3e-4)
  def test_multistep_sgd_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-6, 0)
  def test_multistep_sgd_high_lr_momentum_wd(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'weight_decay': 0.1}, 1e-5, 3e-4)

  def test_multistep_sgd_nesterov_momentum(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True}, 1e-5, 0)
  def test_multistep_sgd_high_lr_nesterov_momentum(self): self._test_sgd(10, {'lr': 10, 'momentum': 0.9, 'nesterov': True}, 1e-5, 3e-4)
  def test_multistep_sgd_nesterov_momentum_wd(self): self._test_sgd(10, {'lr': 0.001, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 0)
  def test_multistep_sgd_high_lr_nesterov_momentum_wd(self): self._test_sgd(10, {'lr': 9, 'momentum': 0.9, 'nesterov': True, 'weight_decay': 0.1}, 1e-5, 3e-4)

  def test_adam(self): self._test_adam(1, {'lr': 0.001}, 1e-5, 0)
  def test_adam_high_lr(self): self._test_adam(1, {'lr': 10}, 1e-4, 1e-4)
  def test_adamw(self): self._test_adamw(1, {'lr': 0.001}, 1e-5, 0)
  def test_adamw_high_lr(self): self._test_adamw(1, {'lr': 10}, 1e-4, 1e-4)

  def test_multistep_adam(self): self._test_adam(10, {'lr': 0.001}, 1e-5, 0)
  def test_multistep_adam_high_lr(self): self._test_adam(10, {'lr': 10}, 2e-4, 5e-4)

  def test_multistep_adamw(self): self._test_adamw(10, {'lr': 0.001}, 1e-5, 0)
  def test_multistep_adamw_high_lr(self): self._test_adamw(10, {'lr': 10}, 5e-4, 2e-3)

  def test_duped_weights(self):
    for Opt in [Adam, AdamW, SGD]:
      losses = []
      for i in range(2):
        w = Tensor(x_init.copy())
        opt = Opt([w], lr=0.1) if i == 0 else Opt([w, w], lr=0.1)

        loss = None
        for _ in range(3):
          loss = w.sum()
          opt.zero_grad()
          loss.backward()
          opt.step()
        losses.append(loss.numpy())

      np.testing.assert_allclose(losses[0], losses[1], atol=1e-4, rtol=0)

if __name__ == '__main__':
  unittest.main()
115
tinygrad_repo/test/test_randomness.py
Normal file
@@ -0,0 +1,115 @@
import math
import unittest
import numpy as np
import torch
from tinygrad.tensor import Tensor
import tinygrad.nn as nn
import pytest
from tinygrad.helpers import dtypes
from functools import partial

pytestmark = pytest.mark.webgpu

# https://gist.github.com/devries/11405101
def ksprob(a):
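  # Kolmogorov-Smirnov survival function: Q_KS(a) = 2 * sum_{j>=1} (-1)^(j-1) * exp(-2 j^2 a^2),
  # summed until the terms become negligible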
  fac, total, termbf = 2.0, 0.0, 0.0
  a2 = -2.0 * a * a
  for j in range(1, 101):
    term = fac * math.exp(a2 * j * j)
    total += term
    if math.fabs(term) <= 0.001 * termbf or math.fabs(term) <= 1e-8 * total:
      return total
    fac = -fac
    termbf = math.fabs(term)
  return 1.0

def kstest(l1, l2):
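  # two-sample KS test: D = max |F1(x) - F2(x)| over both empirical CDFs, converted
  # to a p-value with ksprob using the effective sample size n1*n2/(n1+n2)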
  n1, n2 = len(l1), len(l2)
  l1.sort()
  l2.sort()
  j1, j2, d, fn1, fn2 = 0, 0, 0.0, 0.0, 0.0
  while j1 < n1 and j2 < n2:
    d1, d2 = l1[j1], l2[j2]
    if d1 <= d2:
      fn1 = (float(j1) + 1.0) / float(n1)
      j1 += 1
    if d2 <= d1:
      fn2 = (float(j2) + 1.0) / float(n2)
      j2 += 1
    dtemp = math.fabs(fn2 - fn1)
    if dtemp > d:
      d = dtemp
  ne = float(n1 * n2) / float(n1 + n2)
  nesq = math.sqrt(ne)
  prob = ksprob((nesq + 0.12 + 0.11 / nesq) * d)
  return prob

def equal_distribution(tiny_func, torch_func=None, numpy_func=None, shape=(20, 23), alpha=0.05):
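  # null hypothesis: the tinygrad sampler draws from the same distribution as the references;
  # accept when the KS p-value is at least alpha against every reference provided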
  Tensor.manual_seed(1337)
  torch.manual_seed(1337)
  np.random.seed(1337)
  assert not (torch_func is None and numpy_func is None), "no function to compare with"
  x = tiny_func(*shape).numpy().flatten()
  if numpy_func is not None: y = numpy_func(shape).flatten()
  if torch_func is not None: z = torch_func(shape).numpy().flatten()
  return (numpy_func is None or kstest(x, y) >= alpha) and (torch_func is None or kstest(x, z) >= alpha)

def normal_test(func, shape=(20, 23), alpha=0.05): return equal_distribution(func, numpy_func=lambda x: np.random.randn(*x), shape=shape, alpha=alpha)

class TestRandomness(unittest.TestCase):
  def test_rand(self):
    self.assertFalse(normal_test(Tensor.rand))
    self.assertTrue(equal_distribution(Tensor.rand, torch.rand, lambda x: np.random.rand(*x)))

  def test_randn(self):
    self.assertTrue(normal_test(Tensor.randn))
    self.assertTrue(equal_distribution(Tensor.randn, torch.randn, lambda x: np.random.randn(*x)))

  def test_normal(self):
    self.assertTrue(normal_test(Tensor.normal))
    self.assertTrue(equal_distribution(Tensor.normal, lambda x: torch.nn.init.normal_(torch.empty(x), mean=0, std=1), lambda x: np.random.normal(loc=0, scale=1, size=x)))

  def test_uniform(self):
    self.assertFalse(normal_test(Tensor.uniform))
    self.assertTrue(equal_distribution(Tensor.uniform, lambda x: torch.nn.init.uniform_(torch.empty(x)), lambda x: np.random.uniform(size=x)))
    self.assertTrue(equal_distribution(partial(Tensor.uniform, low=-100, high=100, dtype=dtypes.int32), numpy_func=lambda x: np.random.randint(low=-100, high=100, size=x)))

  def test_scaled_uniform(self):
    self.assertFalse(normal_test(Tensor.scaled_uniform))
    self.assertTrue(equal_distribution(Tensor.scaled_uniform, lambda x: torch.nn.init.uniform_(torch.empty(x), a=-1, b=1) / math.sqrt(math.prod(x)), lambda x: np.random.uniform(-1, 1, size=x) / math.sqrt(math.prod(x))))

  def test_glorot_uniform(self):
    self.assertFalse(normal_test(Tensor.glorot_uniform))
    self.assertTrue(equal_distribution(Tensor.glorot_uniform, lambda x: torch.nn.init.xavier_uniform_(torch.empty(x)), lambda x: np.random.uniform(-1, 1, size=x) * math.sqrt(6 / (x[0] + math.prod(x[1:])))))

  def test_kaiming_uniform(self):
    Tensor.manual_seed(1337)
    torch.manual_seed(1337)
    np.random.seed(1337)
    for shape in [(128, 64, 3, 3), (20, 24)]:
      self.assertTrue(equal_distribution(Tensor.kaiming_uniform, lambda x: torch.nn.init.kaiming_uniform_(torch.empty(x)), shape=shape))

  def test_kaiming_normal(self):
    Tensor.manual_seed(1337)
    torch.manual_seed(1337)
    np.random.seed(1337)
    for shape in [(128, 64, 3, 3), (20, 24)]:
      self.assertTrue(equal_distribution(Tensor.kaiming_normal, lambda x: torch.nn.init.kaiming_normal_(torch.empty(x)), shape=shape))

  def test_conv2d_init(self):
    params = (128, 256, (3,3))
    assert equal_distribution(lambda *_: nn.Conv2d(*params).weight, lambda _: torch.nn.Conv2d(*params).weight.detach())
    assert equal_distribution(lambda *_: nn.Conv2d(*params).bias, lambda _: torch.nn.Conv2d(*params).bias.detach())

  def test_linear_init(self):
    params = (64, 64)
    assert equal_distribution(lambda *_: nn.Linear(*params).weight, lambda _: torch.nn.Linear(*params).weight.detach())
    assert equal_distribution(lambda *_: nn.Linear(*params).bias, lambda _: torch.nn.Linear(*params).bias.detach())

  def test_bn_init(self):
    params = (64,)
    assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).weight, lambda _: torch.nn.BatchNorm2d(*params).weight.detach())
    assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).bias, lambda _: torch.nn.BatchNorm2d(*params).bias.detach())

if __name__ == "__main__":
  unittest.main()
335
tinygrad_repo/test/test_schedule.py
Normal file
@@ -0,0 +1,335 @@
# this will be the new test_ops for the next level
# schedule confirms the right things are capable of fusing
# NOTE: this has overlap with external_test_opt.py

import unittest
from typing import List, Optional
from tinygrad.tensor import Tensor
from tinygrad.ops import LoadOps, Device, Compiled
from tinygrad.helpers import DEBUG, dtypes
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.graph import log_schedule_item, print_tree
from tinygrad import nn

def check_schedule(t:Tensor, allowed:int, to_prerealize:Optional[List[Tensor]]=None, filter_loadops=True):
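  # log any prerealized tensors first so they don't count, then assert the remaining
  # schedule is exactly `allowed` kernels and that each one linearizes cleanly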
  seen = set()
  if to_prerealize:
    for pre in to_prerealize:
      for s in pre.lazydata.schedule(seen.copy()):
        log_schedule_item(s)
        seen.add(s.out)
  sched = t.lazydata.schedule(seen)
  for s in sched: log_schedule_item(s)
  if filter_loadops: sched = [s for s in sched if s.ast.op not in LoadOps]
  if len(sched) != allowed: print(f"SCHEDULE ISSUE, expecting {allowed} got {len(sched)}")
  if len(sched) != allowed or DEBUG >= 3:
    for i, s in enumerate(sched):
      print("op", i)
      print_tree(s.ast)
  assert len(sched) == allowed
  # test the (non loadops) ops linearize
  for s in sched:
    if s.ast.op in LoadOps: continue
    l = Linearizer(s.ast)
    l.hand_coded_optimizations()
    l.linearize()

class TestSchedule(unittest.TestCase):
  def test_basic_binop_fusion(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = Tensor.empty(10)
    d = a+b+c
    check_schedule(d, 1)

  def test_basic_binop_fusion_deep(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = Tensor.empty(10)
    d = Tensor.empty(10)
    e = a+b+c+d
    check_schedule(e, 1)

  def test_mulacc_fusion(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = (a*b).sum()
    check_schedule(c, 1)

  def test_mulacc_relu_fusion(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = (a*b).sum().relu()
    check_schedule(c, 1)

  def test_binop_reshape_fusion(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = Tensor.empty(5,2)
    d = (a+b).reshape(5,2)+c
    check_schedule(d, 1)

  def test_binop_permute_fusion(self):
    a = Tensor.empty(2,5)
    b = Tensor.empty(2,5)
    c = Tensor.empty(5,2)
    d = (a+b).permute(1,0)+c
    check_schedule(d, 1)

  @unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled) or Device.DEFAULT == "LLVM", "only test for compiled backends")
  def test_constants_are_embedded(self):
    a = Tensor.empty(3,3) * 2
    check_schedule(a, 2, filter_loadops=False)

  def test_binop_elu_fusion(self):
    a = Tensor.empty(10)
    b = a.elu()
    check_schedule(b, 1)

  def test_binop_reshape_reduce_fusion(self):
    a = Tensor.empty(100)
    b = Tensor.empty(100)
    c = (a+b).reshape(10, 10).sum(axis=0, keepdim=True)
    check_schedule(c, 1)

  def test_reduce_reshape_binop_fusion(self):
    a = Tensor.empty(10,10)
    b = Tensor.empty(10)
    c = a.sum(axis=0) + b
    check_schedule(c, 1)

  @unittest.skip("not pushing permutes through reduces")
  def test_reduce_permute_binop_fusion(self):
    a = Tensor.empty(10,10,10)
    b = Tensor.empty(10,10,1)
    c = a.sum(axis=0, keepdim=True).permute(2,1,0) + b
    check_schedule(c, 1)

  def test_binop_early_reshape_reduce_fusion(self):
    a = Tensor.empty(100)
    b = Tensor.empty(100)
    c = Tensor.empty(10,10)
    d = ((a+b).reshape(10,10) + c).sum(axis=0)
    check_schedule(d, 1)

  def test_diamond_folded(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = Tensor.empty(10)
    d = Tensor.empty(10)
    ab = a+b
    e = (ab+c) + (ab+d)
    check_schedule(e, 1)

  def test_cache_binaryop(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = a+b
    d = a+b
    check_schedule(d, 0, [c])

  @unittest.skip("failing in old lazy")
  def test_cache_binaryop_reshaped(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = a+b
    d = a.reshape(10,1)+b.reshape(10,1)
    check_schedule(d, 0, [c])

  def test_cache_binaryop_transpose(self):
    a = Tensor.empty(10,10)
    b = Tensor.empty(10,10)
    c = (a.T*b.T).T #.contiguous()
    d = a*b
    check_schedule(d, 0, [c])

  def test_cache_two_reduceops(self):
    a = Tensor.empty(10)
    b = a.sum()
    c = a.sum()
    bc = b+c
    check_schedule(bc, 1)

  def test_fold_double_unary(self):
    y = Tensor.empty(2)
    out = y.sum(keepdim=True).sqrt().__neg__()
    check_schedule(out, 1)

  #@unittest.skip("may want to reconsider this")
  def test_fold_batchnorm(self):
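    # training batchnorm can't fuse to one kernel: roughly a mean reduce, a variance reduce, and the normalize step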
    with Tensor.train():
      img = Tensor.empty(1,32,4,4)
      bn = nn.BatchNorm2d(32, track_running_stats=False)
      out = bn(img)
      check_schedule(out, 3)

  def test_fold_conv_relu(self):
    c1 = nn.Conv2d(3,16,3)

    # run
    img = Tensor.ones(2,3,64,64)
    out = c1(img).relu()
    check_schedule(out, 1, [c1.weight, c1.bias])

  def test_fold_conv_elu(self):
    c1 = nn.Conv2d(3,16,3)

    # run
    img = Tensor.rand(2,3,64,64)
    out = c1(img).elu()
    check_schedule(out, 1, [c1.weight, c1.bias])

  def test_two_sum(self):
    img = Tensor.empty(64,64)
    x = (img.sum(0) + img.sum(1))
    out = x.relu()
    del x  # is 3 without this
    check_schedule(out, 2)

  @unittest.skip("failing in old lazy")
  def test_push_permute_through_reshape(self):
    a = Tensor.empty(16,16)
    b = Tensor.empty(16,16)
    c = (a+b).reshape(4,4,4,4).permute(2,3,0,1).contiguous()
    check_schedule(c, 1)

  @unittest.skip("failing in old lazy")
  def test_push_permute_through_reshape_alt(self):
    a = Tensor.empty(4,4,4,4)
    b = Tensor.empty(4,4,4,4)
    c = (a+b).reshape(16,16).permute(1,0).contiguous()
    check_schedule(c, 1)

  def test_no_binop_rerun(self):
    a = Tensor.empty(16)
    b = Tensor.empty(16)
    c = a+b
    d = (a+b).reshape(16,1)
    check_schedule(d, 0, [c])

  def test_multi_permute_should_collapse(self):
    a = Tensor.empty(4,4,4,4)
    b = Tensor.empty(16)
    c = a.sum((0,1)).cast(dtypes.float16).permute(1,0).reshape(4,4,1).permute(1,0,2).reshape(16) + b
    check_schedule(c, 1)

  @unittest.skip("failing in old lazy")
  def test_fancy_reshape_fusion(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = a+b
    d = a.reshape(10,1)+b.reshape(10,1)
    out = c.sum() + d.sum()
    check_schedule(out, 1)

  # NOTE: for this to pass, LazyViews must be children of LazyBuffers so the (a+b) runs first
  @unittest.skip("not real world")
  def test_children_dont_push(self):
    a = Tensor.empty(10, 10, 1)
    b = Tensor.empty(10, 10, 1)
    d = (a+b).expand(10, 10, 10)
    e = (a+b).permute(2,1,0)
    f = d+e
    check_schedule(f, 2)

  def test_dont_fuse_binops_with_children(self):
    a = Tensor.empty(10)
    b = Tensor.empty(10)
    c = Tensor.empty(10)
    keep_me = a+b
    e = keep_me.sum()  # give keep_me a child (NOTE: BinaryOps won't be a child since it will instant fuse)
    d = keep_me+c
    check_schedule(d, 2)
    check_schedule(keep_me, 0, [d])

  @unittest.skip("failing in old lazy")
  def test_permute_breaks_fusion(self):
    a = Tensor.empty(10, 10, 10)
    b = Tensor.empty(10, 10)
    c = (a.sum(axis=2) + b).permute(1,0)
    d = c.permute(1,0)
    check_schedule(d, 1)

  def test_some_permute_fusion(self):
    a = Tensor.empty(8192, 16)
    b = Tensor.empty(1, 16)
    d = (a.T + b.expand(8192, 16).T)
    c = a + b.expand(8192, 16)
    e = d.T
    check_schedule(c, 1)
    check_schedule(e, 1)

  # this is the failing case in openpilot...it's very simple like this
  @unittest.skip("failing in old lazy")
  def test_image_conv_fusion(self):
    from tinygrad.features.image import image_conv2d
    w1 = Tensor.empty(16, 16, 1, 1)
    b1 = Tensor.empty(16)
    w2 = Tensor.empty(16, 16, 1, 1)
    b2 = Tensor.empty(16)
    w3 = Tensor.empty(16, 16, 1, 1)
    b3 = Tensor.empty(16)

    x = Tensor.empty(1, 16, 32, 32)
    x = base = image_conv2d(x, w1, b1)
    x = image_conv2d(x, w2, b2) + base
    x = image_conv2d(x, w3, b3)

    # NOOP, 3 convs, contiguous
    check_schedule(x, 5)

  def test_image_conv_fusion_minimal(self):
    b1 = Tensor.empty(16)
    b2 = Tensor.empty(16)
    def p(x): return x.permute(1,0).contiguous().reshape(32,16,1).expand(32,16,16).sum(axis=2).permute(1,0)

    x = Tensor.empty(16, 32)
    x = base = p(x) + b1.reshape(16,1)
    x = p(x)
    x = x + b2.reshape(16,1)
    x = x + base
    del base
    x = p(x)
    check_schedule(x, 4)

  def test_image_conv_fusion_more_minimal(self):
    b1 = Tensor.empty(16)
    def p(x): return x.permute(1,0).contiguous().reshape(32,16,1).expand(32,16,16).sum(axis=2).permute(1,0)

    x = Tensor.empty(16, 32)
    x = base = p(x) + b1.reshape(16,1)
    x = p(x)
    del base
    check_schedule(x, 3)

  def test_resnet_block(self):
    from models.resnet import BasicBlock
    Tensor.training = False
    bb = BasicBlock(64,64)

    x = Tensor.empty(1, 64, 32, 32)
    out = bb(x)
    check_schedule(out, 4)

  def test_contiguous_while_contiguous(self):
    x = Tensor.empty(1, 64, 32, 32)
    out = x.contiguous()
    check_schedule(out, 1, filter_loadops=False)

  def test_contiguous_while_not_contiguous(self):
    x = Tensor.empty(1, 64, 32, 32)
    out = x.permute(0,2,3,1).contiguous()
    check_schedule(out, 2, filter_loadops=False)

  def test_double_from(self):
    x = Tensor([1,2,3,4])
    out = x.to('cpu')
    check_schedule(out, 0, filter_loadops=False)

  def test_pow_const_tensor(self):
    x = Tensor([1,2,3,4])
    out = x ** Tensor(2)
    check_schedule(out, 1)

if __name__ == '__main__':
  unittest.main(verbosity=2)
19
tinygrad_repo/test/test_search.py
Normal file
@@ -0,0 +1,19 @@
import unittest

from tinygrad.codegen.linearizer import Linearizer
from tinygrad.features.search import time_linearizer
from tinygrad.ops import Compiled, Device, LoadOps
from tinygrad.tensor import Tensor

class TestTimeLinearizer(unittest.TestCase):
  def setUp(self) -> None:
    if not isinstance(Device[Device.DEFAULT], Compiled): raise unittest.SkipTest("only test for compiled backends")

  def test_reasonable_time(self):
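    # take the first real (non-LoadOps) kernel of a tiny schedule, allocate matching buffers, and time it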
    si = [si for si in Tensor([1,2,3,4]).add(1).lazydata.schedule() if si.ast.op not in LoadOps][0]
    rawbufs = [Device[Device.DEFAULT].buffer(si.out.st.size(), si.out.dtype)] + [Device[Device.DEFAULT].buffer(x.st.size(), x.dtype) for x in si.inputs]
    tm = time_linearizer(Linearizer(si.ast), rawbufs, allow_test_size=False, cnt=10)
    assert tm > 0 and tm != float('inf')

if __name__ == '__main__':
  unittest.main()
57
tinygrad_repo/test/test_specific_conv.py
Normal file
@@ -0,0 +1,57 @@
import unittest
from tinygrad.tensor import Tensor
from tinygrad.helpers import dtypes
from tinygrad.ops import Device
import pytest
# similar to test/external/external_test_gpu_ast.py, but universal

pytestmark = pytest.mark.exclude_cuda

class TestSpecific(unittest.TestCase):
  # from openpilot

  # 1x1 6 <- 24
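  # (channel counts are multiplied by 4 below because the image layout packs 4 floats per pixel)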
  def test_1x1_6_24(self):
    x = Tensor.randn(1, 24*4, 32, 64)
    w = Tensor.randn(6*4, 24*4, 1, 1)
    x.conv2d(w).permute(0,2,3,1).reshape(32, 384, 4).contiguous().realize()

  def test_vec_mul(self):
    # this forces it to be an image...
    x = Tensor.ones(1, 512, 4).contiguous().reshape(1, 2048)
    w = Tensor.randn(2048, 512)
    (x @ w).reshape(1, 128, 4).contiguous().realize()

  @unittest.skipIf(Device.DEFAULT in ["LLVM", "WEBGPU"], "Broken on LLVM and webgpu")
  def test_big_vec_mul(self):
    # from LLaMA
    # 0 buffer<4096, dtypes.float> [View((1024, 1, 1, 4), (4, 0, 0, 1), 0, None)]
    # 1 buffer<4096, dtypes.float> [View((1024, 1024, 4, 4), (0, 4, 1, 0), 0, None)]
    # 2 buffer<16777216, dtypes.half> [View((1024, 1024, 4, 4), (16384, 4, 1, 4096), 0, None)]
    x = Tensor.randn(4096).realize()
    w = Tensor.randn(4096, 4096, device='cpu').cast(dtypes.float16).to(Device.DEFAULT).realize()
    (x @ w.T).realize()

  # from https://dl.acm.org/doi/pdf/10.1145/3495243.3517020

  # ~260 GFLOPS on Adreno 640, should be 260*(720/890)*(596/710) = 176.5 on downclocked 630
  # we get 170
  def test_1x1_28_28(self):
    x = Tensor.randn(1, 256, 28, 28)
    w = Tensor.randn(256, 256, 1, 1)
    x.conv2d(w).permute(0,2,3,1).reshape(28, 28*256//4, 4).contiguous().realize()

  # 132 GFLOPS on Adreno 640, should be 132*(720/890)*(596/710) = 90 on downclocked 630
  # gets 54 with broken opt, 74 without opt, and 146 if we pad and opt 3!
  def test_3x3_28_28_stride_2(self):
    x = Tensor.randn(1, 288, 36, 36)
    w = Tensor.randn(384, 288, 3, 3)
    x.conv2d(w, stride=2).permute(0,2,3,1).reshape(17, 17*384//4, 4).contiguous().realize()

  def test_3x3_28_28_stride_2_padded(self):
    x = Tensor.randn(1, 288, 36, 36)
    w = Tensor.randn(384, 288, 3, 3)
    x.conv2d(w, stride=2, padding=1).permute(0,2,3,1).reshape(18, 18*384//4, 4).contiguous().realize()

if __name__ == '__main__':
  unittest.main()
288
tinygrad_repo/test/test_speed_v_torch.py
Normal file
@@ -0,0 +1,288 @@
import os
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
import unittest
import torch
torch.set_num_threads(1)
import time
import numpy as np
np.set_printoptions(linewidth=160)
from tinygrad.ops import Device
from tinygrad.helpers import GlobalCounters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d
from tinygrad.helpers import colored, getenv, CI
from tinygrad.jit import TinyJit
import pytest

pytestmark = [pytest.mark.exclude_cuda, pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]

torch_dt = torch.float16 if getenv("HALF", 0) else torch.float32
torch_device = torch.device('mps' if getenv("MPS", 0) else ('cuda' if getenv("TORCHCUDA", 0) else 'cpu'))
if str(torch_device) == "mps":
  import torch.mps
  sync = lambda: torch.mps.synchronize()
elif str(torch_device) == "cuda":
  import torch.cuda
  sync = lambda: torch.cuda.synchronize()
else:
  sync = lambda: None

def colorize_float(x):
  ret = f"{x:7.2f}x"
  if x < 0.75:
    return colored(ret, 'green')
  elif x > 1.15:
    return colored(ret, 'red')
  else:
    return colored(ret, 'yellow')

save_ops, save_mem = 0, 0
CNT = getenv("CNT", 8)
def helper_test_speed(f1, *args):
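  # time f1 over CNT runs: perturb the args each round so op/memory caches can't hide work,
  # sync around the timed call, and keep the fastest run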
  global save_ops, save_mem
  ets = []
  ret = None
  cache_defeat = np.zeros((2048,2048))
  for i in range(CNT):
    del ret

    # operation cache defeats
    args = [(x+1).realize() if isinstance(x, Tensor) else (None if x is None else (x+1)) for x in args]

    # force syncing
    [x.numpy() if isinstance(x, Tensor) or str(torch_device) == "cpu" else x.cpu().numpy() for x in args if x is not None]

    # clear 32MB global memory cache (CPU and global memory only)
    cache_defeat += 1

    # manual pre sync
    if isinstance(args[0], Tensor): Device[args[0].device].synchronize()
    else: sync()

    GlobalCounters.global_ops = 0
    GlobalCounters.global_mem = 0
    st = time.perf_counter()
    ret = f1(*args)
    if isinstance(ret, Tensor): Device[ret.device].synchronize()
    else: sync()
    et = (time.perf_counter() - st) * 1000
    if i >= 1: ets.append(et)
    if GlobalCounters.global_ops:
      save_ops, save_mem = GlobalCounters.global_ops, GlobalCounters.global_mem
  return ret.numpy() if isinstance(ret, Tensor) else ret.cpu().numpy(), np.min(ets)

def helper_test_generic_square(name, N, f1, f2, onearg=False):
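  # run the same NxN op through torch (f1) and tinygrad (f2, wrapped in TinyJit) on identical data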
  torch.manual_seed(0)
  torch_a = (torch.rand(N, N, dtype=torch_dt) - 0.5).to(torch_device)
  torch_b = (torch.rand(N, N, dtype=torch_dt) - 0.5).to(torch_device) if not onearg else None

  tiny_a = Tensor(torch_a.cpu().numpy())
  tiny_b = Tensor(torch_b.cpu().numpy()) if not onearg else None

  helper_test_generic(f"{name:30s} {N:5d}x{N:5d}", f1, (torch_a, torch_b), TinyJit(lambda a,b:f2(a,b).realize()), (tiny_a, tiny_b))

def helper_test_matvec(name, N, M):
  torch.manual_seed(0)
  torch_a = (torch.rand(N, dtype=torch_dt) - 0.5).to(torch_device)
  torch_b = (torch.rand(N, M, dtype=torch_dt) - 0.5).to(torch_device)

  tiny_a = Tensor(torch_a.cpu().numpy())
  tiny_b = Tensor(torch_b.cpu().numpy())

  helper_test_generic(f"{name:30s} {N:5d}x{M:5d}", lambda a,b: a@b, (torch_a, torch_b), TinyJit(lambda a,b:(a@b).realize()), (tiny_a, tiny_b))

prefix = None
def helper_test_generic(name, f1, f1_args, f2, f2_args):
  global prefix
  with torch.no_grad():
    val_torch, et_torch = helper_test_speed(f1, *f1_args)
  val_tinygrad, et_tinygrad = helper_test_speed(f2, *f2_args)

  desc = "faster" if et_torch > et_tinygrad else "slower"
  flops = save_ops*1e-6
  mem = save_mem*1e-6
  print(("\r" if not CI else "")+f"{name:42s} {et_torch:7.2f} ms ({flops/et_torch:8.2f} GFLOPS {mem/et_torch:8.2f} GB/s) in torch, {et_tinygrad:7.2f} ms ({flops/et_tinygrad:8.2f} GFLOPS {mem/et_tinygrad:8.2f} GB/s) in tinygrad, {colorize_float(et_tinygrad/et_torch)} {desc} {flops:10.2f} MOPS {mem:8.2f} MB")
  np.testing.assert_allclose(val_tinygrad, val_torch, atol=1e-3, rtol=1e-3)

def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_x):
  torch.manual_seed(0)
  torch_dat = torch.rand(bs, in_chans, img_size_y, img_size_x, dtype=torch_dt).to(torch_device)
  torch_conv = torch.nn.Conv2d(in_chans, out_chans, kernel_size, bias=None, dtype=torch_dt).to(torch_device)

  tiny_dat = Tensor(torch_dat.cpu().numpy())
  tiny_conv = Conv2d(in_chans, out_chans, kernel_size, bias=None)
  tiny_conv.weight = Tensor(torch_conv.weight.detach().cpu().numpy())

  def f1(torch_dat): return torch_conv(torch_dat)
  def f2(tiny_dat): return tiny_conv(tiny_dat).realize()
  helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:{kernel_size}", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))

@unittest.skipIf(getenv("BIG") == 0, "no big tests")
class TestBigSpeed(unittest.TestCase):
  def test_add(self):
    def f(a, b): return a+b
    helper_test_generic_square('add', 8192, f, f)
  def test_exp(self):
    def f(a, b): return a.exp()
    helper_test_generic_square('exp', 8192, f, f, onearg=True)
  def test_gemm_2048(self):
    def f(a, b): return a @ b
    helper_test_generic_square('gemm', 2048, f, f)
  def test_gemm_4096(self):
    def f(a, b): return a @ b
    helper_test_generic_square('gemm', 4096, f, f)
  def test_large_conv_1x1(self): helper_test_conv(bs=32, in_chans=128, out_chans=128, kernel_size=1, img_size_y=128, img_size_x=128)
  def test_large_conv_3x3(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=3, img_size_y=130, img_size_x=130)
  def test_large_conv_5x5(self): helper_test_conv(bs=4, in_chans=128, out_chans=128, kernel_size=5, img_size_y=132, img_size_x=132)
  def test_matvec_4096_16384(self): helper_test_matvec('matvec_4096_16384', 4096, 16384)
  def test_matvec_16384_4096(self): helper_test_matvec('matvec_16384_4096', 16384, 4096)

@unittest.skipIf(getenv("BIG") == 1, "only big tests")
class TestSpeed(unittest.TestCase):
  def test_sub(self):
    def f(a, b): return a-b
    helper_test_generic_square('sub', 4096, f, f)

  @unittest.skipIf(CI and Device.DEFAULT == "WEBGPU", "breaking on webgpu CI")
  def test_pow(self):
    def f(a, b): return a.pow(b)
    helper_test_generic_square('pow', 2048, f, f)

  def test_sum(self):
    def f(a, b): return a.sum()
    helper_test_generic_square('sum', 2048, f, f, onearg=True)
    helper_test_generic_square('sum', 4096, f, f, onearg=True)

  def test_partial_sum(self):
    R = 256
    def f(a, b): return a.reshape(int(4096//R), int(4096*R)).sum(axis=1)
    helper_test_generic_square('partial_sum', 4096, f, f, onearg=True)

  @unittest.skip("not really used in models")
  def test_cumsum(self):
    def f0(a, b): return a.cumsum(axis=0)
    def f1(a, b): return a.cumsum(axis=1)
    helper_test_generic_square('cumsum_0', 256, f0, f0, onearg=True)
    helper_test_generic_square('cumsum_1', 256, f1, f1, onearg=True)

  def test_cat(self):
    helper_test_generic_square('cat_0', 256, lambda x,y: torch.cat((x,y),dim=0), lambda x,y: x.cat(y,dim=0))
    helper_test_generic_square('cat_1', 256, lambda x,y: torch.cat((x,y),dim=1), lambda x,y: x.cat(y,dim=1))

  def test_array_packing(self):
    N = 2048
    def f(a, b): return a.reshape(N, N // 32, 32).permute(1,0,2).contiguous()
    helper_test_generic_square('array_packing', N, f, f, onearg=True)

  def test_permute(self):
    for N in [1024, 4096]:
      # this is a 64MB tensor, M1 L1 cache is 128kB
      # to fit easily in L1, rotations should be 128x128 chunks. 128x128 is also the AMX size
      def f(a, b): return a.permute(1,0).contiguous()
      helper_test_generic_square('permute', N, f, f, onearg=True)

  def test_double_permute(self):
    N = 64
    torch.manual_seed(0)
    torch_a = (torch.rand(N, N, N, N, dtype=torch_dt) - 0.5).to(torch_device)
    tiny_a = Tensor(torch_a.cpu().numpy())
    def f(a): return a.permute(1,0,3,2).contiguous()
    helper_test_generic(f"double_permute {tiny_a.shape}", f, (torch_a,), TinyJit(lambda a: f(a).realize()), (tiny_a,))

  def test_neg(self):
    def f(a, b): return -a
    helper_test_generic_square('neg', 4096, f, f, onearg=True)

  def test_exp(self):
    def f(a, b): return a.exp()
    helper_test_generic_square('exp', 2048, f, f, onearg=True)

  def test_relu(self):
    def f(a, b): return a.relu()
    helper_test_generic_square('relu', 4096, f, f, onearg=True)

  def test_max(self):
    def f(a, b): return a.max()
    helper_test_generic_square('max', 4096, f, f, onearg=True)

  def test_mul_sum(self):
    def f(a, b): return (a*b).sum()
    helper_test_generic_square('mul_sum', 4096, f, f)

  def test_add(self):
    for N in [1, 1024, 4096]:
      def f(a, b): return a + b
      helper_test_generic_square('add', N, f, f)

  def test_add_constant(self):
    def f(a, b): return a+2.0
    helper_test_generic_square('add_constant', 4096, f, f, onearg=True)

  def test_add_sq(self):
    def f(a, b): return a*a + b*b
    helper_test_generic_square('add_sq', 4096, f, f)

  def test_gemm(self):
    def f(a, b): return a @ b
    helper_test_generic_square('gemm', 1024, f, f)

  def test_gemm_small(self):
    def f(a, b): return a @ b
    helper_test_generic_square('gemm', 256, f, f)

  def test_gemm_unrolled(self):
    N = 512
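    # f2 spells out the matmul: (a @ b.T)[i,j] = sum_k a[i,k]*b[j,k], via broadcasted mul and a reduce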
    def f1(a, b): return a@b.T
    def f2(a, b): return (a.reshape(N, 1, N).expand(N, N, N) * b.reshape(1, N, N).expand(N, N, N)).sum(axis=2)
    helper_test_generic_square('gemm_unrolled', N, f1, f2)

  def test_gemm_unrolled_permute_l(self):
    N = 512
    def f1(a, b): return a.T@b.T
    def f2(a, b): return (a.permute(1,0).reshape(N, 1, N).expand(N, N, N) * b.reshape(1, N, N).expand(N, N, N)).sum(axis=2)
    helper_test_generic_square('gemm_unrolled_permute_l', N, f1, f2)

  def test_gemm_unrolled_permute_r(self):
    N = 512
    def f1(a, b): return a@b
    def f2(a, b): return (a.reshape(N, 1, N).expand(N, N, N) * b.permute(1,0).reshape(1, N, N).expand(N, N, N)).sum(axis=2)
    helper_test_generic_square('gemm_unrolled_permute_r', N, f1, f2)

  def test_gemm_unrolled_permute_lr(self):
    N = 512
    def f1(a, b): return a.T@b
    def f2(a, b): return (a.permute(1,0).reshape(N, 1, N).expand(N, N, N) * b.permute(1,0).reshape(1, N, N).expand(N, N, N)).sum(axis=2)
    helper_test_generic_square('gemm_unrolled_permute_lr', N, f1, f2)

  def test_matvec_1024_1024(self): helper_test_matvec('matvec_1024_1024', 1024, 1024)
  def test_matvec_1024_4096(self): helper_test_matvec('matvec_1024_4096', 1024, 4096)
  def test_matvec_4096_1024(self): helper_test_matvec('matvec_4096_1024', 4096, 1024)
  def test_matvec_4096_4096(self): helper_test_matvec('matvec_4096_4096', 4096, 4096)

  def test_openpilot_conv2d(self):
    bs, in_chans, out_chans = 1,12,32
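    # the input here is laid out NHWC (1,64,128,12); both f1 and f2 permute to NCHW before the 3x3 conv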
    torch.manual_seed(0)
    torch_dat = torch.rand(bs, 64, 128, 12, dtype=torch_dt).to(torch_device)
    torch_conv = torch.nn.Conv2d(in_chans, out_chans, 3, bias=None, padding=1, dtype=torch_dt).to(torch_device)

    tiny_dat = Tensor(torch_dat.cpu().numpy())
    tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    tiny_conv.weight = Tensor(torch_conv.weight.detach().cpu().numpy())

    def f1(torch_dat): return torch_conv(torch_dat.permute(0,3,1,2))
    def f2(tiny_dat): return tiny_conv(tiny_dat.permute(0,3,1,2)).realize()
    helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:3", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))

  def test_conv2d(self):
    for bs in [32]:
      for in_chans in IN_CHANS:
        for out_chans in [32]:
          helper_test_conv(bs, in_chans, out_chans, 3, 34, 34)

if __name__ == '__main__':
  unittest.main()
181
tinygrad_repo/test/test_symbolic_jit.py
Normal file
@@ -0,0 +1,181 @@
import unittest
from tinygrad.jit import TinyJit
from tinygrad.helpers import getenv
from tinygrad.shape.symbolic import Variable
from tinygrad.tensor import Tensor, Device
import numpy as np

@unittest.skipIf(getenv("ARM64") or getenv("PTX"), "ARM64 and PTX are not supported")
@unittest.skipUnless(Device.DEFAULT in ["GPU", "METAL", "CLANG", "CUDA", "LLVM"], f"{Device.DEFAULT} is not supported")
class TestSymbolicJit(unittest.TestCase):
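  # each test reshapes its inputs to a bound Variable, so one jitted kernel should cover
  # every size; the jit_cache length assertion checks exactly that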
  def test_plus1(self):
    def f(a): return (a+1).realize()
    jf = TinyJit(f)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(3, i)
      symbolic = jf(a.reshape(3, vi)).reshape(3, i).numpy()
      expected = f(a).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert len(jf.jit_cache) == 1

  def test_reshape_inside_plus1(self):
    def f(a, jit=False, jit_ctx=None):
      if jit: a = a.reshape(3, Variable("i", 1, 10).bind(a.shape[1]))
      return (a+1).realize()
    jf = TinyJit(f)
    for i in range(1, 5):
      vi = Variable("i", 1, 10)
      a = Tensor.rand(3, i)
      symbolic = jf(a, jit=True, jit_ctx={vi: i}).reshape(3, i).numpy()
      expected = f(a).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert len(jf.jit_cache) == 1

  def test_add(self):
    def f(a, b): return (a+b).realize()
    jf = TinyJit(f)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(3, i)
      b = Tensor.rand(3, i)
      symbolic = jf(a.reshape(3, vi), b.reshape(3, vi)).reshape(3, i).numpy()
      expected = f(a, b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert len(jf.jit_cache) == 1

  def test_matmul(self):
    def f(a, b): return (a@b).realize()
    jf = TinyJit(f)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(3, i)
      b = Tensor.rand(i, 5)
      symbolic = jf(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
      expected = f(a, b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert len(jf.jit_cache) == 1

  def test_mixed_with_no_symbol_kernel(self):
    def f(a, b):
      s = (a@b).realize()
      s = (s+s).realize()  # this one does not have symbols in input
      return s
    jf = TinyJit(f)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(3, i)
      b = Tensor.rand(i, 5)
      symbolic = jf(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
      expected = f(a, b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert len(jf.jit_cache) == 2

  def test_attention(self):
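    # attention lowers to multiple kernels (matmuls, softmax pieces, etc.), hence the jit cache holds 6 entries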
|
||||
def f(q, k, v): return Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).realize()
|
||||
jf = TinyJit(f)
|
||||
for i in range(1, 5):
|
||||
vi = Variable("i", 1, 10).bind(i)
|
||||
q = Tensor.rand(2, 1, 4, 8)
|
||||
k = Tensor.rand(2, i, 4, 8)
|
||||
v = Tensor.rand(2, i, 4, 8)
|
||||
symbolic = jf(q, k.reshape(2, vi, 4, 8), v.reshape(2, vi, 4, 8)).reshape(2, 4, 1, 8).numpy()
|
||||
expected = f(q, k, v).numpy()
|
||||
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
|
||||
assert len(jf.jit_cache) == 6
|
||||
|
||||
def test_cat_dim0(self):
|
||||
def f(a, b): return a.cat(b, dim=0).realize()
|
||||
jf = TinyJit(f)
|
||||
for i in range(1, 5):
|
||||
vi = Variable("i", 1, 10).bind(i)
|
||||
a = Tensor.rand(i, 3)
|
||||
b = Tensor.rand(2, 3)
|
||||
symbolic = jf(a.reshape(vi, 3), b).reshape(i+2, 3).numpy()
|
||||
expected = f(a, b).numpy()
|
||||
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
|
||||
assert len(jf.jit_cache) == 1
|
||||
|
||||
def test_cat_dim1(self):
|
||||
def f(a, b): return a.cat(b, dim=1).realize()
|
||||
jf = TinyJit(f)
|
||||
for i in range(1, 5):
|
||||
vi = Variable("i", 1, 10).bind(i)
|
||||
a = Tensor.rand(3, i)
|
||||
b = Tensor.rand(3, 2)
|
||||
symbolic = jf(a.reshape(3, vi), b).reshape(3, i+2).numpy()
|
||||
expected = f(a, b).numpy()
|
||||
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
|
||||
assert len(jf.jit_cache) == 1
|
||||
|
||||
def test_cat_dim0_two_vars(self):
|
||||
def f(a, b): return a.cat(b, dim=0).realize()
|
||||
jf = TinyJit(f)
|
||||
for i in range(1, 5):
|
||||
for j in range(1, 5):
|
||||
vi = Variable("i", 1, 10).bind(i)
|
||||
vj = Variable("j", 1, 10).bind(j)
|
||||
a = Tensor.rand(i, 3)
|
||||
b = Tensor.rand(j, 3)
|
||||
symbolic = jf(a.reshape(vi, 3), b.reshape(vj, 3)).reshape(i+j, 3).numpy()
|
||||
expected = f(a, b).numpy()
|
||||
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
|
||||
assert len(jf.jit_cache) == 1
|
||||
|
||||
def test_cat_dim1_two_vars(self):
|
||||
def f(a, b): return a.cat(b, dim=1).realize()
|
||||
jf = TinyJit(f)
|
||||
for i in range(1, 5):
|
||||
for j in range(1, 5):
|
||||
vi = Variable("i", 1, 10).bind(i)
|
||||
vj = Variable("j", 1, 10).bind(j)
|
||||
a = Tensor.rand(3, i)
|
||||
b = Tensor.rand(3, j)
|
||||
symbolic = jf(a.reshape(3, vi), b.reshape(3, vj)).reshape(3, i+j).numpy()
|
||||
expected = f(a, b).numpy()
|
||||
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
|
||||
assert len(jf.jit_cache) == 1
|
||||
|
||||
def test_two_vars_plus1(self):
|
||||
def f(a, b): return (a@b+1).realize()
|
||||
jf = TinyJit(f)
|
||||
for i in range(1, 5):
|
||||
for j in range(1, 5):
|
||||
vi = Variable("i", 1, 10).bind(i)
|
||||
vj = Variable("j", 1, 10).bind(j)
|
||||
a = Tensor.rand(i, 3)
|
||||
b = Tensor.rand(3, j)
|
||||
symbolic = jf(a.reshape(vi, 3), b.reshape(3, vj)).reshape(i, j).numpy()
|
||||
expected = f(a, b).numpy()
|
||||
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
|
||||
assert len(jf.jit_cache) == 1
|
||||
|
||||
def test_jit_symbolic_shape_mismatch(self):
|
||||
@TinyJit
|
||||
def add(a, b): return (a+b).realize()
|
||||
for i in range(1, 5):
|
||||
vi = Variable("i", 1, 10).bind(i)
|
||||
a = Tensor.rand(3, i).reshape(3, vi)
|
||||
b = Tensor.rand(3, i).reshape(3, vi)
|
||||
c = add(a, b)
|
||||
vi2 = Variable("i", 1, 10).bind(7)
|
||||
a = Tensor.rand(3, 7).reshape(3, vi2)
|
||||
bad = Tensor.rand(4, 7).reshape(4, vi2)
|
||||
with self.assertRaises(AssertionError):
|
||||
add(a, bad)
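    # NOTE: the jit validates symbolic shapes against the shapes it was traced with;
    # (4, i) does not match the recorded (3, i), so the call must raise instead of
    # silently reusing the cached kernel.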

  def test_shrink(self):
    # shrink is a movement, so we pair it with a simple function to test the JIT interaction
    def f(a): return (a+1).realize()
    jf = TinyJit(f)
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(7, 11)
      symbolic = a.shrink(((3,5),(vi,vi+2)))
      symbolic = jf(symbolic).numpy()
      expected = f(a.shrink(((3,5),(i,i+2)))).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
    assert len(jf.jit_cache) == 1

if __name__ == '__main__':
  unittest.main()
124
tinygrad_repo/test/test_symbolic_ops.py
Normal file
@@ -0,0 +1,124 @@
import unittest
from tinygrad.jit import JIT_SUPPORTED_DEVICE
from tinygrad.shape.symbolic import Variable
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor, Device
import numpy as np

@unittest.skipIf(getenv("ARM64") or getenv("PTX"), "ARM64 and PTX are not supported")
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["HIP", "WEBGPU"], f"{Device.DEFAULT} is not supported")
class TestSymbolicOps(unittest.TestCase):
  def test_plus1(self):
    def f(a): return (a+1).realize()
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(3, i)
      symbolic = f(a.reshape(3, vi)).reshape(3, i).numpy()
      expected = f(a).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
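    # NOTE: every test in this file follows this pattern: run f on a tensor reshaped to a
    # symbolic shape, then compare against the purely concrete run of the same function
    # (reshaping back to concrete dims where the output shape is symbolic).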

  def test_add(self):
    def f(a, b): return (a+b).realize()
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(3, i)
      b = Tensor.rand(3, i)
      symbolic = f(a.reshape(3, vi), b.reshape(3, vi)).reshape(3, i).numpy()
      expected = f(a, b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_matmul(self):
    def f(a, b): return (a@b).realize()
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(3, i)
      b = Tensor.rand(i, 5)
      symbolic = f(a.reshape(3, vi), b.reshape(vi, 5)).numpy()
      expected = f(a, b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_attention(self, dropout_p=0.0):
    def f(q, k, v): return Tensor.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p).realize()
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      q = Tensor.rand(2, 1, 4, 8)
      k = Tensor.rand(2, i, 4, 8)
      v = Tensor.rand(2, i, 4, 8)
      symbolic = f(q, k.reshape(2, vi, 4, 8), v.reshape(2, vi, 4, 8)).reshape(2, 4, 1, 8).numpy()
      expected = f(q, k, v).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_attention_training(self):
    with Tensor.train():
      self.test_attention(dropout_p=0.0)
      with self.assertRaises(AssertionError):
        # symbolic shape dropout is not supported
        self.test_attention(dropout_p=0.5)
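      # NOTE: dropout builds a random mask sized by the tensor's element count; with an
      # unbound symbolic dim that count is unknown at trace time, hence the expected failure.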

  def test_cat_dim0(self):
    def f(a, b): return a.cat(b, dim=0).realize()
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(i, 3)
      b = Tensor.rand(2, 3)
      symbolic = f(a.reshape(vi, 3), b).reshape(i+2, 3).numpy()
      expected = f(a, b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_cat_dim1(self):
    def f(a, b): return a.cat(b, dim=1).realize()
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(3, i)
      b = Tensor.rand(3, 2)
      symbolic = f(a.reshape(3, vi), b).reshape(3, i+2).numpy()
      expected = f(a, b).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_cat_dim0_two_vars(self):
    def f(a, b): return a.cat(b, dim=0).realize()
    for i in range(1, 5):
      for j in range(1, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
        a = Tensor.rand(i, 3)
        b = Tensor.rand(j, 3)
        symbolic = f(a.reshape(vi, 3), b.reshape(vj, 3)).reshape(i+j, 3).numpy()
        expected = f(a, b).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_cat_dim1_two_vars(self):
    def f(a, b): return a.cat(b, dim=1).realize()
    for i in range(1, 5):
      for j in range(1, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
        a = Tensor.rand(3, i)
        b = Tensor.rand(3, j)
        symbolic = f(a.reshape(3, vi), b.reshape(3, vj)).reshape(3, i+j).numpy()
        expected = f(a, b).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_two_vars_plus1(self):
    def f(a, b): return (a@b+1).realize()
    for i in range(1, 5):
      for j in range(1, 5):
        vi = Variable("i", 1, 10).bind(i)
        vj = Variable("j", 1, 10).bind(j)
        a = Tensor.rand(i, 3)
        b = Tensor.rand(3, j)
        symbolic = f(a.reshape(vi, 3), b.reshape(3, vj)).reshape(i, j).numpy()
        expected = f(a, b).numpy()
        np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

  def test_shrink(self):
    for i in range(1, 5):
      vi = Variable("i", 1, 10).bind(i)
      a = Tensor.rand(7, 11)
      symbolic = a.shrink(((3,5),(vi,vi+2)))
      symbolic = symbolic.numpy()
      expected = a.shrink(((3,5),(i,i+2))).numpy()
      np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

if __name__ == '__main__':
  unittest.main()
173
tinygrad_repo/test/test_symbolic_shapetracker.py
Normal file
@@ -0,0 +1,173 @@
import unittest
from tinygrad.shape.shapetracker import ShapeTracker, View
from tinygrad.shape.symbolic import Variable
from tinygrad.tensor import Tensor

class TestSymbolic(unittest.TestCase):
  def test_symbolic_st(self):
    x = Variable("x", 1, 100)
    st = ShapeTracker.from_shape((x, 3))
    assert st.shape == (x, 3)
    assert st.real_strides() == (3, 1)
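    # NOTE: the strides stay (3, 1) no matter what x resolves to: a row-major (x, 3)
    # view always steps 3 elements per row and 1 per column.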

  def test_expr_idxs(self):
    x = Variable("x", 1, 100)
    st = ShapeTracker.from_shape((x, 3))
    idxs = [Variable("x", 0, 100), Variable("y", 0, 100)]
    e1, e2 = st.expr_idxs(idxs)
    assert e1.render() == "((x*3)+y)"
    assert e2.render() == "1"
    st = st.permute((1, 0))
    e1, e2 = st.expr_idxs(idxs)
    assert e1.render() == "((y*3)+x)"
    assert e2.render() == "1"

  def test_cat_dim0_strides(self):
    i = Variable("i", 1, 5).bind(3)
    j = Variable("j", 1, 5).bind(3)
    k = Variable("k", 1, 5).bind(3)
    t = Tensor.rand(3, 4).reshape(i, 4).cat(Tensor.rand(3, 4).reshape(j, 4), dim=0).cat(Tensor.rand(3, 4).reshape(k, 4), dim=0)
    st = t.lazydata.st
    assert st.shape == (i+j+k, 4)
    assert st.real_strides() == (4, 1)
    t = Tensor.rand(3, 3).reshape(i, 3).cat(Tensor.rand(3, 3).reshape(i, 3), dim=0).cat(Tensor.rand(3, 3), dim=0)
    st = t.lazydata.st
    assert st.shape == (2*i+3, 3)
    assert st.real_strides() == (3, 1)

  def test_cat_dim1_strides(self):
    i = Variable("i", 1, 5).bind(4)
    j = Variable("j", 1, 5).bind(4)
    k = Variable("k", 1, 5).bind(4)
    t = Tensor.rand(3, 4).reshape(3, i).cat(Tensor.rand(3, 4).reshape(3, j), dim=1).cat(Tensor.rand(3, 4).reshape(3, k), dim=1)
    st = t.lazydata.st
    assert st.shape == (3, i+j+k)
    assert st.real_strides() == (i+j+k, 1)
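    # NOTE: after a dim=1 cat the row stride itself becomes symbolic: stepping one row
    # skips i+j+k elements, so the stride expression must carry the variables.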

class TestSymbolicVarVals(unittest.TestCase):
  def test_var_vals_empty(self):
    assert ShapeTracker.from_shape((3, 4, 5)).var_vals == {}

  def test_var_vals_shape(self):
    x = Variable("x", 1, 100).bind(3)
    assert ShapeTracker.from_shape((x, 3)).var_vals == {Variable("x", 1, 100): 3}

  def test_var_vals_offset(self):
    x = Variable("x", 1, 100).bind(3)
    st = ShapeTracker.from_shape((4, 3)).shrink(((x, x+1), (0, 3)))
    assert st.real_offset() == x * 3
    assert st.var_vals == {Variable("x", 1, 100): 3}

  def test_var_vals_mask(self):
    x = Variable("x", 1, 100).bind(3)
    view = View.create(shape=(3,4), strides=(4,1), offset=0, mask=((0, x), (0, 4)))
    st = ShapeTracker(views=(view,))
    assert st.var_vals == {Variable("x", 1, 100): 3}

  def test_var_vals_complex(self):
    x = Variable("x", 1, 100).bind(3)
    y = Variable("y", 1, 100).bind(4)
    z = Variable("z", 1, 100).bind(5)
    st = ShapeTracker.from_shape((x, 5, y)).shrink(((0, x), (z, z+1), (0, 3)))
    assert st.real_offset() == y * z
    assert st.var_vals == {Variable("x", 1, 100): 3, Variable("y", 1, 100): 4, Variable("z", 1, 100): 5}

  def test_shrink_reshape(self):
    x = Variable("x", 1, 100).bind(3)
    st = ShapeTracker.from_shape((10, 10, 10)).shrink(((x, x+3), (3, 7), (2, 5)))
    st = st.reshape((3*4*3,))
    assert st.var_vals == {Variable("x", 1, 100): 3}

class TestShapeTrackerUnbind(unittest.TestCase):
  def test_view_unbind(self):
    v = Variable("v", 1, 100)
    bv = Variable("v", 1, 100).bind(3)
    assert View.create(shape=(bv, 4)).unbind() == View.create(shape=(v, 4))

  def test_reshape_unbind(self):
    v = Variable("v", 1, 100)
    bv = Variable("v", 1, 100).bind(3)
    t = Tensor.rand(3, 4).reshape(bv, 4)
    assert t.lazydata.st.unbind() == ShapeTracker((View.create(shape=(v, 4)),))

  def test_shrink_unbind(self):
    v = Variable("v", 1, 100)
    bv = Variable("v", 1, 100).bind(2)
    t = Tensor.rand(3, 4).shrink(((bv, bv+1), (0, 4)))
    assert t.lazydata.st.unbind() == ShapeTracker((View.create(shape=(1, 4), offset=4*v),))
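    # NOTE: unbind strips the concrete values from bound variables, so shapetrackers that
    # differ only in the bound value compare (and hash) equal, e.g. as jit cache keys.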

class TestSymbolicReshape(unittest.TestCase):
  def test_reshape_into_symbols_simple(self):
    for i in range(1, 6):
      vi = Variable("i", 1, 5).bind(i)
      t = Tensor.rand(i, 4).reshape(vi, 4)
      assert t.shape == (vi, 4)
      t = Tensor.rand(i, 6).reshape(vi, 2, 3)
      assert t.shape == (vi, 2, 3)

  def test_reshape_symbols_reshape_ints(self):
    for i in range(1, 6):
      vi = Variable("i", 1, 5).bind(i)
      t = Tensor.rand(i, 4).reshape(vi, 4)
      assert t.shape == (vi, 4)
      t = t.reshape(i, 4)
      assert t.shape == (i, 4)

  def test_reshape_into_symbols_bad_shape(self):
    vi = Variable("i", 1, 10).bind(4)
    with self.assertRaises(AssertionError):
      t = Tensor.rand(4, 6).reshape(vi, 6).reshape(1, 77)  # reshape through a symbolic shape into a different total size
    with self.assertRaises(AssertionError):
      t = Tensor.rand(3, 4).reshape(3, (vi+1))  # reshape into a Node that is not a single Variable

  def test_two_symbol_reshape(self):
    for i in range(1, 6):
      for j in range(1, 6):
        vi = Variable("i", 1, 5).bind(i)
        vj = Variable("j", 1, 5).bind(j)
        t = Tensor.rand(i, j).reshape(vi, vj)
        assert t.shape == (vi, vj)
        # NOTE: this is currently not allowed
        # t = t.reshape(1, vi*vj)
        # assert t.shape == (1, vi*vj)
        t = t.reshape(vj, vi)
        assert t.shape == (vj, vi)

class TestSymbolicExpand(unittest.TestCase):
  def test_expand_into_symbols(self):
    # TODO: enforce expand only into bound variables
    vi = Variable("i", 1, 5)
    vj = Variable("j", 1, 5)
    a = Tensor([[1], [2], [3]]).expand((3, vi))
    assert a.shape == (3, vi)
    a = a.reshape(3, vi, 1).expand((3, vi, vj))
    assert a.shape == (3, vi, vj)

  def test_plus_expands_constant(self):
    for i in range(1, 6):
      vi = Variable("i", 1, 5).bind(i)
      a = Tensor.rand(3, i).reshape(3, vi)
      a = a + 1
      assert a.shape == (3, vi)

class TestSymbolicShrink(unittest.TestCase):
  def test_shrink_symbols(self):
    vi = Variable("i", 1, 5)
    t = Tensor.rand(3, 5).shrink(((0, 2), (vi, vi+1)))
    assert t.shape == (2, 1)

class TestSymbolicShapeExpr(unittest.TestCase):
  def test_symbolic_expr_idxs(self):
    # taken from symbolic shape llama
    i = Variable("i", 1, 120)
    gidx0 = Variable("gidx0", 0, i)
    lidx1 = Variable("lidx1", 0, 7)
    idx = (gidx0, lidx1, Variable.num(1))
    shape = (i+1, 8, 4)
    strides = (1, (i*4)+4, i+1)
    st = ShapeTracker((View.create(shape, strides),))
    idx, valid = st.expr_idxs(idx)
    assert idx.render() == "((lidx1*((i*4)+4))+1+gidx0+i)"

if __name__ == '__main__':
  unittest.main()
266
tinygrad_repo/test/test_tensor.py
Normal file
@@ -0,0 +1,266 @@
import numpy as np
import torch
import struct
import unittest, copy
import mmap
from tinygrad.tensor import Tensor, Device
from tinygrad.helpers import dtypes
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from extra.utils import temp

x_init = np.random.randn(1,3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)

class TestTinygrad(unittest.TestCase):
  def test_zerodim_initialization(self):
    a = Tensor(55)
    b = Tensor(3.14)

    self.assertEqual(a.shape, ())
    self.assertEqual(b.shape, ())

  def test_plus_equals(self):
    a = Tensor.randn(10,10)
    b = Tensor.randn(10,10)
    c = a + b
    val1 = c.numpy()
    a += b
    val2 = a.numpy()
    np.testing.assert_allclose(val1, val2)

  def test_backward_pass(self):
    def test_tinygrad():
      x = Tensor(x_init, requires_grad=True)
      W = Tensor(W_init, requires_grad=True)
      m = Tensor(m_init)
      out = x.dot(W).relu()
      out = out.log_softmax()
      out = out.mul(m).add(m).sum()
      out.backward()
      return out.numpy(), x.grad.numpy(), W.grad.numpy()

    def test_pytorch():
      x = torch.tensor(x_init, requires_grad=True)
      W = torch.tensor(W_init, requires_grad=True)
      m = torch.tensor(m_init)
      out = x.matmul(W).relu()
      out = torch.nn.functional.log_softmax(out, dim=1)
      out = out.mul(m).add(m).sum()
      out.backward()
      return out.detach().numpy(), x.grad, W.grad

    for x,y in zip(test_tinygrad(), test_pytorch()):
      np.testing.assert_allclose(x, y, atol=1e-5)

  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "this test uses more than 8 bufs which breaks webgpu") # TODO: remove after #1461
  def test_backward_pass_diamond_model(self):
    def test_tinygrad():
      u = Tensor(U_init, requires_grad=True)
      v = Tensor(V_init, requires_grad=True)
      w = Tensor(W_init, requires_grad=True)
      x = u.mul(v).relu()
      y = u.mul(w).relu()
      out = x.add(y).mul(y).relu()
      out = out.log_softmax()
      out = out.sum()
      out.backward()
      return out.numpy(), u.grad.numpy(), v.grad.numpy(), w.grad.numpy()

    def test_pytorch():
      u = torch.tensor(U_init, requires_grad=True)
      v = torch.tensor(V_init, requires_grad=True)
      w = torch.tensor(W_init, requires_grad=True)
      x = u.mul(v).relu()
      y = u.mul(w).relu()
      out = x.add(y).mul(y).relu()
      out = torch.nn.functional.log_softmax(out, dim=1)
      out = out.sum()
      out.backward()
      return out.detach().numpy(), u.grad, v.grad, w.grad

    for x,y in zip(test_tinygrad(), test_pytorch()):
      np.testing.assert_allclose(x, y, atol=1e-5)

  def test_nograd(self):
    x = Tensor(x_init, requires_grad=False)
    m = Tensor(m_init, requires_grad=False)
    W = Tensor(W_init, requires_grad=True)
    tmp = x.mul(m)
    mm = tmp.matmul(W)
    out = mm.relu()
    out = out.sum()
    out.backward()
    assert x.grad is None
    assert m.grad is None
    assert tmp.grad is None
    assert mm.grad is not None
    assert W.grad is not None

  def test_dropout(self):
    with Tensor.train():
      n, rate = 1_000_000, 0.1
      w = Tensor.ones(n).dropout(rate)
      non_zeros = np.count_nonzero(w.numpy())
      expected = n * (1 - rate)
      np.testing.assert_allclose(non_zeros, expected, rtol=2e-3)
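      # NOTE: dropout zeroes each element independently with probability `rate`, so with
      # a million elements the nonzero count concentrates tightly around n*(1-rate);
      # rtol=2e-3 leaves room for the randomness.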

  def test_jacobian(self):
    W = np.random.RandomState(42069).random((10, 5)).astype(np.float32)
    x = np.random.RandomState(69420).random((1, 10)).astype(np.float32)

    torch_x = torch.tensor(x, requires_grad=True)
    torch_W = torch.tensor(W, requires_grad=True)
    torch_func = lambda x: torch.nn.functional.log_softmax(x.matmul(torch_W).relu(), dim=1)
    PJ = torch.autograd.functional.jacobian(torch_func, torch_x).squeeze().numpy()

    tiny_x = Tensor(x, requires_grad=True)
    tiny_W = Tensor(W, requires_grad=True)
    tiny_func = lambda x: x.dot(tiny_W).relu().log_softmax()
    J = jacobian(tiny_func, tiny_x)
    NJ = numerical_jacobian(tiny_func, tiny_x)

    np.testing.assert_allclose(PJ, J, atol=1e-5)
    np.testing.assert_allclose(PJ, NJ, atol=1e-3)

  def test_gradcheck(self):
    W = np.random.RandomState(1337).random((10, 5)).astype(np.float32)
    x = np.random.RandomState(7331).random((1, 10)).astype(np.float32)

    tiny_x = Tensor(x, requires_grad=True)
    tiny_W = Tensor(W, requires_grad=True)
    tiny_func = lambda x: x.dot(tiny_W).relu().log_softmax()

    self.assertTrue(gradcheck(tiny_func, tiny_x, eps=1e-3))

    # with a much smaller eps the float32 finite differences become a coarse approximation, so gradcheck should fail
    self.assertFalse(gradcheck(tiny_func, tiny_x, eps=1e-5))

  def test_random_fns_are_deterministic_with_seed(self):
    for random_fn in [Tensor.randn, Tensor.normal, Tensor.uniform, Tensor.scaled_uniform, Tensor.glorot_uniform, Tensor.kaiming_normal]:
      with self.subTest(msg=f"Tensor.{random_fn.__name__}"):
        Tensor.manual_seed(1337)
        a = random_fn(10,10).realize()
        Tensor.manual_seed(1337)
        b = random_fn(10,10).realize()
        np.testing.assert_allclose(a.numpy(), b.numpy())

  def test_randn_isnt_inf_on_zero(self):
    # simulate the failure case of rand handing a zero to randn
    original_rand, Tensor.rand = Tensor.rand, Tensor.zeros
    try: self.assertNotIn(np.inf, Tensor.randn(16).numpy())
    finally: Tensor.rand = original_rand

  def test_zeros_like_has_same_dtype(self):
    for datatype in [dtypes.float16, dtypes.float32, dtypes.int8, dtypes.int32, dtypes.int64, dtypes.uint8]:
      a = Tensor([1, 2, 3], dtype=datatype)
      b = Tensor.zeros_like(a)
      assert a.dtype == b.dtype, f"a.dtype and b.dtype should be {datatype}"
      assert a.shape == b.shape, f"shape mismatch (Tensor.zeros_like){a.shape} != (expected){b.shape}"

    a = Tensor([1, 2, 3])
    b = Tensor.zeros_like(a, dtype=dtypes.int8)
    assert a.dtype != b.dtype and a.dtype == dtypes.float32 and b.dtype == dtypes.int8, "a.dtype should be float and b.dtype should be char"
    assert a.shape == b.shape, f"shape mismatch (Tensor.zeros_like){a.shape} != (expected){b.shape}"

  def test_ones_like_has_same_dtype_and_shape(self):
    for datatype in [dtypes.float16, dtypes.float32, dtypes.int8, dtypes.int32, dtypes.int64, dtypes.uint8]:
      a = Tensor([1, 2, 3], dtype=datatype)
      b = Tensor.ones_like(a)
      assert a.dtype == b.dtype, f"a.dtype and b.dtype should be {datatype}"
      assert a.shape == b.shape, f"shape mismatch (Tensor.ones_like){a.shape} != (expected){b.shape}"

    a = Tensor([1, 2, 3])
    b = Tensor.ones_like(a, dtype=dtypes.int8)
    assert a.dtype != b.dtype and a.dtype == dtypes.float32 and b.dtype == dtypes.int8, "a.dtype should be float and b.dtype should be char"
    assert a.shape == b.shape, f"shape mismatch (Tensor.ones_like){a.shape} != (expected){b.shape}"

  def test_ndim(self):
    assert Tensor.randn(1).ndim == 1
    assert Tensor.randn(2,2,2).ndim == 3
    assert Tensor.randn(1,1,1,1,1,1).ndim == 6

  def test_argfix(self):
    self.assertEqual(Tensor.zeros().shape, ())
    self.assertEqual(Tensor.ones().shape, ())

    self.assertEqual(Tensor.zeros([]).shape, ())
    self.assertEqual(Tensor.ones([]).shape, ())

    self.assertEqual(Tensor.zeros(tuple()).shape, ())
    self.assertEqual(Tensor.ones(tuple()).shape, ())

    self.assertEqual(Tensor.zeros(1).shape, (1,))
    self.assertEqual(Tensor.ones(1).shape, (1,))

    self.assertEqual(Tensor.zeros(1,10,20).shape, (1,10,20))
    self.assertEqual(Tensor.ones(1,10,20).shape, (1,10,20))

    self.assertEqual(Tensor.zeros([1]).shape, (1,))
    self.assertEqual(Tensor.ones([1]).shape, (1,))

    self.assertEqual(Tensor.zeros([10,20,40]).shape, (10,20,40))
    self.assertEqual(Tensor.ones([10,20,40]).shape, (10,20,40))

  def test_numel(self):
    assert Tensor.randn(10, 10).numel() == 100
    assert Tensor.randn(1,2,5).numel() == 10
    assert Tensor.randn(1,1,1,1,1,1).numel() == 1
    assert Tensor([]).numel() == 0
    # assert Tensor.randn(1,0,2,5).numel() == 0  # TODO: fix empty tensors

  def test_element_size(self):
    for _, dtype in dtypes.fields().items():
      assert dtype.itemsize == Tensor.randn(3, dtype=dtype).element_size(), f"Tensor.element_size() not matching Tensor.dtype.itemsize for {dtype}"

  def test_deepwalk_ctx_check(self):
    layer = Tensor.uniform(1, 1, requires_grad=True)
    x = Tensor.randn(1, 1, 1)
    x.dot(layer).mean().backward()
    x = Tensor.randn(1, 1, 1)
    x.dot(layer).mean().backward()

  def test_zerosized_tensors(self):
    Tensor([]).realize()
    Tensor([]).numpy()

  def test_tensor_ndarray_dtype(self):
    arr = np.array([1])  # where dtype is implicitly int64
    assert Tensor(arr).dtype == dtypes.int64
    assert Tensor(arr, dtype=dtypes.float32).dtype == dtypes.float32  # check if ndarray correctly casts to Tensor dtype
    assert Tensor(arr, dtype=dtypes.float64).dtype == dtypes.float64  # check that it works for something else

  def test_tensor_list_dtype(self):
    arr = [1]
    assert Tensor(arr).dtype == Tensor.default_type
    assert Tensor(arr, dtype=dtypes.float32).dtype == dtypes.float32
    assert Tensor(arr, dtype=dtypes.float64).dtype == dtypes.float64

  def test_tensor_copy(self):
    x = copy.deepcopy(Tensor.ones((3,3,3)))
    np.testing.assert_allclose(x.numpy(), np.ones((3,3,3)))

  def test_copy_from_disk(self):
    t = Tensor.randn(30, device="CPU").to(f"disk:{temp('test_copy_from_disk')}")
    a = t[10:20]
    dev = a.to(Device.DEFAULT)
    np.testing.assert_allclose(a.numpy(), dev.numpy())

  # Regression test for https://github.com/tinygrad/tinygrad/issues/1751
  def test_copy_from_numpy_unaligned(self):
    # 2**15 is the minimum for repro
    arr = np.random.randn(2**15).astype(dtypes.float.np)
    fn = temp('test_copy_from_numpy_unaligned')
    with open(fn, 'wb') as f: f.write(b't' + arr.tobytes())
    with open(fn, "a+b") as f: memview = memoryview(mmap.mmap(f.fileno(), arr.nbytes + 1))
    ua_arr = np.frombuffer(memview[1:], dtype=arr.dtype, count=arr.shape[0])
    np.testing.assert_allclose(arr, ua_arr)
    assert not ua_arr.flags.aligned
    # force a device copy: a bare to() would be optimized away, so run a kernel (divide by 1) to make the copy happen
    np.testing.assert_allclose(ua_arr, (Tensor(ua_arr)/Tensor(1)).numpy())

if __name__ == '__main__':
  unittest.main()
99
tinygrad_repo/test/test_uops.py
Normal file
@@ -0,0 +1,99 @@
from typing import Optional, Tuple, Any, List
import unittest, math
import numpy as np
from tinygrad.helpers import dtypes, getenv, DType, PtrDType
from tinygrad.tensor import Device
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps, ASTRunner, Compiled
from tinygrad.codegen.linearizer import UOps, UOp

def _uops_to_prg(uops):
  src, runtime_args = Device[Device.DEFAULT].renderer("test", uops)
  return ASTRunner("test", src,
    [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
    runtime_args=runtime_args).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)

def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
  uops.append(UOp(uop, dtype, tuple(vin), arg, len(uops)))
  return uops[-1]

def _test_single_value(vals, op, dtype):
  uops = []
  buf_store = uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), ('data0', dtype))
  buf_loads = [uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), (f'data{i+1}', dtype)) for i in range(len(vals))]
  loads = (uop(uops, UOps.LOAD, dtype, [buf_loads[i], uop(uops, UOps.CONST, dtypes.int32, (), 0)]) for i in range(len(vals)))
  alu = uop(uops, UOps.ALU, dtype, loads, op)
  uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
  buf = Device[Device.DEFAULT].buffer(1, dtype)
  buf2 = [Device[Device.DEFAULT].buffer.fromCPU(np.array([a], dtype=dtype.np)) for a in vals]
  prg = _uops_to_prg(uops)
  prg([buf]+buf2)
  return buf.toCPU()[0]
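
# NOTE: the helpers above bypass the tensor frontend entirely: they hand-build a uop
# list (define buffers, load, ALU, store), render it to source for the default device,
# compile it, and run it on single-element buffers so each op is checked in isolation.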

def _test_single_value_const(vals, op, dtype):
  uops = []
  buf_store = uop(uops, UOps.DEFINE_GLOBAL, PtrDType(dtype), (), ('data0', dtype))
  loads = (uop(uops, UOps.CONST, dtype, [], a) for a in vals)
  alu = uop(uops, UOps.ALU, dtype, loads, op)
  uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
  buf = Device[Device.DEFAULT].buffer(1, dtype)
  prg = _uops_to_prg(uops)
  prg([buf])
  return buf.toCPU()[0]

class TestUOps(unittest.TestCase):
  def _equal(self, v1, v2):
    if not (math.isnan(v1) and math.isnan(v2)): self.assertAlmostEqual(v1, v2, places=5)

  def _test_uop_fxn(self, bop, fxn, dt=dtypes.float32):
    for f in [_test_single_value, _test_single_value_const]:
      for a in [-2.0, 0.0, 1.0]:
        self._equal(f([a], bop, dt), fxn(a))

  def _test_bop_fxn(self, bop, fxn, dt=dtypes.float32, no_b_zero=False):
    for f in [_test_single_value, _test_single_value_const]:
      for a in [-2.0, 0.0, 1.0]:
        for b in [-3.0, 1.0] + ([] if no_b_zero else [0.0]):
          self._equal(f([a,b], bop, dt), fxn(a,b))

  def _test_top_fxn(self, bop, fxn, dt=dtypes.float32):
    for f in [_test_single_value, _test_single_value_const]:
      for a in [-2.0, 0, 1]:
        for b in [-3.0, 3.0]:
          for c in [-4.0, 4.0]:
            self._equal(f([a,b,c], bop, dt), fxn(a,b,c))

@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
class TestFloatUOps(TestUOps):
  def test_neg(self): self._test_uop_fxn(UnaryOps.NEG, lambda a: -a)
  def test_exp2(self): self._test_uop_fxn(UnaryOps.EXP2, lambda a: np.exp2(a))
  def test_log2(self): self._test_uop_fxn(UnaryOps.LOG2, lambda a: math.log2(a) if a > 0 else float('-inf' if a==0 else 'nan'))
  def test_sin(self): self._test_uop_fxn(UnaryOps.SIN, lambda a: math.sin(a))
  def test_sqrt(self): self._test_uop_fxn(UnaryOps.SQRT, lambda a: math.sqrt(a) if a >= 0 else float('nan'))
  # this is not supported on most backends
  #def test_recip(self): self._test_uop_fxn(UnaryOps.RECIP, lambda a: 1.0/a if a != 0 else float('inf'))

  def test_add(self): self._test_bop_fxn(BinaryOps.ADD, lambda a,b: a+b)
  def test_sub(self): self._test_bop_fxn(BinaryOps.SUB, lambda a,b: a-b)
  def test_mul(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: a*b)
  def test_div(self): self._test_bop_fxn(BinaryOps.DIV, lambda a,b: a/b if b != 0 else a*float('inf'))
  def test_max(self): self._test_bop_fxn(BinaryOps.MAX, lambda a,b: max(a,b))
  def test_cmplt(self): self._test_bop_fxn(BinaryOps.CMPLT, lambda a,b: float(a<b))
  # MOD isn't tested on floats

  def test_mulacc(self): self._test_top_fxn(TernaryOps.MULACC, lambda a,b,c: (a*b)+c)
  def test_where(self): self._test_top_fxn(TernaryOps.WHERE, lambda a,b,c: b if a!=0 else c)

# TODO: fix this on all the backends
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled) or getenv('ARM64', False), "only test for compiled backends, broken on some")
class TestNonFloatUOps(TestUOps):
  def test_neg_int32(self): self._test_uop_fxn(UnaryOps.NEG, lambda a: -a, dtypes.int32)
  def test_add_int32(self): self._test_bop_fxn(BinaryOps.ADD, lambda a,b: int(a)+int(b), dtypes.int32)
  def test_sub_int32(self): self._test_bop_fxn(BinaryOps.SUB, lambda a,b: int(a)-int(b), dtypes.int32)
  def test_mul_int32(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: int(a)*int(b), dtypes.int32)
  def test_div_int32(self): self._test_bop_fxn(BinaryOps.DIV, lambda a,b: int(a/b), dtypes.int32, no_b_zero=True)
  def test_mod_int32(self): self._test_bop_fxn(BinaryOps.MOD, lambda a,b: abs(int(a))%abs(int(b))*(1,-1)[a<0], dtypes.int32, no_b_zero=True)
  def test_cmplt_int32(self): self._test_bop_fxn(BinaryOps.CMPLT, lambda a,b: float(a<b), dtypes.int32)
  def test_mul_bool(self): self._test_bop_fxn(BinaryOps.MUL, lambda a,b: bool(a) and bool(b), dtypes.bool)

if __name__ == '__main__':
  unittest.main(verbosity=2)
51
tinygrad_repo/test/test_webgpu.js
Normal file
@@ -0,0 +1,51 @@
const puppeteer = require('puppeteer');
const { spawn } = require('child_process');
const res = spawn("python", ["-m", "http.server", "8000"], { shell: true });

async function timeout(time) {
  return new Promise((resolve) => setTimeout(resolve, time));
}

function cleanup(err) {
  res.kill();
  if (err != null) {
    console.error(err);
    process.exit(1);
  }
}

async function waitForText(selector, text) {
  let n = 0;
  let ready = false;
  while (n < 10) {
    const res = await (await selector.getProperty("textContent")).jsonValue();
    console.log(`waiting for text ${text} got ${res}`);
    if (res == text) {
      ready = true;
      break;
    }
    await timeout(2000);
    n += 1;
  }
  return ready;
}
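
// NOTE: waitForText polls textContent every 2s, up to 10 tries, instead of using a fixed
// timeout; the example page is expected to set #result to "ready" when it has loaded and
// later to the model's prediction ("hen").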

puppeteer.launch({ headless: false, args: ["--enable-unsafe-webgpu"] }).then(async browser => {
  const page = await browser.newPage();
  page.on("console", message => console.log(`message from console ${message.text()}`))
    .on("pageerror", ({ message }) => console.log(`error from page ${message}`));

  const res = await page.goto("http://localhost:8000/examples/index.html");
  if (res.status() != 200) throw new Error("Failed to load page");
  const textSelector = await page.waitForSelector("#result");
  const buttonSelector = await page.waitForSelector("input[type=button]");
  const ready = await waitForText(textSelector, "ready");
  if (!ready) throw new Error("Failed to load page");
  await buttonSelector.evaluate(e => e.click());
  const done = await waitForText(textSelector, "hen");
  if (!done) throw new Error("failed to get hen");
  browser.close();
  cleanup(null);
}).catch(err => {
  cleanup(err);
});
40
tinygrad_repo/test/test_winograd.py
Normal file
@@ -0,0 +1,40 @@
import unittest
from tinygrad.helpers import Timing, CI
from tinygrad.tensor import Tensor
from tinygrad.ops import LoadOps
from tinygrad.codegen.linearizer import Linearizer
from test.test_net_speed import start_profile, stop_profile

class TestWinograd(unittest.TestCase):
  def setUp(self):
    self.old = Tensor.wino
    Tensor.wino = 1
  def tearDown(self): Tensor.wino = self.old
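
  # NOTE: Tensor.wino toggles the Winograd path for 3x3 convolutions; the tests in this
  # class force it on, and tearDown restores whatever the ambient setting was.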

  def test_speed(self):
    x = Tensor.empty(1,4,9,9)
    w = Tensor.empty(4,4,3,3)

    with Timing("running conv: "):
      out = Tensor.conv2d(x, w)

    with Timing("scheduling: "):
      sched = out.lazydata.schedule()

    for i,s in enumerate(sched):
      if s.ast.op in LoadOps: continue
      ops = s.ast.get_lazyops()
      with Timing(f"linearize {i} with {len(ops):4d} ops: "):
        l = Linearizer(s.ast)
        l.hand_coded_optimizations()
        l.linearize()

  def test_profile(self):
    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
    if not CI: pr = start_profile()
    out = Tensor.conv2d(x,w).realize()
    if not CI: stop_profile(pr, sort='time')
    out.numpy()

if __name__ == '__main__':
  unittest.main(verbosity=2)
66
tinygrad_repo/test/unit/test_disk_cache.py
Normal file
@@ -0,0 +1,66 @@
import unittest
import pickle
from tinygrad.helpers import diskcache_get, diskcache_put

def remote_get(table,q,k): q.put(diskcache_get(table, k))
def remote_put(table,k,v): diskcache_put(table, k, v)

class DiskCache(unittest.TestCase):
  def test_putget(self):
    table = "test_putget"
    diskcache_put(table, "hello", "world")
    self.assertEqual(diskcache_get(table, "hello"), "world")
    diskcache_put(table, "hello", "world2")
    self.assertEqual(diskcache_get(table, "hello"), "world2")

  def test_putcomplex(self):
    table = "test_putcomplex"
    diskcache_put(table, "k", ("complex", 123, "object"))
    ret = diskcache_get(table, "k")
    self.assertEqual(ret, ("complex", 123, "object"))

  def test_getotherprocess(self):
    table = "test_getotherprocess"
    from multiprocessing import Process, Queue
    diskcache_put(table, "k", "getme")
    q = Queue()
    p = Process(target=remote_get, args=(table,q,"k"))
    p.start()
    p.join()
    self.assertEqual(q.get(), "getme")

  def test_putotherprocess(self):
    table = "test_putotherprocess"
    from multiprocessing import Process
    p = Process(target=remote_put, args=(table,"k", "remote"))
    p.start()
    p.join()
    self.assertEqual(diskcache_get(table, "k"), "remote")
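    # NOTE: both cross-process tests work because the cache lives on disk rather than in
    # process memory: a value put (or read) in a spawned child is visible to the parent.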

  def test_no_table(self):
    self.assertIsNone(diskcache_get("faketable", "k"))

  def test_ret(self):
    table = "test_ret"
    self.assertEqual(diskcache_put(table, "key", ("vvs",)), ("vvs",))

  def test_non_str_key(self):
    table = "test_non_str_key"
    diskcache_put(table, 4, 5)
    self.assertEqual(diskcache_get(table, 4), 5)
    self.assertEqual(diskcache_get(table, "4"), 5)

  def test_dict_key(self):
    table = "test_dict_key"
    fancy_key = {"hello": "world", "goodbye": 7, "good": True, "pkl": pickle.dumps("cat")}
    fancy_key2 = {"hello": "world", "goodbye": 8, "good": True, "pkl": pickle.dumps("cat")}
    fancy_key3 = {"hello": "world", "goodbye": 8, "good": True, "pkl": pickle.dumps("dog")}
    diskcache_put(table, fancy_key, 5)
    self.assertEqual(diskcache_get(table, fancy_key), 5)
    diskcache_put(table, fancy_key2, 8)
    self.assertEqual(diskcache_get(table, fancy_key2), 8)
    self.assertEqual(diskcache_get(table, fancy_key), 5)
    self.assertEqual(diskcache_get(table, fancy_key3), None)

if __name__ == "__main__":
  unittest.main()
150
tinygrad_repo/test/unit/test_disk_tensor.py
Normal file
@@ -0,0 +1,150 @@
import pathlib
import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
from tinygrad.helpers import dtypes, Timing
from tinygrad.runtime.ops_disk import RawDiskBuffer
from extra.utils import fetch_as_file, temp

def compare_weights_both(url):
  import torch
  fn = fetch_as_file(url)
  tg_weights = get_state_dict(torch_load(fn))
  torch_weights = get_state_dict(torch.load(fn), tensor_type=torch.Tensor)
  assert list(tg_weights.keys()) == list(torch_weights.keys())
  for k in tg_weights:
    np.testing.assert_equal(tg_weights[k].numpy(), torch_weights[k].numpy(), err_msg=f"mismatch at {k}, {tg_weights[k].shape}")
  print(f"compared {len(tg_weights)} weights")

class TestTorchLoad(unittest.TestCase):
  # pytorch pkl format
  def test_load_enet(self): compare_weights_both("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth")
  # pytorch zip format
  def test_load_enet_alt(self): compare_weights_both("https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth")
  # pytorch zip format
  def test_load_convnext(self): compare_weights_both('https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth')
  # TODO: support pytorch tar format with minimal lines
  #def test_load_resnet(self): compare_weights_both('https://download.pytorch.org/models/resnet50-19c8e357.pth')

test_fn = pathlib.Path(__file__).parents[2] / "weights/LLaMA/7B/consolidated.00.pth"
#test_size = test_fn.stat().st_size
test_size = 1024*1024*1024*2

# sudo su -c 'sync; echo 1 > /proc/sys/vm/drop_caches' && python3 test/unit/test_disk_tensor.py TestRawDiskBuffer.test_readinto_read_speed
@unittest.skipIf(not test_fn.exists(), "download LLaMA weights for read in speed tests")
class TestRawDiskBuffer(unittest.TestCase):
  def test_readinto_read_speed(self):
    tst = np.empty(test_size, np.uint8)
    with open(test_fn, "rb") as f:
      with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
        f.readinto(tst)

  def test_mmap_read_speed(self):
    db = RawDiskBuffer(test_size, dtype=dtypes.uint8, device=test_fn)
    tst = np.empty(test_size, np.uint8)
    with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
      np.copyto(tst, db.toCPU())

@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu doesn't support uint8 datatype")
class TestSafetensors(unittest.TestCase):
  def test_real_safetensors(self):
    import torch
    from safetensors.torch import save_file
    torch.manual_seed(1337)
    tensors = {
      "weight1": torch.randn((16, 16)),
      "weight2": torch.arange(0, 17, dtype=torch.uint8),
      "weight3": torch.arange(0, 17, dtype=torch.int32).reshape(17,1,1),
      "weight4": torch.arange(0, 2, dtype=torch.uint8),
    }
    save_file(tensors, temp("model.safetensors"))

    ret = safe_load(temp("model.safetensors"))
    for k,v in tensors.items(): np.testing.assert_array_equal(ret[k].numpy(), v.numpy())
    safe_save(ret, temp("model.safetensors_alt"))
    with open(temp("model.safetensors"), "rb") as f:
      with open(temp("model.safetensors_alt"), "rb") as g:
        assert f.read() == g.read()
    ret2 = safe_load(temp("model.safetensors_alt"))
    for k,v in tensors.items(): np.testing.assert_array_equal(ret2[k].numpy(), v.numpy())

  def test_efficientnet_safetensors(self):
    from models.efficientnet import EfficientNet
    model = EfficientNet(0)
    state_dict = get_state_dict(model)
    safe_save(state_dict, temp("eff0"))
    state_dict_loaded = safe_load(temp("eff0"))
    assert sorted(state_dict_loaded.keys()) == sorted(state_dict.keys())
    for k,v in state_dict.items():
      np.testing.assert_array_equal(v.numpy(), state_dict_loaded[k].numpy())

    # load with the real safetensors
    from safetensors import safe_open
    with safe_open(temp("eff0"), framework="pt", device="cpu") as f:
      assert sorted(f.keys()) == sorted(state_dict.keys())
      for k in f.keys():
        np.testing.assert_array_equal(f.get_tensor(k).numpy(), state_dict[k].numpy())

  def test_huggingface_enet_safetensors(self):
    # test a real file
    fn = fetch_as_file("https://huggingface.co/timm/mobilenetv3_small_075.lamb_in1k/resolve/main/model.safetensors")
    state_dict = safe_load(fn)
    assert len(state_dict.keys()) == 244
    assert 'blocks.2.2.se.conv_reduce.weight' in state_dict
    assert state_dict['blocks.0.0.bn1.num_batches_tracked'].numpy() == 276570
    assert state_dict['blocks.2.0.bn2.num_batches_tracked'].numpy() == 276570

  def test_metadata(self):
    metadata = {"hello": "world"}
    safe_save({}, temp('metadata.safetensors'), metadata)
    import json, struct
    with open(temp('metadata.safetensors'), 'rb') as f:
      dat = f.read()
    # the first 8 bytes of a safetensors file are a little-endian uint64 header size
    sz = struct.unpack("<Q", dat[0:8])[0]
    assert json.loads(dat[8:8+sz])['__metadata__']['hello'] == 'world'

def helper_test_disk_tensor(fn, data, np_fxn, tinygrad_fxn=None):
  if tinygrad_fxn is None: tinygrad_fxn = np_fxn
  pathlib.Path(temp(fn)).unlink(missing_ok=True)
  tinygrad_tensor = Tensor(data, device="CPU").to(f"disk:{temp(fn)}")
  numpy_arr = np.array(data)
  tinygrad_fxn(tinygrad_tensor)
  np_fxn(numpy_arr)
  np.testing.assert_allclose(tinygrad_tensor.numpy(), numpy_arr)

class TestDiskTensor(unittest.TestCase):
  def test_empty(self):
    pathlib.Path(temp("dt1")).unlink(missing_ok=True)
    Tensor.empty(100, 100, device=f"disk:{temp('dt1')}")

  def test_write_ones(self):
    pathlib.Path(temp("dt2")).unlink(missing_ok=True)

    out = Tensor.ones(10, 10, device="CPU")
    outdisk = out.to(f"disk:{temp('dt2')}")
    print(outdisk)
    outdisk.realize()
    del out, outdisk

    # test the file contents: 100 float32 ones
    with open(temp("dt2"), "rb") as f:
      assert f.read() == b"\x00\x00\x80\x3F" * 100

    # test loading it back
    reloaded = Tensor.empty(10, 10, device=f"disk:{temp('dt2')}")
    out = reloaded.numpy()
    assert np.all(out == 1.)

  def test_assign_slice(self):
    def assign(x,s,y): x[s] = y
    helper_test_disk_tensor("dt3", [0,1,2,3], lambda x: assign(x, slice(0,2), [13, 12]))
    helper_test_disk_tensor("dt4", [[0,1,2,3],[4,5,6,7]], lambda x: assign(x, slice(0,1), [[13, 12, 11, 10]]))

  def test_reshape(self):
    helper_test_disk_tensor("dt5", [1,2,3,4,5], lambda x: x.reshape((1,5)))
    helper_test_disk_tensor("dt6", [1,2,3,4], lambda x: x.reshape((2,2)))

if __name__ == "__main__":
  unittest.main()
44
tinygrad_repo/test/unit/test_flopcounter.py
Normal file
@@ -0,0 +1,44 @@
#!/usr/bin/env python
import unittest
from tinygrad.ops import LazyOp, BinaryOps, ReduceOps, get_lazyop_info, BufferOps, MemBuffer
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.helpers import dtypes

class TestFlopCounter(unittest.TestCase):
  def setUp(self):
    self.buf0 = LazyOp(BufferOps.MEM, (), MemBuffer(1, dtypes.float32, ShapeTracker.from_shape((4,))))
    self.buf1 = LazyOp(BufferOps.MEM, (), MemBuffer(2, dtypes.float32, ShapeTracker.from_shape((4,))))

  def test_flops_add(self):
    op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
    info = get_lazyop_info(op0)
    self.assertEqual(info.flops, 4)
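    # NOTE: an elementwise binary op over shape (4,) counts one flop per output element,
    # hence 4.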

  def test_flops_add_twice(self):
    op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
    op1 = LazyOp(BinaryOps.ADD, (op0,self.buf1,), None)
    info = get_lazyop_info(op1)
    self.assertEqual(info.flops, 8)

  def test_flops_add_self(self):
    op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
    op1 = LazyOp(BinaryOps.ADD, (op0,op0,), None)
    info = get_lazyop_info(op1)
    self.assertEqual(info.flops, 8)

  def test_flops_add_roundabout_self(self):
    op0 = LazyOp(BinaryOps.ADD, (self.buf0,self.buf1,), None)
    op1 = LazyOp(BinaryOps.ADD, (op0,self.buf1,), None)
    op2 = LazyOp(BinaryOps.ADD, (op0,op1,), None)
    info = get_lazyop_info(op2)
    self.assertEqual(info.flops, 12)

  def test_flops_red(self):
    op0 = LazyOp(BinaryOps.MUL, (self.buf0,self.buf1,), None)
    op1 = LazyOp(ReduceOps.SUM, (op0,), (1,))
    op2 = LazyOp(BinaryOps.ADD, (op1, op1,), None)
    info = get_lazyop_info(op2)
    self.assertEqual(info.flops, 9)
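    # NOTE: 4 multiplies + 4 adds for the sum-reduction over 4 elements + 1 add on the
    # reduced (1,) shape = 9 flops.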

if __name__ == '__main__':
  unittest.main()
142
tinygrad_repo/test/unit/test_helpers.py
Normal file
@@ -0,0 +1,142 @@
import unittest
import numpy as np
from tinygrad.helpers import Context, ContextVar, DType, dtypes, merge_dicts, strip_parens, prod
from tinygrad.shape.symbolic import Variable, NumNode

VARIABLE = ContextVar("VARIABLE", 0)

class TestContextVars(unittest.TestCase):
  # Ensure that the tests do not modify variables outside their scope.
  ctx = Context()
  def setUp(self): TestContextVars.ctx.__enter__()
  def tearDown(self): TestContextVars.ctx.__exit__()

  def test_initial_value_is_set(self):
    _TMP = ContextVar("_TMP", 5)
    self.assertEqual(_TMP.value, 5)

  def test_multiple_creation_ignored(self):
    _TMP2 = ContextVar("_TMP2", 1)
    _TMP2 = ContextVar("_TMP2", 2)
    self.assertEqual(_TMP2.value, 1)

  def test_new_var_inside_context(self):
    # Creating a _new_ variable inside a context should not have any effect on its scope (?)
    with Context(VARIABLE=1):
      _TMP3 = ContextVar("_TMP3", 1)
    _TMP3 = ContextVar("_TMP3", 2)
    self.assertEqual(_TMP3.value, 1)

  def test_value_accross_modules(self):
    # Mocking a module import by invoking the code, but not in our globals().
    exec('from tinygrad.helpers import ContextVar;C = ContextVar("C", 13)', {}) # pylint:disable=exec-used
    # It should not matter that the first creation was in another module.
    C = ContextVar("C", 0)
    self.assertEqual(C.value, 13)

  def test_assignment_across_modules(self):
    B = ContextVar("B", 1)
    # local assignment
    B.value = 2
    self.assertEqual(B.value, 2)
    # Assignment in another module.
    exec('from tinygrad.helpers import ContextVar;B = ContextVar("B", 0);B.value = 3;', {}) # pylint:disable=exec-used
    # Assignment in another module should affect this one as well.
    self.assertEqual(B.value, 3)

  def test_context_assignment(self):
    with Context(VARIABLE=1):
      self.assertEqual(VARIABLE.value, 1)
    self.assertEqual(VARIABLE.value, 0)

  def test_unknown_param_to_context(self):
    with self.assertRaises(KeyError):
      with Context(SOMETHING_ELSE=1):
        pass

  def test_inside_context_assignment(self):
    with Context(VARIABLE=4):
      # What you can and cannot do inside a context.
      # 1. This type of statement has no effect.
      VARIABLE = ContextVar("VARIABLE", 0)
      self.assertTrue(VARIABLE >= 4, "ContextVars inside a contextmanager may not set a new value")

      # 2. The call syntax however has a local effect.
      VARIABLE.value = 13
      self.assertTrue(VARIABLE.value == 13, "Call syntax however works inside a contextmanager.")

    # Related to 2. above. Note that VARIABLE is back to 0 again as expected.
    self.assertEqual(VARIABLE.value, 0)

  def test_new_var_inside_context_other_module(self):
    with Context(VARIABLE=1):
      _NEW2 = ContextVar("_NEW2", 0)
    _NEW2 = ContextVar("_NEW2", 1)
    self.assertEqual(_NEW2.value, 0)

    code = """\
from tinygrad.helpers import Context, ContextVar
with Context(VARIABLE=1):
  _NEW3 = ContextVar("_NEW3", 0)"""
    exec(code, {}) # pylint:disable=exec-used
    # While _NEW3 was created in an outside scope it should still work the same as above.
    _NEW3 = ContextVar("_NEW3", 1)
    self.assertEqual(_NEW3.value, 0)

  def test_nested_context(self):
    with Context(VARIABLE=1):
      with Context(VARIABLE=2):
        with Context(VARIABLE=3):
          self.assertEqual(VARIABLE.value, 3)
        self.assertEqual(VARIABLE.value, 2)
      self.assertEqual(VARIABLE.value, 1)
    self.assertEqual(VARIABLE.value, 0)

  def test_decorator(self):
    @Context(VARIABLE=1, DEBUG=4)
    def test():
      self.assertEqual(VARIABLE.value, 1)

    self.assertEqual(VARIABLE.value, 0)
    test()
    self.assertEqual(VARIABLE.value, 0)

  def test_context_exit_reverts_updated_values(self):
    D = ContextVar("D", 1)
    D.value = 2
    with Context(D=3):
      ...
    assert D.value == 2, f"Expected D to be 2, but was {D.value}. Indicates that Context.__exit__ did not restore the correct value."

class TestMergeDicts(unittest.TestCase):
  def test_merge_dicts(self):
    a = {"a": 1, "b": 2}
    b = {"a": 1, "c": 3}
    c = {}
    d = {"a": 2, "b": 2}
    assert merge_dicts([a, b]) == {"a": 1, "b": 2, "c": 3}
    assert merge_dicts([a, c]) == a
    assert merge_dicts([a, b, c]) == {"a": 1, "b": 2, "c": 3}
    with self.assertRaises(AssertionError):
      merge_dicts([a, d])

class TestDtypes(unittest.TestCase):
  def test_dtypes_fields(self):
    fields = dtypes.fields()
    self.assertTrue(all(isinstance(value, DType) for value in fields.values()))
    self.assertTrue(all(issubclass(value.np, np.generic) for value in fields.values() if value.np is not None))

class TestStripParens(unittest.TestCase):
  def test_simple(self): self.assertEqual("1+2", strip_parens("(1+2)"))
  def test_nested(self): self.assertEqual("1+(2+3)", strip_parens("(1+(2+3))"))
  def test_casted_no_strip(self): self.assertEqual("(int)(1+2)", strip_parens("(int)(1+2)"))

class TestProd(unittest.TestCase):
  def test_empty(self): self.assertEqual(1, prod(tuple()))
  def test_ints(self): self.assertEqual(30, prod((2, 3, 5)))
  def test_variable(self): self.assertEqual("(a*12)", prod((Variable("a", 1, 5), 3, 4)).render())
  def test_variable_order(self): self.assertEqual("(a*12)", prod((3, 4, Variable("a", 1, 5))).render())
  def test_num_nodes(self): self.assertEqual(NumNode(6), prod((NumNode(2), NumNode(3))))

if __name__ == '__main__':
  unittest.main()
663
tinygrad_repo/test/unit/test_shapetracker.py
Normal file
@@ -0,0 +1,663 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.helpers import prod, DEBUG
from tinygrad.shape.shapetracker import ShapeTracker, View, get_contraction
from tinygrad.shape.symbolic import Variable
from itertools import product

def shapetracker_getitem(st, val):
  locals = {"idx": val, "valid": 1}
  idx, valid = st.expr_node()
  exec(f"valid={valid.render()};idx={idx.render()}", None, locals)
  return locals["idx"] if locals["valid"] else -1

class CheckingShapeTracker:
  def __init__(self, shape):
    self.st = ShapeTracker.from_shape(shape)
    self.t = np.arange(prod(shape), dtype=np.int32).reshape(shape)

  @property
  def shape(self):
    return self.t.shape

  def simplify(self):
    self.st = self.st.simplify()
    return self

  def reshape(self, new_shape):
    self.st = self.st.reshape(new_shape)
    self.t = self.t.reshape(new_shape)
    return self

  def permute(self, axis):
    self.st = self.st.permute(axis)
    self.t = np.transpose(self.t, axis)
    return self

  def expand(self, new_shape):
    self.st = self.st.expand(new_shape)
    self.t = np.broadcast_to(self.t, new_shape)
    return self

  def flip(self, axis):
    self.st = self.st.stride(tuple(-1 if i in axis else 1 for i in range(len(self.shape))))
    self.t = np.flip(self.t, axis)
    return self

  def shrink(self, arg):
    self.st = self.st.shrink(arg)
    self.t = self.t[tuple([slice(x[0], x[1]) for x in arg])]
    return self

  def pad(self, arg):
    self.st = self.st.pad(arg)
    self.t = np.pad(self.t, arg, constant_values=-1)
    return self

  def stride(self, arg):
    self.st = self.st.stride(arg)
    self.t = self.t[tuple([slice(None, None, x) for x in arg])]
    return self

  def __getitem__(self, val):
    return self.t.flatten()[val]

  @property
  def views(self): return self.st.views

  @property
  def contiguous(self): return self.st.contiguous

  def assert_same(self):
    x = [shapetracker_getitem(self.st, i) for i in range(prod(self.st.shape))]
    y = [self[i] for i in range(prod(self.shape))]
    idx, valid = self.st.expr_node()
    if DEBUG >= 1: print(x, y, self.st.shape, self.shape, idx.render(), valid.render(), self.st)
    assert self.st.shape == self.shape
    assert x == y, f"mismatch shapetracker:{x} real:{y}"
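
# NOTE: CheckingShapeTracker is the workhorse of this file: every movement op is applied
# both to the real ShapeTracker and to a numpy mirror, and assert_same checks that
# evaluating the tracker's index expression reproduces numpy's element order, with -1
# standing in for masked-out (invalid) positions.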
|
||||
|
||||

class TestRealIssues(unittest.TestCase):
  def test_reshape_doesnt_multiview(self):
    self.st = ShapeTracker((View.create((256, 256, 2, 2, 2, 2, 2, 256, 8, 2), (0, 8, 0, 4, 0, 0, 2, 16384, 2048, 1), 0, None),))
    self.st.reshape((128, 2, 256, 2, 2, 2, 2, 2, 256, 8, 2))
    assert len(self.st.views) == 1

class TestRealDoesntSimplify(unittest.TestCase):
  def tearDown(self):
    st = self.st.real_strides()
    print(st)
    self.st = self.st.simplify()
    assert len(self.st.views) != 1
    assert None in st

  def test_1(self):
    self.st = ShapeTracker((
      View.create((8, 3, 1, 2, 11, 1), (33, 11, 0, 0, 1, 0), 0, None),
      View.create((8, 6, 11), (66, 11, 1), 0, None)))
    assert self.st.real_strides() == (33, None, 1)

  def test_2(self):
    self.st = ShapeTracker((
      View.create((2, 2, 4, 3, 3), (72, 9, 18, -3, -1), 8, None),
      View.create((4, 4, 3, 3), (36, 9, 3, 1), 0, None)))
    assert self.st.real_strides() == (None, 18, -3, -1)

class TestRealStrides(unittest.TestCase):
  def test_1(self):
    self.st = ShapeTracker((
      View.create((2048,), (1,), 0, ((0, 512),)),
      View.create((16, 32, 4), (128, 4, 1), 0, None)))
    st = self.st.real_strides()
    print(self.st, st)
    assert st == (None, 4, 1)

class TestRealSimplifies(unittest.TestCase):
  def tearDown(self):
    st = self.st.real_strides()
    self.st = self.st.simplify()
    assert len(self.st.views) == 1
    print(self.st.views[-1].strides, st)
    assert self.st.views[-1].strides == st

  def test_1(self):
    self.st = ShapeTracker((
      View.create((1, 3, 2, 11, 4, 28), (0, 308, 0, 28, 0, 1), 0, None),
      View.create((1, 3, 2, 11, 26, 1, 1, 3), (0, 2464, 0, 112, 1, 0, 0, 29), 0, None)))

  def test_2(self):
    self.st = ShapeTracker((
      View.create((8, 3, 3, 11, 2, 28), (924, 308, 0, 28, 0, 1), 0, None),
      View.create((8, 1, 6, 10, 28, 3, 2, 1), (5544, 0, 0, 56, 1, 1848, 672, 0), 0, None)))

class TestIndexExpressions2d(unittest.TestCase):
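  # each test below appends, per tracked ShapeTracker, a reference lambda for the
  # flat-index expression (node_exprs) and one for the multi-index expression
  # (idxs_exprs); tearDown then checks them against expr_node()/expr_idxs()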
  def setUp(self):
    shapes = [(30, 5), (15, 10), (15, 1), (5, 10), (5, 1)] # Make sure dim0 is a multiple of 5, one of the tests divides this dimension by 5
    offsets = [0, 1, 15, 28, 10000]
    self.sts = [ShapeTracker((View.create(base_shape, offset=offset),)) for base_shape in shapes for offset in offsets]
    self.offset = [Variable.num(offset) for base_shape in shapes for offset in offsets]
    self.shapes = [shape for shape in shapes for offset in offsets]
    self.node_exprs = []
    self.idxs_exprs = []

  def tearDown(self):
    for st, offset, shape, node_expr, idxs_expr in zip(self.sts, self.offset, self.shapes, self.node_exprs, self.idxs_exprs):
      numel = prod(shape)
      assert node_expr(self.default_idx(st.shape)) == st.expr_node()[0]
      assert node_expr(self.default_idx(st.shape)) == st.expr_node(None)[0]
      assert node_expr(self.default_idx(st.shape)) == st.expr_node('idx')[0]
      self.check_bounds(node_expr(self.default_idx(st.shape)), offset, numel)
      for idx in [(0, numel-1), (7, 203), (2, 5), (0, 0), (numel, numel), (0, numel), (0, numel+1), (numel+100, numel+100)]:
        idx = Variable("idx", idx[0], idx[1])
        assert node_expr(idx) == st.expr_node(idx)[0]
        self.check_bounds(node_expr(idx), offset, numel)

      assert idxs_expr(self.default_idxs(st.shape)) == st.expr_idxs()[0]
      assert idxs_expr(self.default_idxs(st.shape)) == st.expr_idxs(None)[0]
      self.check_bounds(idxs_expr(self.default_idxs(st.shape)), offset, numel)
      idx0s = [(0,0), (0, min(1, st.shape[0]-1)), (0, st.shape[0]-1), (min(3, st.shape[0]-1), min(6, st.shape[0]-1)), (st.shape[0]-1, st.shape[0]-1)]
      idx1s = [(0,0), (0, min(1, st.shape[1]-1)), (0, st.shape[1]-1), (min(3, st.shape[1]-1), min(6, st.shape[1]-1)), (st.shape[1]-1, st.shape[1]-1)]
      idx2s = [(0,0), (0, min(1, st.shape[2]-1)), (0, st.shape[2]-1), (min(3, st.shape[2]-1), min(6, st.shape[2]-1)), (st.shape[2]-1, st.shape[2]-1)] if len(st.shape) == 3 else [None for _ in idx0s]
      for idx0, idx1, idx2 in product(idx0s, idx1s, idx2s):
        idxs = [Variable(f"idx{i}", idx[0], idx[1]) for i, idx in enumerate((idx0, idx1, idx2)) if idx is not None]
        assert idxs_expr(idxs) == st.expr_idxs(idxs)[0]
        self.check_bounds(idxs_expr(idxs), offset, numel)

  def default_idx(self, shape):
    return Variable("idx", 0, prod(shape)-1)

  def default_idxs(self, shape):
    return [Variable(f"idx{i}", 0, d-1) for i,d in enumerate(shape)]

  def check_bounds(self, expr, offset, numel):
    assert expr.min >= offset
    assert expr.max <= offset + numel - 1

  def test_noop(self):
    for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
      self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape) + offset)
      self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[1] + offset)

  def test_permute(self):
    new_st = []
    for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
      st = st.permute((1, 0))
      self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
      self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0] + idxs[1]*base_shape[1] + offset)
      new_st.append(st)
    self.sts = new_st

  def test_reshape(self):
    new_st = []
    for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
      st = st.reshape((base_shape[0], 1, base_shape[1]))
      self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape) + offset)
      self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[2] + offset)
      new_st.append(st)
    self.sts = new_st

  def test_reshape_expand(self):
    new_st = []
    for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
      st = st.reshape((base_shape[0], 1, base_shape[1]))
      st = st.expand((base_shape[0], base_shape[1], base_shape[1]))
      self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx//(base_shape[1]*base_shape[1])%base_shape[0]*base_shape[1] + idx%base_shape[1] + offset)
      self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: idxs[0]*base_shape[1] + idxs[2] + offset)
      new_st.append(st)
    self.sts = new_st

  def test_permute_reshape_1(self): # This tests multiple views
    new_st = []
    for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
      st = st.permute((1, 0))
      st = st.reshape((base_shape[0]//5, 1, base_shape[1]*5))
      self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape)%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
      self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: (idxs[0]*(base_shape[1]*5)+idxs[2])%base_shape[0]*base_shape[1] + (idxs[0]*(base_shape[1]*5)+idxs[2])//base_shape[0] + offset)
      new_st.append(st)
    self.sts = new_st

  def test_permute_reshape_2(self):
    new_st = []
    for st, base_shape, offset in zip(self.sts, self.shapes, self.offset):
      st = st.permute((1, 0))
      st = st.reshape((1, base_shape[0]//5, base_shape[1]*5))
      self.node_exprs.append(lambda idx, base_shape=base_shape, offset=offset: idx%prod(base_shape)%base_shape[0]*base_shape[1] + idx//base_shape[0]%base_shape[1] + offset)
      self.idxs_exprs.append(lambda idxs, base_shape=base_shape, offset=offset: (idxs[1]*(base_shape[1]*5)+idxs[2])%base_shape[0]*base_shape[1] + (idxs[1]*(base_shape[1]*5)+idxs[2])//base_shape[0] + offset)
      new_st.append(st)
    self.sts = new_st

class TestSimplifyingShapeTracker(unittest.TestCase):
  def setUp(self):
    self.st = CheckingShapeTracker((1, 10))

  def tearDown(self):
    self.st.assert_same()

  # multiview simplify
  def test_expand_contract_simple(self):
    self.st = self.st.expand((10, 10))
    self.st = self.st.reshape((100,))
    print(self.st.views)
    assert(len(self.st.views) == 2)
    self.st = self.st.reshape((10, 10))
    print(self.st.views)

    self.st = self.st.simplify()
    print(self.st.views)
    assert(len(self.st.views) == 1)

  # multiview simplify
  def test_expand_contract_different_shape(self):
    self.st.expand((10, 10))
    self.st.reshape((100,))
    print(self.st.views)
    assert(len(self.st.views) == 2)
    self.st.reshape((2, 5, 2, 5))
    print(self.st.views)

    self.st = self.st.simplify()
    print(self.st.views)
    assert(len(self.st.views) == 1)

  # multiview simplify
  def test_expand_contract_still_complex(self):
    self.st.expand((10, 10))
    self.st.reshape((100,))
    print(self.st.views)
    assert(len(self.st.views) == 2)
    self.st.reshape((5, 20))

    self.st = self.st.simplify()
    print(self.st.views)
    assert(len(self.st.views) == 2)

# Tensor.zeros(2, 4).permute(1,0).reshape(2, 4)
# (d1*4 + d0%4), d1=x//4, d0=x%4 = ((x//4)*4) + (x%4)%4

class TestComplexShapeTracker(unittest.TestCase):
  def test_add_1s(self):
    self.st = CheckingShapeTracker((4, 4))
    self.st.permute((1,0))
    self.st.reshape((1,4,1,4,1))
    assert not self.st.contiguous
    self.st.permute((0,3,2,1,4))
    assert self.st.contiguous

  def test_permute_1s_simple(self):
    self.st = CheckingShapeTracker((1, 16, 9,9))
    self.st.permute((1,0,2,3))
    assert self.st.contiguous
    self.st = CheckingShapeTracker((2, 16, 9,9))
    self.st.permute((1,0,2,3))
    assert not self.st.contiguous

  def test_remove_1s_simple(self):
    self.st = CheckingShapeTracker((1, 16, 1, 1))
    self.st.reshape((16,))
    assert self.st.contiguous

  def test_remove_1s(self):
    self.st = CheckingShapeTracker((1, 4, 1, 4, 1))
    self.st.permute((0,3,2,1,4))
    self.st.reshape((4,4))
    assert not self.st.contiguous
    self.st.permute((1,0))
    assert self.st.contiguous

  def test_permute_reshape(self):
    self.st = CheckingShapeTracker((4, 4))
    self.st.permute((1,0))
    self.st.reshape((2, 2, 2, 2))
    # TODO: should also be tested by test_super_complex
    assert len(self.st.views) == 1

  def test_factorize_split(self):
    self.st = CheckingShapeTracker((4, 4))
    self.st.permute((1,0))
    self.st.reshape((2, 2, 2, 2))
    self.st.permute((2,3,0,1))
    assert self.st.contiguous

  def test_factorize_combine(self):
    self.st = CheckingShapeTracker((4, 4, 4))
    self.st.permute((2, 0, 1))
    self.st.reshape((4, 16))
    self.st.permute((1, 0))
    assert self.st.contiguous

  def test_factorize_combine_add_ones(self):
    self.st = CheckingShapeTracker((4, 4, 4))
    self.st.permute((2, 0, 1))
    self.st.reshape((4, 16, 1, 1))
    self.st.permute((1, 0, 2, 3))
    assert self.st.contiguous

  def test_fancy_factorize(self):
    self.st = CheckingShapeTracker((32, 3, 3, 1))
    self.st.reshape((8, 4, 3, 3))
    assert len(self.st.views) == 1

  def test_super_complex_2_fail(self):
    self.st = CheckingShapeTracker((4, 4, 4))
    self.st.permute((2, 0, 1))
    self.st.reshape((16, 4))
    assert len(self.st.views) != 1

  def test_work(self):
    self.st = CheckingShapeTracker((64, 1024, 4))
    self.st.reshape((1, 64, 128, 32))
    self.st.permute((0, 3, 1, 2))
    self.st.reshape((1, 32, 1, 64, 128))
    self.st.permute((0, 3, 4, 1, 2))
    assert self.st.contiguous

  def test_work2(self):
    self.st = CheckingShapeTracker((64, 1024, 4))
    self.st.reshape((1, 64, 128, 32))
    self.st.permute((0, 3, 1, 2))
    self.st.reshape((1, 1, 32, 64, 128))
    self.st.permute((0, 3, 4, 1, 2))
    self.st.reshape((64, 1024, 4))
    print(self.st.views)
    assert self.st.contiguous

class TestSingleShapeTracker(unittest.TestCase):
  def setUp(self):
    self.st = CheckingShapeTracker((7,4))

  def tearDown(self):
    self.st.assert_same()

  def test_reshape(self):
    self.st.reshape((7,1,4))
    assert self.st.contiguous

  def test_permute(self):
    self.st.permute((1,0))
    assert not self.st.contiguous

  def test_shrink(self):
    self.st.shrink(((1,2), (0,4)))
    assert not self.st.contiguous

  def test_double_permute(self):
    self.st.permute((1,0))
    self.st.permute((1,0))
    assert self.st.contiguous

  def test_reshape_permute(self):
    self.st.reshape((7,1,4))
    self.st.permute((0,1,2))
    assert self.st.contiguous

  def test_reshape_permute_yes(self):
    self.st.reshape((7,1,4))
    self.st.permute((0,2,1))
    assert self.st.contiguous

  def test_reshape_permute_no(self):
    self.st.reshape((4,7))
    self.st.permute((1,0))
    assert not self.st.contiguous

class TestShapeTrackerFuzzFailures(unittest.TestCase):
  def setUp(self):
    self.st = CheckingShapeTracker((3,3,3))
  def tearDown(self):
    self.st.assert_same()
  @unittest.skip("simplify doesn't work in this case")
  def test_case_1(self):
    self.st.shrink(((1, 2), (1, 3), (1, 3)))
    self.st.reshape((1, 4))
    self.st.shrink(((0, 1), (1, 3)))
    print(self.st.st)
    self.st = self.st.simplify()
    print(self.st.st)
  def test_case_2(self):
    self.st.stride( (1, 1, -2) )
    self.st.reshape( (3, 6) )
    self.st.shrink( ((1, 2), (1, 5)) )
    self.st.stride( (1, -1) )
  def test_case_3(self):
    self.st.shrink( ((0, 2), (0, 2), (0, 1)) )
    self.st.permute( (1, 0, 2) )
    self.st.reshape( (4,) )
    self.st.shrink( ((0, 3),) )
    self.st.stride( (-1,) )
  def test_case_4(self):
    self.st.reshape( (3, 3, 3, 1) )
    self.st.pad( ((0, 0), (0, 0), (0, 0), (1, 1)) )
    self.st.shrink( ((0, 2), (1, 2), (0, 2), (0, 1)) )
    self.st.expand( (2, 1, 2, 3) )

class TestMaskedShapeTracker(unittest.TestCase):
  def test_pad_1x1(self):
    self.st = CheckingShapeTracker((1,1))
    self.st.pad(((1,1), (1,1)))
    self.st.assert_same()

  def test_pad_2x2(self):
    self.st = CheckingShapeTracker((2,2))
    self.st.pad(((1,1), (1,1)))
    self.st.assert_same()

class TestShapeTracker(unittest.TestCase):
  def setUp(self):
    self.st = CheckingShapeTracker((7,4))
    self.apply = lambda fxn: [fxn(x) for x in [self.st]]

  def tearDown(self):
    self.st.assert_same()

  def test_noop(self):
    pass

  def test_simple_split(self):
    self.test_permute()
    self.apply(lambda x: x.reshape((prod(self.st.shape), )))

  def test_simple_pad(self):
    self.st.pad(((1,1), (1,1)))

  def test_pad_shrink(self):
    self.st.pad(((1,1), (1,1)))
    self.st.shrink(((0,4), (0,4)))

  def test_pad_one_sided(self):
    self.st.pad(((0,1), (0,0)))

  def test_pad_reshape(self):
    self.st.pad(((0,1), (0,0)))
    self.st.reshape((8*4,))

  def test_pad_pad(self):
    self.st.pad(((1,1), (1,1)))
    self.st.pad(((1,1), (1,1)))

  def test_pad_permute(self):
    self.st.pad(((1,1), (2,2)))
    self.st.permute((1,0))

  def test_pad_expand(self):
    self.st.reshape((7,4,1))
    self.st.pad(((1,1), (1,1), (0,0)))
    self.st.expand((9,6,4))

  def test_pad_expand_alt(self):
    self.st.pad(((1,1), (1,1)))
    self.st.reshape((9,6,1))
    self.st.expand((9,6,4))

  def test_pad_stride(self):
    self.st.pad(((1,4), (1,3)))
    self.st.stride((2,2))

  def test_pad_stride_neg(self):
    self.st.pad(((1,2), (1,0)))
    self.st.stride((-1,-1))

  def test_pad_stride_both(self):
    self.st.pad(((1,2), (1,0)))
    self.st.stride((-2,-2))

  def test_shrink_pad(self):
    self.st.shrink(((0,4), (0,4)))
    self.st.pad(((1,1), (1,1)))

  def test_reshape(self):
    new_shape = self.st.shape[::-1]
    self.apply(lambda x: x.reshape(new_shape))

  def test_permute(self):
    if len(self.st.shape) == 2: self.apply(lambda x: x.permute((1,0)))
    elif len(self.st.shape) == 3: self.apply(lambda x: x.permute((2,0,1)))

  def test_reshape_with_1(self):
    new_shape = (self.st.shape[0], 1, self.st.shape[1])
    self.apply(lambda x: x.reshape(new_shape))

  def test_expand(self):
    self.test_reshape_with_1()
    new_shape = list(self.st.shape)
    new_shape[1] = 2
    self.apply(lambda x: x.expand(tuple(new_shape)))

  def test_flip_0(self):
    self.apply(lambda x: x.flip((0,)))

  def test_flip_1(self):
    self.apply(lambda x: x.flip((1,)))

  def test_flip_01(self):
    self.apply(lambda x: x.flip((0,1)))

  def test_slice_0(self):
    self.apply(lambda x: x.shrink(((1, x.shape[0]), (0, x.shape[1]))))

  def test_slice_1(self):
    self.apply(lambda x: x.shrink(((0, x.shape[0]), (1, x.shape[1]))))

  def test_slice_1c1(self):
    self.apply(lambda x: x.shrink(((0, 1), (0, 1))))

  def test_slice_1c2(self):
    self.apply(lambda x: x.shrink(((1, 2), (1, 2))))

  def test_double_permute(self):
    self.apply(lambda x: x.permute((1, 0)))
    self.apply(lambda x: x.permute((1, 0)))

  def test_slice_permute(self):
    self.apply(lambda x: x.shrink(((0, 2), (2, 4))))
    self.apply(lambda x: x.permute((1, 0)))

  def test_slice_expand(self):
    self.apply(lambda x: x.shrink(((0, 2), (3, 4))))
    self.apply(lambda x: x.expand((2, 10)))

  def test_double_stride(self):
    self.apply(lambda x: x.stride((1, 2)))
    self.apply(lambda x: x.stride((2, 1)))

  def test_stride(self): self.apply(lambda x: x.stride((2,1)))
  def test_stride_int(self): self.apply(lambda x: x.stride((1,2)))
  def test_stride_2(self): self.apply(lambda x: x.stride((2,2)))
  def test_stride_n(self): self.apply(lambda x: x.stride((-2,1)))
  def test_stride_int_n(self): self.apply(lambda x: x.stride((-1,2)))
  def test_stride_2_n(self): self.apply(lambda x: x.stride((-2,-2)))

  def test_reshape_then_permute(self):
    self.test_reshape()
    self.test_permute()

  def test_reshape_then_expand(self):
    self.test_reshape()
    self.test_expand()

  def test_permute_then_reshape(self):
    self.test_permute()
    self.test_reshape()

  def test_expand_then_reshape(self):
    self.test_expand()
    self.test_reshape()

  def test_combo(self):
    self.test_permute()
    self.test_reshape()
    self.test_slice_1()
    self.test_expand()
    self.test_permute()
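
# judging from the cases below, get_contraction(old_shape, new_shape) returns, for
# each axis of new_shape, the list of old_shape axes it absorbs (None if impossible):
#   get_contraction((1,2,3,4), (2,3,4)) -> [[0, 1], [2], [3]]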
class TestGetContraction(unittest.TestCase):
  def test_contraction(self):
    r = get_contraction((1,2,3,4), (2,3,4))
    self.assertEqual(r, [[0, 1], [2], [3]])

    r = get_contraction((2,1,3,4), (2,3,4))
    self.assertEqual(r, [[0], [1, 2], [3]])

    r = get_contraction((1,2,3,1,4), (1,2,3,4))
    self.assertEqual(r, [[0], [1], [2], [3, 4]])

    r = get_contraction((1,2,3,1,4,1,1), (2,3,4))
    self.assertEqual(r, [[0, 1], [2], [3, 4, 5, 6]])

    r = get_contraction((1,2,3,4), (1,2,3*4))
    self.assertEqual(r, [[0], [1], [2, 3]])

    r = get_contraction((1,2,3,4), (2,1,3,4))
    self.assertEqual(r, [[0, 1], [], [2], [3]])

    r = get_contraction((1,2,3,4), (1,1,2*3*4,1))
    self.assertEqual(r, [[0], [], [1,2,3], []])

    r = get_contraction((2,1,3,4), (1,2,3,4))
    self.assertEqual(r, [[], [0], [1, 2], [3]])

    r = get_contraction((1,2,3,4), (2*3*4,1,1,1))
    self.assertEqual(r, [[0, 1, 2, 3], [], [], []])

    r = get_contraction((4,4,4,4), (16,1,16))
    self.assertEqual(r, [[0, 1], [], [2, 3]])

    r = get_contraction((1,2,3,4,1,1,1), (2,3,4))
    self.assertEqual(r, [[0, 1], [2], [3, 4, 5, 6]])

    r = get_contraction((1,2,3,4), (1,2,3,4,1))
    self.assertEqual(r, [[0], [1], [2], [3], []])

    r = get_contraction((14,1,384,14,1,1,1,1), (1,14,384,14))
    self.assertEqual(r, [[], [0], [1,2], [3,4,5,6,7]])

    r = get_contraction((14,1,384,1,14,1,1,1,1), (1,14,384,14))
    self.assertEqual(r, [[], [0], [1,2], [3,4,5,6,7,8]])

    r = get_contraction((512, 512), (1, 1, 512, 1, 1, 1, 1, 512))
    self.assertEqual(r, [[], [], [0], [], [], [], [], [1]])

    r = get_contraction((1,2,3,4), (1,2,6,2))
    self.assertEqual(r, None)

  def test_contraction_ones(self):
    r = get_contraction((1,), (1,1,1))
    self.assertEqual(r, [[0], [], []])

    r = get_contraction((1,1), (1,1,1))
    self.assertEqual(r, [[0], [1], []])

    r = get_contraction((1,1,1,1), (1,))
    self.assertEqual(r, [[0,1,2,3]])

    r = get_contraction((1,1,1,1), (1,1))
    self.assertEqual(r, [[0], [1,2,3]])

    r = get_contraction((1,1,1,1), (1,1,1))
    self.assertEqual(r, [[0], [1], [2,3]])

    r = get_contraction((1,1,1,1), (1,1,1,1))
    self.assertEqual(r, [[0], [1], [2], [3]])

if __name__ == '__main__':
  unittest.main()
39
tinygrad_repo/test/unit/test_shm_tensor.py
Normal file
39
tinygrad_repo/test/unit/test_shm_tensor.py
Normal file
@@ -0,0 +1,39 @@
import unittest
import multiprocessing.shared_memory as shared_memory
from tinygrad.helpers import CI
from tinygrad.runtime.ops_shm import RawShmBuffer
from tinygrad.tensor import Tensor, Device
import numpy as np

class TestRawShmBuffer(unittest.TestCase):
  def test_e2e(self):
    t = Tensor.randn(2, 2, 2).realize()

    # copy to shm
    shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name
    s.close()
    t_shm = t.to(f"shm:{shm_name}").realize()

    # copy from shm
    t2 = t_shm.to(Device.DEFAULT).realize()

    assert np.allclose(t.numpy(), t2.numpy())
    s.unlink()
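
  # note: the test above round-trips a tensor through POSIX shared memory using the
  # "shm:<name>" device string; test_e2e_big below repeats the same flow at a size
  # the CI machines can't comfortably map, hence the skip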
  @unittest.skipIf(CI, "CI doesn't like big shared memory")
  def test_e2e_big(self):
    t = Tensor.randn(2048, 2048, 8).realize()

    # copy to shm
    shm_name = (s := shared_memory.SharedMemory(create=True, size=t.nbytes())).name
    s.close()
    t_shm = t.to(f"shm:{shm_name}").realize()

    # copy from shm
    t2 = t_shm.to(Device.DEFAULT).realize()

    assert np.allclose(t.numpy(), t2.numpy())
    s.unlink()

if __name__ == "__main__":
  unittest.main()
448
tinygrad_repo/test/unit/test_symbolic.py
Normal file
448
tinygrad_repo/test/unit/test_symbolic.py
Normal file
@@ -0,0 +1,448 @@
#!/usr/bin/env python
import unittest
from tinygrad.shape.symbolic import Node, MulNode, SumNode, Variable, NumNode, LtNode, sym_render, sym_infer, create_rednode

class TestSymbolic(unittest.TestCase):
  def helper_test_variable(self, v, n, m, s):
    self.assertEqual(v.render(), s)
    self.assertEqual(v.min, n)
    self.assertEqual(v.max, m)
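
  # helper_test_variable checks a node three ways at once: the rendered string and
  # the derived min/max bounds, e.g. (taken from test_lt below)
  #   self.helper_test_variable(Variable("a", 3, 8) < 8, 0, 1, "(a<8)")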
  def test_ge(self):
    self.helper_test_variable(Variable("a", 3, 8)>=77, 0, 0, "0")
    self.helper_test_variable(Variable("a", 3, 8)>=9, 0, 0, "0")
    self.helper_test_variable(Variable("a", 3, 8)>=8, 0, 1, "((a*-1)<-7)")
    self.helper_test_variable(Variable("a", 3, 8)>=4, 0, 1, "((a*-1)<-3)")
    self.helper_test_variable(Variable("a", 3, 8)>=3, 1, 1, "1")
    self.helper_test_variable(Variable("a", 3, 8)>=2, 1, 1, "1")

  def test_lt(self):
    self.helper_test_variable(Variable("a", 3, 8)<77, 1, 1, "1")
    self.helper_test_variable(Variable("a", 3, 8)<9, 1, 1, "1")
    self.helper_test_variable(Variable("a", 3, 8)<8, 0, 1, "(a<8)")
    self.helper_test_variable(Variable("a", 3, 8)<4, 0, 1, "(a<4)")
    self.helper_test_variable(Variable("a", 3, 8)<3, 0, 0, "0")
    self.helper_test_variable(Variable("a", 3, 8)<2, 0, 0, "0")

  def test_ge_divides(self):
    expr = (Variable("idx", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512
    self.helper_test_variable(expr, 0, 1, "(idx<128)")

  def test_ge_divides_and(self):
    expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512,
                          (Variable("idx2", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512])
    self.helper_test_variable(expr, 0, 1, "((idx1<128) and (idx2<128))")
    expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 3)) < 512,
                          (Variable("idx2", 0, 511)*4 + Variable("FLOAT8_INDEX", 0, 7)) < 512])
    self.helper_test_variable(expr//4, 0, 1, "((((FLOAT8_INDEX//4)+idx2)<128) and ((idx1//4)<32))")

  def test_lt_factors(self):
    expr = Variable.ands([(Variable("idx1", 0, 511)*4 + Variable("FLOAT4_INDEX", 0, 256)) < 512])
    self.helper_test_variable(expr, 0, 1, "(((idx1*4)+FLOAT4_INDEX)<512)")

  def test_div_becomes_num(self):
    assert isinstance(Variable("a", 2, 3)//2, NumNode)

  def test_var_becomes_num(self):
    assert isinstance(Variable("a", 2, 2), NumNode)

  def test_equality(self):
    idx1 = Variable("idx1", 0, 3)
    idx2 = Variable("idx2", 0, 3)
    assert idx1 == idx1
    assert idx1 != idx2
    assert idx1*4 == idx1*4
    assert idx1*4 != idx1*3
    assert idx1*4 != idx1+4
    assert idx1*4 != idx2*4
    assert idx1+idx2 == idx1+idx2
    assert idx1+idx2 == idx2+idx1
    assert idx1+idx2 != idx2

  def test_factorize(self):
    a = Variable("a", 0, 8)
    self.helper_test_variable(a*2+a*3, 0, 8*5, "(a*5)")

  def test_factorize_no_mul(self):
    a = Variable("a", 0, 8)
    self.helper_test_variable(a+a*3, 0, 8*4, "(a*4)")

  def test_neg(self):
    self.helper_test_variable(-Variable("a", 0, 8), -8, 0, "(a*-1)")

  def test_add_1(self):
    self.helper_test_variable(Variable("a", 0, 8)+1, 1, 9, "(1+a)")

  def test_add_num_1(self):
    self.helper_test_variable(Variable("a", 0, 8)+Variable.num(1), 1, 9, "(1+a)")

  def test_sub_1(self):
    self.helper_test_variable(Variable("a", 0, 8)-1, -1, 7, "(-1+a)")

  def test_sub_num_1(self):
    self.helper_test_variable(Variable("a", 0, 8)-Variable.num(1), -1, 7, "(-1+a)")

  def test_mul_0(self):
    self.helper_test_variable(Variable("a", 0, 8)*0, 0, 0, "0")

  def test_mul_1(self):
    self.helper_test_variable(Variable("a", 0, 8)*1, 0, 8, "a")

  def test_mul_neg_1(self):
    self.helper_test_variable((Variable("a", 0, 2)*-1)//3, -1, 0, "((((a*-1)+3)//3)+-1)")

  def test_mul_2(self):
    self.helper_test_variable(Variable("a", 0, 8)*2, 0, 16, "(a*2)")

  def test_div_1(self):
    self.helper_test_variable(Variable("a", 0, 8)//1, 0, 8, "a")

  def test_mod_1(self):
    self.helper_test_variable(Variable("a", 0, 8)%1, 0, 0, "0")

  def test_add_min_max(self):
    self.helper_test_variable(Variable("a", 0, 8) * 2 + 12, 12, 16+12, "((a*2)+12)")

  def test_div_min_max(self):
    self.helper_test_variable(Variable("a", 0, 7) // 2, 0, 3, "(a//2)")

  def test_div_neg_min_max(self):
    self.helper_test_variable(Variable("a", 0, 7) // -2, -3, 0, "((a//2)*-1)")

  def test_sum_div_min_max(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7), Variable("b", 0, 3)]) // 2, 0, 5, "((a+b)//2)")

  def test_sum_div_factor(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable("b", 0, 3)*4]) // 2, 0, 20, "((a*2)+(b*2))")

  def test_sum_div_some_factor(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*5, Variable("b", 0, 3)*4]) // 2, 0, 23, "(((a*5)//2)+(b*2))")

  def test_sum_div_some_partial_factor(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*6, Variable("b", 0, 7)*6]) // 16, 0, 5, "(((a*3)+(b*3))//8)")
    self.helper_test_variable(Variable.sum([Variable.num(16), Variable("a", 0, 7)*6, Variable("b", 0, 7)*6]) // 16, 1, 6, "((((a*3)+(b*3))//8)+1)")

  def test_sum_div_no_factor(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*5, Variable("b", 0, 3)*5]) // 2, 0, 25, "(((a*5)+(b*5))//2)")

  def test_mod_factor(self):
    # NOTE: even though the mod max is 50, it can't know this without knowing about the mul
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*100, Variable("b", 0, 3)*50]) % 100, 0, 99, "((b*50)%100)")

  def test_mod_to_sub(self):
    # This is mod reduction
    self.helper_test_variable((1+Variable("a",1,2))%2, 0, 1, (Variable("a",1,2)-1).render())

  def test_sum_div_const(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable.num(3)]) // 4, 0, 7, "a")

  def test_sum_div_const_big(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7)*4, Variable.num(3)]) // 16, 0, 1, "(a//4)")

  def test_sum_lt_fold(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7) * 4, Variable("b", 0, 3)]) < 16, 0, 1, "(a<4)")
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7) * 4, Variable("b", 0, 4)]) < 16, 0, 1, "(((a*4)+b)<16)")

  def test_mod_mul(self):
    self.helper_test_variable((Variable("a", 0, 5)*10)%9, 0, 5, "a")

  def test_mod_mod(self):
    self.helper_test_variable((Variable("a", 0, 31)%12)%4, 0, 3, "(a%4)")
    self.helper_test_variable(((4*Variable("a", 0, 31)) % 12) % 4, 0, 0, "0")
    self.helper_test_variable((Variable("a", 0, 31) % 4) % 12, 0, 3, "(a%4)")

  def test_mul_mul(self):
    self.helper_test_variable((Variable("a", 0, 5)*10)*9, 0, 5*10*9, "(a*90)")

  def test_mul_lt(self):
    self.helper_test_variable((Variable("a", 0, 5)*4)<13, 0, 1, "(a<4)")
    self.helper_test_variable((Variable("a", 0, 5)*4)<16, 0, 1, "(a<4)")
    self.helper_test_variable((Variable("a", 0, 5)*4)>11, 0, 1, "((a*-1)<-2)")
    self.helper_test_variable((Variable("a", 0, 5)*4)>12, 0, 1, "((a*-1)<-3)")

  def test_div_div(self):
    self.helper_test_variable((Variable("a", 0, 1800)//10)//9, 0, 20, "(a//90)")

  def test_distribute_mul(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 3), Variable("b", 0, 5)])*3, 0, 24, "((a*3)+(b*3))")

  def test_mod_mul_sum(self):
    self.helper_test_variable(Variable.sum([Variable("b", 0, 2), Variable("a", 0, 5)*10])%9, 0, 7, "(a+b)")

  def test_sum_0(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 7)]), 0, 7, "a")

  def test_mod_remove(self):
    self.helper_test_variable(Variable("a", 0, 6)%100, 0, 6, "a")

  def test_big_mod(self):
    # NOTE: we no longer support negative variables
    #self.helper_test_variable(Variable("a", -20, 20)%10, -9, 9, "(a%10)")
    #self.helper_test_variable(Variable("a", -20, 0)%10, -9, 0, "(a%10)")
    #self.helper_test_variable(Variable("a", -20, 1)%10, -9, 1, "(a%10)")
    self.helper_test_variable(Variable("a", 0, 20)%10, 0, 9, "(a%10)")
    #self.helper_test_variable(Variable("a", -1, 20)%10, -1, 9, "(a%10)")

  def test_gt_remove(self):
    self.helper_test_variable(Variable("a", 0, 6) >= 25, 0, 0, "0")

  def test_lt_remove(self):
    self.helper_test_variable(Variable("a", 0, 6) < -3, 0, 0, "0")
    self.helper_test_variable(Variable("a", 0, 6) < 3, 0, 1, "(a<3)")
    self.helper_test_variable(Variable("a", 0, 6) < 8, 1, 1, "1")

  def test_lt_sum_remove(self):
    self.helper_test_variable((Variable("a", 0, 6) + 2) < 3, 0, 1, "(a<1)")

  def test_and_fold(self):
    self.helper_test_variable(Variable.ands([Variable.num(0), Variable("a", 0, 1)]), 0, 0, "0")

  def test_and_remove(self):
    self.helper_test_variable(Variable.ands([Variable.num(1), Variable("a", 0, 1)]), 0, 1, "a")

  def test_mod_factor_negative(self):
    self.helper_test_variable(Variable.sum([Variable.num(-29), Variable("a", 0, 10), Variable("b", 0, 10)*28]) % 28, 0, 27, "((27+a)%28)")
    self.helper_test_variable(Variable.sum([Variable.num(-29), Variable("a", 0, 100), Variable("b", 0, 10)*28]) % 28, 0, 27, "((27+a)%28)")

  def test_sum_combine_num(self):
    self.helper_test_variable(Variable.sum([Variable.num(29), Variable("a", 0, 10), Variable.num(-23)]), 6, 16, "(6+a)")

  def test_sum_num_hoisted_and_factors_cancel_out(self):
    self.helper_test_variable(Variable.sum([Variable("a", 0, 1) * -4 + 1, Variable("a", 0, 1) * 4]), 1, 1, "1")

  def test_div_factor(self):
    self.helper_test_variable(Variable.sum([Variable.num(-40), Variable("a", 0, 10)*2, Variable("b", 0, 10)*40]) // 40, -1, 9, "(-1+b)")

  def test_mul_div(self):
    self.helper_test_variable((Variable("a", 0, 10)*4)//4, 0, 10, "a")

  def test_mul_div_factor_mul(self):
    self.helper_test_variable((Variable("a", 0, 10)*8)//4, 0, 20, "(a*2)")

  def test_mul_div_factor_div(self):
    self.helper_test_variable((Variable("a", 0, 10)*4)//8, 0, 5, "(a//2)")

  def test_div_remove(self):
    self.helper_test_variable(Variable.sum([Variable("idx0", 0, 127)*4, Variable("idx2", 0, 3)])//4, 0, 127, "idx0")

  def test_div_numerator_negative(self):
    self.helper_test_variable((Variable("idx", 0, 9)*-10)//11, -9, 0, "((((idx*-10)+99)//11)+-9)")

  def test_div_into_mod(self):
    self.helper_test_variable((Variable("idx", 0, 16)*4)%8//4, 0, 1, "(idx%2)")
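
# the fuzz helper below brute-forces every value in a Variable's range and checks
# the symbolic (min, max) against the true extrema; the bounds are allowed to be
# loose, so it only asserts min <= true minimum and max >= true maximum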
class TestSymbolicNumeric(unittest.TestCase):
  def helper_test_numeric(self, f):
    # TODO: why are the negative tests broken? (even if we did support negative variables)
    #MIN, MAX = -10, 10
    MIN, MAX = 0, 10
    # one number
    for i in range(MIN, MAX):
      v = f(Variable.num(i))
      #print(i, f(i), v.min, v.max)
      self.assertEqual(v.min, v.max)
      self.assertEqual(v.min, f(i))
    for kmin in range(MIN, MAX):
      for kmax in range(MIN, MAX):
        if kmin > kmax: continue
        v = f(Variable("tmp", kmin, kmax))
        values = [f(rv) for rv in range(kmin, kmax+1)]
        # the min and max may not be exact
        self.assertLessEqual(v.min, min(values))
        self.assertGreaterEqual(v.max, max(values))

  def test_mod_4(self): self.helper_test_numeric(lambda x: (x%4))
  def test_div_4(self): self.helper_test_numeric(lambda x: (x//4))
  def test_plus_1_div_2(self): self.helper_test_numeric(lambda x: (x+1)//2)
  def test_plus_1_mod_2(self): self.helper_test_numeric(lambda x: (x+1)%2)
  def test_times_2(self): self.helper_test_numeric(lambda x: x*2)
  def test_times_2_plus_3(self): self.helper_test_numeric(lambda x: x*2 + 3)
  def test_times_2_plus_3_mod_4(self): self.helper_test_numeric(lambda x: (x*2 + 3)%4)
  def test_times_2_plus_3_div_4(self): self.helper_test_numeric(lambda x: (x*2 + 3)//4)
  def test_times_2_plus_3_div_4_mod_4(self): self.helper_test_numeric(lambda x: ((x*2 + 3)//4)%4)

class TestSymbolicVars(unittest.TestCase):
  def test_simple(self):
    z = NumNode(0)
    a = Variable("a", 0, 10)
    b = Variable("b", 0, 10)
    c = Variable("c", 0, 10)
    assert z.vars() == z.vars() == []
    assert a.vars() == a.vars() == [a]
    m = MulNode(a, 3)
    assert m.vars() == [a]
    s = SumNode([a, b, c])
    assert s.vars() == [a, b, c]

  def test_compound(self):
    a = Variable("a", 0, 10)
    b = Variable("b", 0, 10)
    c = Variable("c", 0, 10)
    assert (a + b * c).vars() == [a, b, c]
    assert (a % 3 + b // 5).vars() == [a, b]
    assert (a + b + c - a).vars() == [b, c]

class TestSymbolicMinMax(unittest.TestCase):
  def test_min_max_known(self):
    a = Variable("a", 1, 8)
    assert max(1, a) == max(a, 1) == a
    assert min(1, a) == min(a, 1) == 1

class TestSymRender(unittest.TestCase):
  def test_sym_render(self):
    a = Variable("a", 1, 8)
    b = Variable("b", 1, 10)
    assert sym_render(a) == "a"
    assert sym_render(1) == "1"
    assert sym_render(a+1) == "(1+a)"
    assert sym_render(a*b) == "(a*b)"

class TestSymInfer(unittest.TestCase):
  def test_sym_infer(self):
    a = Variable("a", 0, 10)
    b = Variable("b", 0, 10)
    c = Variable("c", 0, 10)
    var_vals = {a: 2, b: 3, c: 4}
    assert sym_infer(5, var_vals) == 5
    assert sym_infer(a, var_vals) == 2
    assert sym_infer(b, var_vals) == 3
    assert sym_infer(a+b, var_vals) == 5
    assert sym_infer(a-b, var_vals) == -1
    assert sym_infer(a+b+c, var_vals) == 9
    assert sym_infer(a*b, var_vals) == 6
    assert sym_infer(a*b+c, var_vals) == 10

class TestSymbolicSymbolicOps(unittest.TestCase):
  def test_node_divmod_node(self):
    i = Variable("i", 1, 10)
    idx0 = Variable("idx0", 0, i*3-1)
    assert NumNode(0) // (Variable("i", 1, 10)*128) == 0
    assert NumNode(0) % (Variable("i", 1, 10)*128) == 0
    assert NumNode(127) // (Variable("i", 1, 10)*128) == 0
    assert NumNode(127) % (Variable("i", 1, 10)*128) == 127
    assert 127 // (Variable("i", 1, 10)*128) == 0
    assert 127 % (Variable("i", 1, 10)*128) == 127
    assert NumNode(128) // (Variable("i", 1, 10)*128 + 128) == 0
    assert NumNode(128) % (Variable("i", 1, 10)*128 + 128) == 128
    assert 128 // (Variable("i", 1, 10)*128 + 128) == 0
    assert 128 % (Variable("i", 1, 10)*128 + 128) == 128
    assert 0 // (Variable("i", 1, 10)*128) == 0
    assert 0 % (Variable("i", 1, 10)*128) == 0
    assert idx0 // (i*3) == 0
    assert idx0 % (i*3) == idx0
    assert i // i == 1
    assert i % i == 0
    assert 128 // NumNode(4) == 32
    assert 128 % NumNode(4) == 0
    assert NumNode(128) // NumNode(4) == 32
    assert NumNode(128) % NumNode(4) == 0

  def test_mulnode_divmod_node(self):
    i = Variable("i", 1, 10)
    idx0 = Variable("idx0", 0, 31)
    assert (idx0*(i*4+4)) // (i+1) == (idx0*4)
    assert (idx0*(i*4+4)) % (i+1) == 0
    assert (idx0*i) % i == 0

  def test_sumnode_divmod_sumnode(self):
    i = Variable("i", 1, 10)
    idx0 = Variable("idx0", 0, 7)
    idx1 = Variable("idx1", 0, 3)
    idx2 = Variable("idx2", 0, i)
    assert (idx0*(i*4+4)+idx1*(i+1)+idx2) // (i+1) == idx0*4+idx1
    assert (idx0*(i*4+4)+idx1*(i+1)+idx2) % (i+1) == idx2
    assert (i+1) // (i*128+128) == 0
    assert (i+1) % (i*128+128) == (i+1)
    assert (i+1+idx2) // (i+1) == 1
    assert (i+1+idx2) % (i+1) == idx2
    assert (idx0*(i*4+4)+i+1+idx2) // (i+1) == idx0*4+1
    assert (idx0*(i*4+4)+i+1+idx2) % (i+1) == idx2
    assert (i*128+128)*2 // (i*128+128) == 2
    assert (i*128+128)*2 % (i*128+128) == 0

  def test_sumnode_divmod_sumnode_complex(self):
    i = Variable("i", 1, 1024)
    gidx0 = Variable("gidx0", 0, i)
    lidx1 = Variable("lidx1", 0, 7)
    ridx2 = Variable("ridx1", 0, 31)
    assert ((i*128+128)*2 + gidx0*128 + lidx1*(i*512+512) + ridx2*4) // (i*128+128) == 2 + lidx1*4
    assert ((i*128+128)*2 + gidx0*128 + lidx1*(i*512+512) + ridx2*4) % (i*128+128) == gidx0*128 + ridx2*4
    assert ((gidx0*128+i*128+ridx2*4+129)) // (i*128+128) == 1
    assert ((gidx0*128+i*128+ridx2*4+129)) % (i*128+128) == gidx0*128 + ridx2*4 + 1
    assert (ridx2*(i*4+4)+1+i+gidx0) // (i*128+128) == 0
    assert (ridx2*(i*4+4)+1+i+gidx0) % (i*128+128) == (ridx2*(i*4+4)+1+i+gidx0)

  def test_node_lt_node(self):
    a = Variable("a", 1, 5)
    b = Variable("b", 6, 9)
    c = Variable("c", 1, 10)
    d = Variable("d", 5, 10)
    # if the value is always the same, it folds to num
    assert (a < b) == 1
    assert (b < a) == 0
    assert (d < a) == 0
    # if it remains as a LtNode, bool is always true and (min, max) == (0, 1)
    assert isinstance((a < c), LtNode) and (a < c).min == 0 and (a < c).max == 1
    assert a < c
    assert isinstance((a > c), LtNode) and (a > c).min == 0 and (a > c).max == 1
    # same when comparing with a constant
    assert a < 3 and (a < 3).min == 0 and (a < 3).max == 1
    assert a > 3 and (a > 3).min == 0 and (a > 3).max == 1

  def test_num_node_mul_node(self):
    a = Variable("a", 1, 5)
    b = NumNode(2) * a
    assert b == a * 2
    assert isinstance(b, MulNode)
    b = NumNode(1) * a
    assert b == a
    assert isinstance(b, Variable)
    b = NumNode(0) * a
    assert b == 0
    assert isinstance(b, NumNode)

  def test_num_node_expand(self):
    a = NumNode(42)
    assert a.expand() == [a]

  def test_variable_expand(self):
    a = Variable("a", 5, 7)
    assert a.expand() == [a]

  def test_variable_expand_expr_none(self):
    a = Variable(None, 5, 7)
    assert a.expand() == [NumNode(5), NumNode(6), NumNode(7)]

  def test_mul_node_expand(self):
    a = Variable(None, 5, 7)
    m = MulNode(a, 3)
    assert m.expand() == [NumNode(15), NumNode(18), NumNode(21)]

    b = Variable("b", 1, 3)
    n = MulNode(b, 3)
    assert n.expand() == [Variable("b", 1, 3)*3]

  def test_sum_node_expand(self):
    a = Variable(None, 1, 3)
    b = Variable("b", 5, 7)

    s1 = create_rednode(SumNode, [a, b])
    assert s1.expand() == [Variable.sum([NumNode(i),b]) for i in range(1,4)]

  def test_multi_expand(self):
    a = Variable("a", 1, 3)
    b = Variable("b", 14, 17)
    s1 = create_rednode(SumNode, [a, b])
    # expand increments earlier variables faster than later variables (as specified in the argument)
    # this behavior was just copied from before, no idea why this should be true
    assert s1.expand((a, b)) == [NumNode(x + y) for x in range(b.min, b.max + 1) for y in range(a.min, a.max + 1)]

  def test_substitute(self):
    a = Variable(None, 1, 3)
    b = a + 1
    c = b.substitute({a: NumNode(1)})
    assert c == NumNode(2)


if __name__ == '__main__':
  unittest.main()