Add openpilot tests
This commit is contained in:
62
tinygrad_repo/test/external/dist/test_collectives.py
vendored
Normal file
62
tinygrad_repo/test/external/dist/test_collectives.py
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
from extra import dist
|
||||
from tinygrad.jit import TinyJit
|
||||
if __name__ == "__main__":
|
||||
dist.preinit()
|
||||
|
||||
from extra.dist import collectives
|
||||
from tinygrad.helpers import CI, getenv
|
||||
from tinygrad.tensor import Tensor
|
||||
import numpy as np
|
||||
|
||||
@TinyJit
def allreduce_jit(t:Tensor, cache_id=None) -> Tensor:
  """Jitted wrapper: run collectives.allreduce on `t` and realize the result."""
  reduced = collectives.allreduce(t, cache_id=cache_id)
  return reduced.realize()
|
||||
|
||||
SIZE = 2048 if not CI else 2
|
||||
SIZE_2 = 255 if not CI else 3
|
||||
|
||||
def run():
  """Per-rank body: check the jitted allreduce on a square tensor, then on a cube of side SIZE_2.

  Rank 0 feeds ones and every other rank feeds zeros; each rank asserts that
  the tensor it gets back is all ones.
  """
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")
  is_root = rank == 0

  # loop 3 times to make sure it works with the jit
  square = (SIZE, SIZE)
  for _ in range(3):
    # create a tensor to send
    src = Tensor.ones(*square) if is_root else Tensor.zeros(*square)
    out = allreduce_jit(src.contiguous().realize(), cache_id="test")
    assert np.allclose(np.ones(square), out.numpy()), f"{out.numpy()} wasn't ones"

  # reset jit
  allreduce_jit.cnt = 0
  allreduce_jit.input_replace = {}

  # test uneven chunk sizes
  cube = (SIZE_2, SIZE_2, SIZE_2)
  for _ in range(3):
    # create a tensor to send
    src = Tensor.ones(*cube) if is_root else Tensor.zeros(*cube)
    out = allreduce_jit(src.contiguous().realize(), cache_id="test2")
    assert np.allclose(np.ones(cube), out.numpy()), f"{out.numpy()} wasn't ones"

  print(f"rank {rank} passed")
|
||||
|
||||
if __name__ == "__main__":
  # pick one device string per worker process
  if getenv("HIP"):
    from tinygrad.runtime.ops_hip import HIP
    devices = [f"hip:{i}" for i in range(HIP.device_count)]
  else:
    from tinygrad.runtime.ops_gpu import CL
    if CI:
      devices = ["gpu:0", "gpu:0"]
    else:
      devices = [f"gpu:{i}" for i in range(len(CL.devices))]
  world_size = len(devices)

  dist.init_oob(world_size)

  # one spawned process per (rank, device) pair, then wait for all of them
  processes = [dist.spawn(rank, device, fn=run, args=()) for rank, device in enumerate(devices)]
  for p in processes:
    p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0:
      exit(p.exitcode)
|
||||
68
tinygrad_repo/test/external/dist/test_world.py
vendored
Normal file
68
tinygrad_repo/test/external/dist/test_world.py
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
from extra import dist
|
||||
from tinygrad.jit import TinyJit
|
||||
if __name__ == "__main__":
|
||||
dist.preinit()
|
||||
|
||||
from extra.dist import world
|
||||
from tinygrad.helpers import CI, getenv
|
||||
from tinygrad.tensor import Tensor
|
||||
import numpy as np
|
||||
|
||||
@TinyJit
def send_jit(t, target_rank, cache_id=None) -> Tensor:
  """Jitted wrapper: world.send `t` to `target_rank` and realize the result."""
  sent = world.send(t, target_rank, cache_id=cache_id)
  return sent.realize()
|
||||
|
||||
@TinyJit
def recv_jit(t, target_rank, cache_id=None) -> Tensor:
  """Jitted wrapper: world.recv into `t` from `target_rank` and realize the result."""
  received = world.recv(t, target_rank, cache_id=cache_id)
  return received.realize()
|
||||
|
||||
SIZE = 2048 if not CI else 2
|
||||
|
||||
def run():
  """Per-rank body: bounce a random tensor rank 0 -> rank 1 -> rank 0 and compare on rank 0."""
  # set a deterministic seed so that both ranks generate the same random tensor
  Tensor.manual_seed(42)

  rank = getenv("RANK")

  # loop 3 times to make sure it works with the jit
  for _ in range(3):
    # create a tensor to send
    t = Tensor.randn(SIZE, SIZE)

    if rank == 0:
      # send to rank 1, then receive the echo back from rank 1
      send_jit(t, 1, cache_id="test")
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 1, cache_id="test2")
      # check that the received tensor is the same as the sent tensor
      assert np.allclose(t.numpy(), t2.numpy()), f"{t2.numpy()} wasn't equal to {t.numpy()}"
    elif rank == 1:
      # receive from rank 0, then send the received buffer straight back
      t2 = Tensor.empty(SIZE, SIZE)
      recv_jit(t2, 0, cache_id="test")
      send_jit(t2, 0, cache_id="test2")

  print(f"rank {rank} passed")
|
||||
|
||||
if __name__ == "__main__":
  # this test always uses exactly two workers
  if getenv("HIP"):
    devices = ["hip:0", "hip:1"]
  else:
    second = "gpu:0" if CI else "gpu:1"
    devices = ["gpu:0", second]
  world_size = len(devices)

  dist.init_oob(world_size)

  # one spawned process per (rank, device) pair, then wait for all of them
  processes = [dist.spawn(rank, device, fn=run, args=()) for rank, device in enumerate(devices)]
  for p in processes:
    p.join()

  # exit with error code if any of the processes failed
  for p in processes:
    if p.exitcode != 0:
      exit(p.exitcode)
|
||||
27
tinygrad_repo/test/external/external_copy_benchmark.py
vendored
Normal file
27
tinygrad_repo/test/external/external_copy_benchmark.py
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
import unittest
|
||||
from tinygrad.helpers import prod
|
||||
from tinygrad.ops import Device
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.helpers import GlobalCounters
|
||||
from tinygrad.jit import CacheCollector
|
||||
|
||||
class TestCopy(unittest.TestCase):
  """Benchmark of an in-place `t + 1` over growing tensor sizes, plotted with matplotlib."""
  def test_add1(self):
    step = 16384
    pts = []
    for i in range(step, step*256, step):
      t = Tensor.randn(i).realize()
      # capture the function that the assign schedules
      CacheCollector.start()
      t.assign(t+1).realize()
      fxn, args, _ = CacheCollector.finish()[0]
      GlobalCounters.reset()
      # best of 10 runs of the captured function
      ct = min(fxn(args, force_wait=True) for _ in range(10))
      mb = prod(t.shape)*t.dtype.itemsize*2*1e-6
      rate = mb/ct
      print(f"{mb*1e3:.2f} kB, {ct*1e3:.2f} ms, {rate:.2f} MB/s")
      pts.append((mb, rate))
    from matplotlib import pyplot as plt
    plt.plot([size for size, _ in pts], [speed for _, speed in pts])
    plt.show()
|
||||
|
||||
# run the benchmark test case when executed as a script
if __name__ == '__main__': unittest.main()
|
||||
102
tinygrad_repo/test/external/external_llama_eval.py
vendored
Normal file
102
tinygrad_repo/test/external/external_llama_eval.py
vendored
Normal file
@@ -0,0 +1,102 @@
|
||||
from lm_eval.base import BaseLM
|
||||
from lm_eval import evaluator, tasks
|
||||
import torch, json, argparse
|
||||
|
||||
from examples.llama import LLaMa
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.ops import Device
|
||||
|
||||
class LLaMaAdaptor(BaseLM):
  """lm_eval `BaseLM` adaptor that forwards harness calls to a tinygrad `LLaMa` instance."""
  def __init__(
    self,
    model_size="7B",
    model_gen=1,
    device="",
    quantize=False,
    batch_size=1,
    max_batch_size=1,
    do_sample=False,
    temperature=1.0,
    checkpoint_path="",
    tokenizer_path="",
  ):
    super().__init__()

    if batch_size is None:
      batch_size = 1
    self.do_sample = do_sample
    self.temperature = temperature
    self._device = device

    assert isinstance(model_gen, int)
    assert isinstance(model_size, str)
    assert isinstance(batch_size, int)
    assert isinstance(checkpoint_path, str)
    assert isinstance(tokenizer_path, str)

    self.llama = LLaMa.build(checkpoint_path, tokenizer_path, model_gen, model_size, quantize)

  @classmethod
  def create_from_arg_string(cls, arg_string, additional_config=None):
    """Build an adaptor from a "key=value,key=value" string plus optional extra kwargs.

    Values are passed through as strings. An empty `arg_string` yields no
    kwargs, and `additional_config=None` (the default) is treated as empty.
    An element without "=" raises IndexError.
    """
    kwargs = {}
    for el in arg_string.split(","):
      if not el: continue  # "" or a stray comma contributes nothing
      parts = el.split("=", 1)  # split on the first "=" only so values may contain "="
      kwargs[parts[0]] = parts[1]
    extra = {} if additional_config is None else additional_config
    return cls(**kwargs, **extra)

  @property
  def eot_token_id(self):
    # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
    return self.llama.tokenizer.eos_id()

  @property
  def max_length(self):
    return 1024

  @property
  def max_gen_toks(self):
    return 256

  @property
  def batch_size(self):
    return 1

  @property
  def device(self):
    return self._device

  def tok_encode(self, string: str):
    # the tokenizer's BOS id is prepended to the encoded tokens
    return [self.llama.tokenizer.bos_id()] + self.llama.tokenizer.encode(string)

  def tok_decode(self, tokens):
    return self.llama.tokenizer.decode(tokens)

  def _model_call(self, inps):
    # torch in, tinygrad forward, torch out; note this sets Tensor.no_grad on the class
    Tensor.no_grad = True
    return torch.Tensor(self.llama.model(Tensor(inps.numpy()), 0).numpy())

  def greedy_until(self, requests):
    continuations = []
    for request in requests:
      prompt, until = request[0], request[1]['until']
      output = self.llama.greedy_until(prompt, until, max_length=128, temperature=0.0)
      # keep only the text after the first len(prompt) characters of the output
      continuations.append(output[len(prompt):])
    return continuations

  def _model_generate(self, context, max_length, eos_token_id):
    raise NotImplementedError()
|
||||
|
||||
if __name__ == '__main__':
  print(f"using {Device.DEFAULT} backend")

  parser = argparse.ArgumentParser(description='Run LLaMA evals in tinygrad', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  # (flag, add_argument keyword arguments), registered in this order
  cli_options = (
    ('--size', dict(type=str, default="7B", help="Size of model to use [7B, 13B, 30B, 65B] for Gen 1, [7B, 13B] for Gen 2")),
    ('--gen', dict(type=int, default="1", help="Generation of the model to use [1, 2]")),
    ('--quantize', dict(action='store_true', help="Quantize the weights to int8 in memory")),
    ('--eval', dict(type=str, default="arc_easy", help="Run in evaluation mode")),
    ('--limit', dict(type=int, default=None, help="Limit tests in eval")),
    ('--weights', dict(type=str, default="./weights/LLaMa/", help="Location of the weights")),
    ('--tokenizer', dict(type=str, default="./weights/LLaMa/tokenizer.model", help="Location of the tokenizer")),
  )
  for flag, spec in cli_options:
    parser.add_argument(flag, **spec)
  args = parser.parse_args()

  # run eval and exit
  adaptor = LLaMaAdaptor(
    model_gen=args.gen,
    model_size=args.size,
    quantize=args.quantize,
    checkpoint_path=args.weights,
    tokenizer_path=args.tokenizer,
    device="cpu",
  )
  task_dict = tasks.get_task_dict(args.eval.split(","))
  results = evaluator.evaluate(adaptor, task_dict, False, 0, args.limit)
  print(json.dumps(results, indent=2))
|
||||
128
tinygrad_repo/test/external/external_model_benchmark.py
vendored
Normal file
128
tinygrad_repo/test/external/external_model_benchmark.py
vendored
Normal file
@@ -0,0 +1,128 @@
|
||||
import csv, pathlib, time, numpy as np
|
||||
from os import getenv
|
||||
import torch
|
||||
torch.set_num_threads(1)
|
||||
import onnx
|
||||
from onnx.helper import tensor_dtype_to_np_dtype
|
||||
import onnxruntime as ort
|
||||
from onnx2torch import convert
|
||||
from extra.utils import download_file
|
||||
from extra.onnx import get_run_onnx
|
||||
from tinygrad.helpers import OSX, DEBUG
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.ops import Device
|
||||
|
||||
MODELS = {
|
||||
"resnet50": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx",
|
||||
"openpilot": "https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx",
|
||||
"efficientnet": "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx",
|
||||
"shufflenet": "https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx",
|
||||
"commavq": "https://huggingface.co/commaai/commavq-gpt2m/resolve/main/gpt2m.onnx",
|
||||
|
||||
# broken in torch MPS
|
||||
#"zfnet": "https://github.com/onnx/models/raw/main/vision/classification/zfnet-512/model/zfnet512-9.onnx",
|
||||
# TypeError: BatchNormalization() got an unexpected keyword argument 'is_test'
|
||||
#"densenet": "https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-3.onnx",
|
||||
# AssertionError: only onnx version >= 10 supported for slice
|
||||
#"bert": "https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-8.onnx",
|
||||
# really slow
|
||||
#"resnet18": "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx",
|
||||
}
|
||||
|
||||
CSV = {}
|
||||
open_csv = None
|
||||
torch.manual_seed(1)
|
||||
|
||||
def benchmark(mnm, nm, fxn):
  """Call `fxn` three times, print and record the fastest run, return (fastest ns, last result).

  The fastest time in milliseconds is stored in the module-level CSV dict under `nm`.
  """
  tms = []
  for _ in range(3):
    st = time.perf_counter_ns()
    ret = fxn()
    tms.append(time.perf_counter_ns() - st)
  best_ns = min(tms)
  best_ms = best_ns*1e-6
  print(f"{mnm:15s} {nm:25s} {best_ms:7.2f} ms")
  CSV[nm] = best_ms
  return best_ns, ret
|
||||
|
||||
#BASE = pathlib.Path(__file__).parents[2] / "weights" / "onnx"
|
||||
BASE = pathlib.Path("/tmp/onnx")
|
||||
def benchmark_model(m, validate_outs=False):
  """Download ONNX model `m`, time it on tinygrad, torch and onnxruntime, and append a CSV row.

  With `validate_outs` set, tinygrad outputs are also compared against the
  onnxruntime CPU outputs. The CSV writer is created on the first call, with
  that call's keys as its field names.
  """
  global open_csv, CSV
  CSV = {"model": m}

  fn = BASE / MODELS[m].split("/")[-1]
  download_file(MODELS[m], fn)
  onnx_model = onnx.load(fn)
  output_names = [out.name for out in onnx_model.graph.output]
  # graph inputs that are initializers are not fed by us
  excluded = {inp.name for inp in onnx_model.graph.initializer}
  fed_inputs = [inp for inp in onnx_model.graph.input if inp.name not in excluded]
  # dims reported as 0 are replaced by 1
  input_shapes = {inp.name:tuple(x.dim_value if x.dim_value != 0 else 1 for x in inp.type.tensor_type.shape.dim) for inp in fed_inputs}
  input_types = {inp.name: tensor_dtype_to_np_dtype(inp.type.tensor_type.elem_type) for inp in fed_inputs}
  #input_types = {k:v if v!=np.float16 else np.float32 for k,v in input_types.items()} # cast
  np_inputs = {k:torch.randn(shp).numpy().astype(input_types[k]) for k,shp in input_shapes.items()}
  assert len(input_shapes) < 30, f"too many input shapes {len(input_shapes)}"

  # print input names
  if DEBUG >= 2:
    print([inp.name for inp in fed_inputs])

  for device in ["METAL" if OSX else "GPU", "CLANG"]: # + (["CUDA"] if torch.cuda.is_available() else []):
    Device.DEFAULT = device
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    benchmark(m, f"tinygrad_{device.lower()}_jitless", lambda: {k:v.numpy() for k,v in tinygrad_model(inputs).items()})

    from tinygrad.jit import TinyJit
    tinygrad_jitted_model = TinyJit(lambda **kwargs: {k:v.realize() for k,v in tinygrad_model(kwargs).items()})
    # three untimed calls before the timed ones
    for _ in range(3):
      {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()}
    benchmark(m, f"tinygrad_{device.lower()}_jit", lambda: {k:v.numpy() for k,v in tinygrad_jitted_model(**inputs).items()})
    del inputs, tinygrad_model, tinygrad_jitted_model

  try:
    torch_model = convert(onnx_model)
    torch_inputs = [torch.tensor(x) for x in np_inputs.values()]
    benchmark(m, "torch_cpu", lambda: torch_model(*torch_inputs))

    torch_device = "mps" if OSX else "cuda"
    torch_mps_model = torch_model.to(torch_device)
    torch_mps_inputs = [x.to(torch_device) for x in torch_inputs]
    benchmark(m, f"torch_{torch_device}", lambda: torch_mps_model(*torch_mps_inputs))
  except Exception as e:
    # any failure in the torch path is reported by exception type and skipped
    print(f"{m:16s}onnx2torch {type(e).__name__:>25}")

  # bench onnxruntime
  ort_options = ort.SessionOptions()
  ort_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
  ort_options.log_severity_level = 3 # no warnings
  for backend in ["CPU", "CUDA" if not OSX else "CoreML"]: # https://onnxruntime.ai/docs/execution-providers/
    provider = backend+"ExecutionProvider"
    if provider not in ort.get_available_providers():
      continue
    ort_sess = ort.InferenceSession(str(fn), ort_options, [provider])
    benchmark(m, f"onnxruntime_{backend.lower()}", lambda: ort_sess.run(output_names, np_inputs))
    del ort_sess

  if validate_outs:
    rtol, atol = 2e-3, 2e-3 # tolerance for fp16 models
    inputs = {k:Tensor(inp) for k,inp in np_inputs.items()}
    tinygrad_model = get_run_onnx(onnx_model)
    tinygrad_out = tinygrad_model(inputs)

    ort_sess = ort.InferenceSession(str(fn), ort_options, ["CPUExecutionProvider"])
    onnx_out = ort_sess.run(output_names, np_inputs)
    onnx_out = dict(zip(output_names, onnx_out))

    assert_allclose(tinygrad_out, onnx_out, rtol=rtol, atol=atol)
    print(f"{m:16s}outputs validated with rtol={rtol:.1e}, atol={atol:.1e}")

  # NOTE(review): field names come from the first row only, and the file handle is never closed explicitly
  if open_csv is None:
    open_csv = csv.DictWriter(open('onnx_inference_speed.csv', 'w', newline=''), fieldnames=list(CSV.keys()))
    open_csv.writeheader()
  open_csv.writerow(CSV)
|
||||
|
||||
def assert_allclose(tiny_out:dict, onnx_out:dict, rtol=1e-5, atol=1e-5):
  """Assert that two output dicts have the same keys and numerically close values.

  `tiny_out` values are either None or objects exposing `.numpy()`; `onnx_out`
  values are compared with np.testing.assert_allclose. A None on the tinygrad
  side must be matched by None on the ONNX side.

  Raises AssertionError on any key, None, or value mismatch.
  """
  assert tiny_out.keys() == onnx_out.keys(), f"output names differ: {list(tiny_out.keys())} vs {list(onnx_out.keys())}"
  for k in tiny_out.keys():
    tiny_v, onnx_v = tiny_out[k], onnx_out[k]
    if tiny_v is None:
      # identity check: `None == ndarray` compares elementwise and cannot be used in an assert
      assert onnx_v is None, f"For tensor '{k}': tinygrad returned None but onnx did not"
    else:
      np.testing.assert_allclose(tiny_v.numpy(), onnx_v, rtol=rtol, atol=atol, err_msg=f"For tensor '{k}' in {tiny_out.keys()}")
|
||||
|
||||
if __name__ == "__main__":
  # MODEL selects a single entry; otherwise every model in MODELS is benchmarked (always with validation)
  selected = getenv("MODEL", "")
  if selected == "":
    for m in MODELS:
      benchmark_model(m, True)
  else:
    benchmark_model(selected, True)
|
||||
70
tinygrad_repo/test/external/external_multi_gpu.py
vendored
Normal file
70
tinygrad_repo/test/external/external_multi_gpu.py
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
# cd disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
|
||||
# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.helpers import colored
|
||||
from tinygrad.helpers import Timing
|
||||
from tinygrad.runtime.ops_gpu import CL
|
||||
|
||||
# TODO: support multidevice in cuda
|
||||
device = 'gpu'
|
||||
|
||||
if __name__ == "__main__":
  sz = 1024*1024*256  # 1 GB
  #sz = 1024*64

  dev0, dev1 = f'{device}:0', f'{device}:1'
  # suffix printed by every single-buffer transfer/compute timing below
  rate = lambda x: f", {(sz*4)/x:.2f} GB/sec"

  with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
    c0 = Tensor.ones(sz, device="cpu").realize()
    c1 = (Tensor.ones(sz, device="cpu")/2).realize()

  with Timing("CPU -> 0: ", on_exit=rate):
    a0 = c0.to(dev0).realize()
    CL.synchronize()
  with Timing("CPU -> 1: ", on_exit=rate):
    b1 = c1.to(dev1).realize()
    CL.synchronize()

  # cross copy. this is going through the CPU
  with Timing("0 -> CPU -> 1: ", on_exit=rate):
    a1 = a0.to(dev1).realize()
    CL.synchronize()
  with Timing("1 -> CPU -> 0: ", on_exit=rate):
    b0 = b1.to(dev0).realize()
    CL.synchronize()

  # sum
  with Timing("0+0 -> 0 (sum): ", on_exit=rate):
    ab0 = (a0 + b0).realize()
    CL.synchronize()
  with Timing("1+1 -> 1 (sum): ", on_exit=rate):
    ab1 = (a1 + b1).realize()
    CL.synchronize()

  # cross device sum (does this work?)
  # is this making a copy first? is that copy through the CPU?
  # the slowness comes from the *blocking* clprg call, is this pyopencl?
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=rate):
    abx0 = (a0 + b1).realize()
    CL.synchronize()

  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=rate):
    abx1 = (b1 + a0).realize()
    CL.synchronize()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=rate):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=rate):
    cc1 = ab1.numpy()

  # same
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # devices
  for result in (ab0, ab1, abx0, abx1):
    print(result)
|
||||
41
tinygrad_repo/test/external/external_osx_profiling.py
vendored
Normal file
41
tinygrad_repo/test/external/external_osx_profiling.py
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
|
||||
from tinygrad.helpers import dtypes
|
||||
import time
|
||||
|
||||
# Launch the same OpenCL kernel before and after a 3 second sleep and compare
# host monotonic timestamps with the profiling timestamps on the returned events.
N = 1000000
a, b, c = (CLBuffer(N, dtypes.float32) for _ in range(3))

prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
  int idx = get_global_id(0);
  a[idx] = b[idx] + c[idx];
}""")
# one untimed launch first
prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)

# first timed launch
t1 = time.monotonic_ns()
e1 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t2 = time.monotonic_ns()

time.sleep(3)

# second timed launch
t3 = time.monotonic_ns()
e2 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
CL.synchronize()
t4 = time.monotonic_ns()

for stamp in ("queued", "submit", "start", "end"):
  print(getattr(e1.profile, stamp))

print(e1, e2)
# host-side duration next to the event's own start/end span
print(t2-t1, e1.profile.end - e1.profile.start)
print(t4-t3, e2.profile.end - e2.profile.start)
# gap between the two launches on both clocks
print(t3-t2, e2.profile.queued-e1.profile.end)
print((t3-t2) / (e2.profile.start-e1.profile.end), "ratio")

print("ratio since boot", t1/e1.profile.start)

for event in (e1, e2):
  print(event.profile.start)
  print(event.profile.end)
|
||||
125
tinygrad_repo/test/external/external_test_allocator_on_models.py
vendored
Normal file
125
tinygrad_repo/test/external/external_test_allocator_on_models.py
vendored
Normal file
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python
|
||||
import unittest, gc
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.nn.state import get_state_dict
|
||||
from tinygrad.helpers import GlobalCounters
|
||||
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
|
||||
from tinygrad.helpers import dtypes, prod
|
||||
from tinygrad.ops import Device
|
||||
from test.helpers import derandomize_model
|
||||
|
||||
from examples.llama import Transformer
|
||||
|
||||
ALLOCATED_DEV_BUFS = 0
|
||||
class FakeDeviceBuffer:
  """Stand-in device buffer that records its size/dtype/device and bumps ALLOCATED_DEV_BUFS."""
  def __init__(self, sz, dt, device):
    global ALLOCATED_DEV_BUFS
    # id starts at 1; FakeAllocator._do_free decrements it and asserts it reaches 0
    self.id = 1
    self.size, self.dtype, self.device = sz, dt, device
    ALLOCATED_DEV_BUFS += 1
|
||||
class FakeAllocator(LRUAllocator):
  """LRUAllocator whose allocations are FakeDeviceBuffer objects and whose frees are checked."""
  def _do_alloc(self, size, dtype, device, **kwargs):
    return FakeDeviceBuffer(size, dtype, device)

  def _do_free(self, buf):
    buf.id -= 1
    assert buf.id == 0, f"Free should be called once, but {buf.id}"

  def __del__(self): # Fake allocator should clear all buffers after each test.
    for cached in self.cached_buffers.values():
      for buf, _ in cached:
        self._free_buffer(buf)
|
||||
|
||||
FAKE_GLOBAL_ALLOCATOR = None
|
||||
class FakeBuffer(RawBuffer):
  """RawBuffer that allocates through the module-level FAKE_GLOBAL_ALLOCATOR."""
  def __init__(self, size, dtype, device='0'):
    global FAKE_GLOBAL_ALLOCATOR
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, device=device)
    backing = self._buf
    matches = backing.size == size and backing.dtype == dtype and backing.device == device
    assert matches, "This allocator requires 100% match of dtype and size."

  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs):
    # only shape and dtype of `x` are used; its contents are not copied
    return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)

  def toCPU(self):
    return np.empty(self.size, dtype=self.dtype.np)
|
||||
class FakeProgram:
  """No-op program: accepts the constructor and call arguments and does nothing."""
  def __init__(self, name:str, prg:str):
    pass

  def __call__(self, *bufs, global_size, local_size, wait=False):
    pass
|
||||
|
||||
def helper_test_correctness(gen, train):
  """Check that `train(*gen())` gives the same result with and without allocator caching.

  A reference result is computed under CLAllocator(0); four more runs under a
  very large CLAllocator must match it, and the cache must end up non-empty.
  The original CL.cl_allocator is restored even when a check fails, so a
  failure here does not leak the test allocator into later tests.
  """
  from tinygrad.runtime.ops_gpu import CL, CLAllocator
  old_alloc = CL.cl_allocator
  try:
    CL.cl_allocator = CLAllocator(0)
    no_alloc_result = train(*gen()).numpy()
    Device[Device.DEFAULT].synchronize()
    CL.cl_allocator = CLAllocator(512<<30) # Test cache correctness, so cache as much as possible, 512gb
    for _ in range(4):
      GlobalCounters.reset()
      np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
      Device[Device.DEFAULT].synchronize()
    assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
  finally:
    CL.cl_allocator = old_alloc
|
||||
|
||||
def __helper_test_alloc_count(gen, train):
  """Run `train(*gen())` twice and return how much ALLOCATED_DEV_BUFS grew."""
  baseline = ALLOCATED_DEV_BUFS
  for _ in range(2):
    train(*gen())
  grown_by = ALLOCATED_DEV_BUFS - baseline
  return grown_by
|
||||
|
||||
def helper_test_alloc_count(mm, gen, train):
  """Assert that a caching FakeAllocator performs fewer device allocations than a non-caching one.

  The default device's runtime and buffer class are swapped for FakeProgram
  and FakeBuffer while counting. They, and FAKE_GLOBAL_ALLOCATOR, are put
  back even when the assertion fails, so one failing test cannot leave the
  fakes installed for the next.
  """
  global FAKE_GLOBAL_ALLOCATOR
  backup_program = Device[Device.DEFAULT].runtime
  backup_buffer = Device[Device.DEFAULT].buffer
  try:
    Device[Device.DEFAULT].runtime = FakeProgram
    Device[Device.DEFAULT].buffer = FakeBuffer
    # count with a 16<<30 allocator first...
    Device[Device.DEFAULT].method_cache.clear()
    FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
    new_allocs = __helper_test_alloc_count(gen, train)
    # ...then again with a size-0 allocator
    Device[Device.DEFAULT].method_cache.clear()
    FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
    old_allocs = __helper_test_alloc_count(gen, train)
    print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
    assert new_allocs < old_allocs, "Hmm, doesn't cache work any more?"
  finally:
    Device[Device.DEFAULT].runtime = backup_program
    Device[Device.DEFAULT].buffer = backup_buffer
    FAKE_GLOBAL_ALLOCATOR = None
|
||||
|
||||
def check_gc():
  """On the GPU backend, force a collection and assert print_objects() reports zero."""
  if Device.DEFAULT != "GPU":
    return
  gc.collect() # Need to collect Tensors.
  from extra.introspection import print_objects
  assert print_objects() == 0
|
||||
|
||||
class TestAllocators(unittest.TestCase):
  """Allocator cache tests driven by small llama / stable diffusion models."""
  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama(self):
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16
    try:
      args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
      # the model lives only inside this nested function, so it is out of scope before check_gc() runs
      def run_model():
        model = Transformer(**args_tiny)
        derandomize_model(model)
        def test(t): return model(t, 0).realize()
        helper_test_correctness(lambda: (Tensor([[1,]]),), test)
      run_model()
    finally:
      # restore the global default dtype even if the correctness check fails
      Tensor.default_type = old_type
    check_gc()

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama_alloc_counts(self):
    args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def test_alloc_count(t):
      model = Transformer(**args_tiny)
      # replace every weight with an empty tensor of the same shape and dtype
      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
      return model(t, 0).realize()
    helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
    check_gc()

  @unittest.skip("huge for CI")
  def test_stable_diffusion(self):
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test)
|
||||
|
||||
# run the allocator test cases when executed as a script
if __name__ == "__main__": unittest.main()
|
||||
8
tinygrad_repo/test/external/external_test_embedding.py
vendored
Normal file
8
tinygrad_repo/test/external/external_test_embedding.py
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.nn import Embedding
|
||||
|
||||
if __name__ == "__main__":
  # build an embedding table and look up one small batch of token ids
  vocab_size, dim = 50257, 128
  test = Embedding(vocab_size, dim)
  token_ids = Tensor([[1,2,3]])
  ret = test(token_ids).numpy()
|
||||
208
tinygrad_repo/test/external/external_test_gpu_ast.py
vendored
Normal file
208
tinygrad_repo/test/external/external_test_gpu_ast.py
vendored
Normal file
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python
|
||||
import unittest
|
||||
import numpy as np
|
||||
from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps
|
||||
from tinygrad.shape.shapetracker import ShapeTracker, View, ZeroView
|
||||
from tinygrad.runtime.ops_gpu import GPUBuffer, CLProgram, CLCodegen
|
||||
#from tinygrad.runtime.ops_metal import MetalBuffer as GPUBuffer, MetalProgram as CLProgram, MetalCodegen as CLCodegen
|
||||
from tinygrad.helpers import getenv
|
||||
from extra.lib_test_ast import test_ast
|
||||
|
||||
import platform
|
||||
OSX = platform.system() == "Darwin"
|
||||
|
||||
def compile_and_test_ast(ast, local_size=None):
  """Generate and build a CL program for `ast`, run it five times, and optionally check it.

  `local_size`, when given, overrides the built program's local_size. The
  test_ast check only runs when the TEST environment variable is truthy.
  """
  k = CLCodegen(ast)
  prg = k.codegen().build(CLProgram)
  if local_size is not None:
    prg.local_size = local_size
  for _ in range(5):
    prg(prg.lower(k.bufs))
  if getenv("TEST", 0):
    test_ast(k)
|
||||
|
||||
class TestAST(unittest.TestCase):
|
||||
def test_conv_zeroview_ast(self):
  # RELU(buf0 * buf1) where buf0's tracker contains a ZeroView
  out_shape = (1, 1, 3, 4)
  st0 = ShapeTracker(shape=out_shape, views=[
    View((1, 1, 3, 4), (2, 2, 2, 1), -3),
    ZeroView((1, 1, 1, 2), ((0, 1), (0, 1), (-1, 2), (-1, 3))),
    View((1, 1, 3, 4), (0, 0, 4, 1), 0),
  ])
  buf0 = GPUBuffer(shape=st0)
  st1 = ShapeTracker(shape=out_shape, views=[View((1, 1, 3, 4), (0, 0, 0, 0), 0)])
  buf1 = GPUBuffer(shape=st1)
  op1 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  ast = LazyOp(UnaryOps.RELU, (op1,), None)
  compile_and_test_ast(ast)
|
||||
|
||||
def test_cifar_conv(self):
  # SUM(buf0 * buf1) reduced over the last three axes
  full = (512, 1, 128, 32, 32, 64, 3, 3)
  st0 = ShapeTracker(shape=full, views=[
    View((512, 64, 34, 34), (65536, 1024, 32, 1), -33),
    ZeroView((512, 64, 32, 32), ((0, 512), (0, 64), (-1, 33), (-1, 33))),
    View((512, 1, 128, 32, 32, 64, 3, 3), (73984, 73984, 0, 34, 1, 1156, 34, 1), 0),
  ])
  buf0 = GPUBuffer(shape=st0, hostbuf=GPUBuffer(shape=(512, 64, 32, 32), force_create=True))
  st1 = ShapeTracker(shape=full, views=[
    View((512, 1, 128, 32, 32, 64, 3, 3), (0, 0, 576, 0, 0, 9, 3, 1), 0),
  ])
  buf1 = GPUBuffer(shape=st1, hostbuf=GPUBuffer(shape=(128, 64, 3, 3), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  ast = LazyOp(ReduceOps.SUM, (op0,), (512, 1, 128, 32, 32, 1, 1, 1))
  compile_and_test_ast(ast)
|
||||
|
||||
def test_cifar_conv_backward(self):
  # RESHAPE(SUM(buf0 * buf1)) down to (256, 512, 3, 3)
  full = (256, 1, 512, 3, 3, 512, 8, 8)
  st0 = ShapeTracker(shape=full, views=[
    View((256, 512, 10, 10), (64, 16384, 8, 1), -9),
    ZeroView((256, 512, 8, 8), ((0, 256), (0, 512), (-1, 9), (-1, 9))),
    View((256, 1, 512, 3, 3, 512, 8, 8), (51200, 51200, 0, 10, 1, 100, 10, 1), 0),
  ])
  buf0 = GPUBuffer(shape=st0, hostbuf=GPUBuffer(shape=(512, 256, 8, 8), force_create=True))
  st1 = ShapeTracker(shape=full, views=[
    View((256, 1, 512, 3, 3, 512, 8, 8), (0, 0, 64, 0, 0, 32768, 8, 1), 0),
  ])
  buf1 = GPUBuffer(shape=st1, hostbuf=GPUBuffer(shape=(512, 512, 8, 8), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  op1 = LazyOp(ReduceOps.SUM, (op0,), (256, 1, 512, 3, 3, 1, 1, 1))
  ast = LazyOp(MovementOps.RESHAPE, (op1,), (256, 512, 3, 3))
  compile_and_test_ast(ast)
|
||||
|
||||
def test_first_op_conv(self):
  """Build a MUL -> SUM -> ADD AST followed by relu(x) - 1*relu(1 - exp(x)), reshaped to (64, 1024, 4)."""
  # first operand: View / ZeroView / View stack over a (128, 768, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 130, 258, 1, 12), (393216, 3072, 12, 12, 1), -3084), ZeroView((1, 128, 256, 1, 12), ((0, 1), (-1, 129), (-1, 257), (0, 1), (0, 12))), View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 6192, 24, 0, 0, 3096, 12, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 768, 4), force_create=True))
  # second operand: single strided view over an (8, 108, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 3, 3, 3, 4), views=[View((1, 64, 128, 8, 4, 3, 3, 3, 4), (0, 0, 0, 432, 4, 144, 16, 48, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 108, 4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last four axes (3, 3, 3, 4) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
  # 32-element host buffer added to the reduced result (presumably a per-channel bias -- confirm)
  buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
  op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
  op3 = LazyOp(UnaryOps.RELU, (op2,), None)
  # buf3 and buf4 are both backed by the constant 1.0
  buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
  buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
  # op4..op8 compute relu(op2) - buf3 * relu(buf4 - exp(op2)) (looks like ELU with alpha = 1 -- confirm)
  op4 = LazyOp(UnaryOps.EXP, (op2,), None)
  op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
  op6 = LazyOp(UnaryOps.RELU, (op5,), None)
  op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
  op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
  compile_and_test_ast(ast)
|
||||
|
||||
def test_second_op_conv(self):
  """Build a MUL -> SUM -> ADD AST followed by relu(x) - 1*relu(1 - exp(x)), reshaped to (64, 1024, 4)."""
  # first operand: View / ZeroView / View stack over a (64, 1024, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 66, 130, 32, 1), (262144, 4096, 32, 1, 1), -4128), ZeroView((1, 64, 128, 32, 1), ((0, 1), (-1, 65), (-1, 129), (0, 32), (0, 1))), View((1, 64, 128, 8, 4, 1, 1, 3, 3), (266240, 4160, 32, 4, 1, 12480, 12480, 4160, 32), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
  # second operand: single strided view over an (8, 9, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 3, 3), views=[View((1, 64, 128, 8, 4, 1, 1, 3, 3), (0, 0, 0, 36, 1, 0, 0, 12, 4), 0)]), hostbuf=GPUBuffer(shape=(8, 9, 4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last two axes (3, 3) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 8, 4, 1, 1, 1, 1))
  # 32-element host buffer added to the reduced result (presumably a per-channel bias -- confirm)
  buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(32,), force_create=True))
  op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
  op3 = LazyOp(UnaryOps.RELU, (op2,), None)
  # buf3 and buf4 are both backed by the constant 1.0
  buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
  buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 8, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 8, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
  # op4..op8 compute relu(op2) - buf3 * relu(buf4 - exp(op2)) (looks like ELU with alpha = 1 -- confirm)
  op4 = LazyOp(UnaryOps.EXP, (op2,), None)
  op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
  op6 = LazyOp(UnaryOps.RELU, (op5,), None)
  op7 = LazyOp(BinaryOps.MUL, (buf3,op6,), None)
  op8 = LazyOp(BinaryOps.SUB, (op3,op7,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
  compile_and_test_ast(ast)
|
||||
|
||||
def test_third_op_conv(self):
  """Build a MUL -> SUM -> ADD -> RESHAPE AST (no activation) and hand it to compile_and_test_ast."""
  # first operand: single strided view over a (64, 1024, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 4096, 32, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
  # second operand: single strided view over a (4, 32, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 0, 0, 128, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(4, 32, 4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last two axes (8, 4) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 4, 4, 1, 1, 1, 1))
  # 16-element host buffer added to the reduced result (presumably a per-channel bias -- confirm)
  buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 4, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(16,), force_create=True))
  op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op2,), (64, 512, 4))
  compile_and_test_ast(ast)
|
||||
|
||||
# VALIDHACKS=1 IMAGE=2 DEBUG=4 PYTHONPATH="." GPU=1 OPT=2 python3 test/external_test_gpu_ast.py TestAST.test_reduce_op
|
||||
# 164 time 27.75 ms running re_S128_4 with [128] None count 4 runtime 1016.06 us 2.07 GFLOPS () -> (128, 1)
|
||||
# 169 time 22.51 ms running matmul with [4, 16, 128] [4, 16, 16] count 5 runtime 110.08 us 19.06 GFLOPS ('-DMATMUL',) -> (128, 1)
|
||||
def test_reduce_op(self):
  """Build a MUL -> SUM -> ADD -> RELU -> RESHAPE AST and hand it to compile_and_test_ast."""
  # first operand: single strided view over a (1, 512, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
  # second operand: single strided view over a (128, 2048, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last two axes (512, 4) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 128, 4, 1, 1, 1, 1))
  # 512-element host buffer; note the non-zero strides (512) on the leading size-1 axes
  buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 128, 4, 1, 1, 1, 1), (512, 512, 512, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
  op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
  op3 = LazyOp(UnaryOps.RELU, (op2,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
  compile_and_test_ast(ast)
|
||||
|
||||
def test_alt_reduce_op(self):
  """Build an 11-D MUL -> SUM -> ADD -> MAX -> RESHAPE AST and hand it to compile_and_test_ast."""
  # first operand: single strided view over a (1, 512, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 512, 4), force_create=True))
  # second operand: single strided view over a (128, 2048, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 512, 4), (0, 0, 0, 0, 0, 8192, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(128, 2048, 4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last two axes (512, 4) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1))
  buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
  op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
  # one-element host buffer read with all-zero strides; its contents are not set here
  buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), views=[View((1, 1, 1, 1, 1, 128, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
  op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op3,), (1, 128, 4))
  compile_and_test_ast(ast)
|
||||
|
||||
# re_S32_16_36_6 is fast
|
||||
def test_1x1_36_6(self): # 36 <- 6
  """Build a MUL -> SUM -> ADD AST followed by max(x, b3) - max(1 - exp(x), b5), reshaped to (32, 2304, 4)."""
  # first operand: single strided view over a (32, 384, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 1536, 24, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
  # second operand: single strided view over a (36, 24, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 6, 4), (0, 0, 0, 0, 0, 96, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(36, 24, 4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last two axes (6, 4) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1))
  buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(144,), force_create=True))
  op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
  # buf3 and buf5 are one-element host buffers whose contents are not set here
  buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
  op3 = LazyOp(BinaryOps.MAX, (op2,buf3,), None)
  # buf4 is backed by the constant 1.0
  buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.], dtype=np.float32)))
  op4 = LazyOp(UnaryOps.EXP, (op2,), None)
  op5 = LazyOp(BinaryOps.SUB, (buf4,op4,), None)
  buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 36, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), force_create=True))
  op6 = LazyOp(BinaryOps.MAX, (op5,buf5,), None)
  op7 = LazyOp(BinaryOps.SUB, (op3,op6,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op7,), (32, 2304, 4))
  # second argument is None on OSX, (16, 16, 4) elsewhere; its meaning is defined by
  # compile_and_test_ast (presumably a local work size -- confirm)
  compile_and_test_ast(ast, None if OSX else (16, 16, 4))
|
||||
|
||||
# re_S32_16_6_36 is slow
|
||||
def test_1x1_6_36(self): # 6 <- 36
  """Build a MUL -> SUM -> ADD -> ADD -> RESHAPE AST and hand it to compile_and_test_ast with (6, 16, 4)."""
  # first operand: single strided view over a (32, 2304, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 9216, 144, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 2304, 4), force_create=True))
  # second operand: single strided view over a (6, 144, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 36, 4), (0, 0, 0, 0, 0, 576, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 144, 4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last two axes (36, 4) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
  buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
  op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
  # second addend has the full output size, (32, 384, 4) (presumably a residual input -- confirm)
  buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 1536, 24, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(32, 384, 4), force_create=True))
  op3 = LazyOp(BinaryOps.ADD, (op2,buf3,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op3,), (32, 384, 4))
  # meaning of (6, 16, 4) is defined by compile_and_test_ast (presumably a local work size -- confirm)
  compile_and_test_ast(ast, (6, 16, 4))
|
||||
|
||||
# re_S32_16_6_24
|
||||
def test_1x1_6_24(self):
  """Build a MUL -> SUM -> RESHAPE AST and hand it to compile_and_test_ast with (6, 4, 8)."""
  # first operand: single strided view over a (32, 1536, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 6144, 96, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 1536, 4), force_create=True))
  # second operand: single strided view over a (6, 96, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 24, 4), (0, 0, 0, 0, 0, 384, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(6, 96, 4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last two axes (24, 4) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1))
  # the ADD of a 24-element buffer is disabled below, so the reshape consumes op1 directly
  #buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), views=[View((1, 32, 64, 1, 1, 6, 4, 1, 1, 1, 1), (0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0), 0)]), hostbuf=GPUBuffer(shape=(24,), force_create=True))
  #op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op1,), (32, 384, 4))
  # meaning of (6, 4, 8) is defined by compile_and_test_ast (presumably a local work size -- confirm)
  compile_and_test_ast(ast, (6, 4, 8))
|
||||
|
||||
def test_full_reduce_op(self):
  """Build an AST that sums (a + b)**2 over all 512 elements, takes **0.5, bounds the result and takes its reciprocal."""
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (512, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 1, 128, 4, 1, 1, 1, 1), force_create=True))
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 1), 0)]), hostbuf=GPUBuffer(shape=(512,), force_create=True))
  op0 = LazyOp(BinaryOps.ADD, (buf0,buf1,), None)
  # constant 2.0 read with zero strides: op1 = op0 ** 2
  buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 512), views=[View((1, 512), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([2.], dtype=np.float32)))
  op1 = LazyOp(BinaryOps.POW, (op0,buf2,), None)
  # reduce (1, 512) down to (1, 1)
  op2 = LazyOp(ReduceOps.SUM, (op1,), (1, 1))
  # constant 0.5: op3 = op2 ** 0.5
  buf3 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([0.5], dtype=np.float32)))
  op3 = LazyOp(BinaryOps.POW, (op2,buf3,), None)
  # op6 = relu(op3 - 1e-12) + 1e-12
  buf4 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
  op4 = LazyOp(BinaryOps.SUB, (op3,buf4,), None)
  op5 = LazyOp(UnaryOps.RELU, (op4,), None)
  buf5 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([1.e-12], dtype=np.float32)))
  op6 = LazyOp(BinaryOps.ADD, (op5,buf5,), None)
  # op9 = op6 - relu(op3 - 3.4e38); together with op6 this looks like a clip to [1e-12, 3.4e38] -- confirm
  buf6 = GPUBuffer(shape=ShapeTracker(shape=(1, 1), views=[View((1, 1), (0, 0), 0)]), hostbuf=GPUBuffer(shape=(1,), backing=np.array([3.4e+38], dtype=np.float32)))
  op7 = LazyOp(BinaryOps.SUB, (op3,buf6,), None)
  op8 = LazyOp(UnaryOps.RELU, (op7,), None)
  op9 = LazyOp(BinaryOps.SUB, (op6,op8,), None)
  op10 = LazyOp(UnaryOps.RECIPROCAL, (op9,), None)
  ast = LazyOp(MovementOps.RESHAPE, (op10,), (1, 1))
  compile_and_test_ast(ast)
|
||||
|
||||
def test_1239_reduce(self):
  """Build a MUL -> SUM -> RESHAPE AST whose output has 1239 * 4 = 4956 elements and hand it to compile_and_test_ast."""
  # first operand: single strided view over a (1, 64, 4) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 64, 4), force_create=True))
  # second operand: single strided view over a (1239, 256, 4) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 1, 1, 1239, 4, 1, 1, 64, 4), views=[View((1, 1, 1, 1239, 4, 1, 1, 64, 4), (0, 0, 0, 1024, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(1239, 256,
    4), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last two axes (64, 4) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 1, 1, 1239, 4, 1, 1, 1, 1))
  ast = LazyOp(MovementOps.RESHAPE, (op1,), (1, 1, 1, 1, 4956))
  compile_and_test_ast(ast)
|
||||
|
||||
def test_enet_first_conv_bs32(self):
  """Build a MUL -> SUM -> RESHAPE AST over 8-D buffers and hand it to compile_and_test_ast.

  NOTE(review): the name says bs32 but the leading dimension of every shape here is 8.
  """
  # first operand: View / ZeroView / View stack over an (8, 3, 224, 224) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 3, 225, 225), (150528, 50176, 224, 1), 0), ZeroView((8, 3, 224, 224), ((0, 8), (0, 3), (0, 225), (0, 225))), View((8, 1, 32, 112, 112, 3, 3, 3), (151875, 151875, 0, 450, 2, 50625, 225, 1), 0)]), hostbuf=GPUBuffer(shape=(8, 3, 224, 224), force_create=True))
  # second operand: single strided view over a (32, 3, 3, 3) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(8, 1, 32, 112, 112, 3, 3, 3), views=[View((8, 1, 32, 112, 112, 3, 3, 3), (0, 0, 27, 0, 0, 9, 3, 1), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 3, 3), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last three axes (3, 3, 3) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (8, 1, 32, 112, 112, 1, 1, 1))
  ast = LazyOp(MovementOps.RESHAPE, (op1,), (8, 32, 112, 112))
  compile_and_test_ast(ast)
|
||||
|
||||
def test_enet_reduce_bs32(self):
  """Build a MUL -> SUM -> RESHAPE AST that reduces over the trailing (32, 112, 112) axes and hand it to compile_and_test_ast."""
  # first operand: View / ZeroView / View stack over a (32, 3, 224, 224) host buffer
  buf0 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 32, 225, 225), (50176, 150528, 224, 1), 0), ZeroView((3, 32, 224, 224), ((0, 3), (0, 32), (0, 225), (0, 225))), View((3, 1, 32, 3, 3, 32, 112, 112), (1620000, 1620000, 0, 225, 1, 50625, 450, 2), 0)]), hostbuf=GPUBuffer(shape=(32, 3, 224, 224), force_create=True))
  # second operand: single strided view over a (1, 1, 32, 1, 1, 32, 112, 112) host buffer
  buf1 = GPUBuffer(shape=ShapeTracker(shape=(3, 1, 32, 3, 3, 32, 112, 112), views=[View((3, 1, 32, 3, 3, 32, 112, 112), (0, 12845056, 401408, 0, 0, 12544, 112, 1), 0)]), hostbuf=GPUBuffer(shape=(1, 1, 32, 1, 1, 32, 112, 112), force_create=True))
  op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
  # the SUM argument sets the last three axes (32, 112, 112) to 1
  op1 = LazyOp(ReduceOps.SUM, (op0,), (3, 1, 32, 3, 3, 1, 1, 1))
  ast = LazyOp(MovementOps.RESHAPE, (op1,), (3, 32, 3, 3))
  compile_and_test_ast(ast)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
52
tinygrad_repo/test/external/external_test_image.py
vendored
Normal file
52
tinygrad_repo/test/external/external_test_image.py
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python
|
||||
import os
|
||||
import unittest
|
||||
import numpy as np
|
||||
if 'IMAGE' not in os.environ:
|
||||
os.environ['IMAGE'] = '2'
|
||||
os.environ['GPU'] = '1'
|
||||
os.environ['OPT'] = '2'
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.nn import Conv2d
|
||||
Tensor.no_grad = True
|
||||
|
||||
class TestImage(unittest.TestCase):
  """Smoke tests for tensors whose last dimension is 4, run with the IMAGE/GPU/OPT environment set above."""

  def test_create_image(self):
    """ones reshaped to (128, 32, 4) plus 3 must realize to all 4s."""
    img = Tensor.ones(128, 128, 1).reshape(128, 32, 4) + 3
    img.realize()
    expected = np.ones((128,32,4))*4
    np.testing.assert_array_equal(img.numpy(), expected)

  def test_sum_image(self):
    """Summing a realized (16, 4, 4) tensor of 4s must give 16*4*4*4."""
    img = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    img.realize()
    total = img.sum()
    total.realize()
    assert total.numpy() == 16*4*4*4, f"got {total.numpy()}"

  def test_add_image(self):
    """Adding two realized (16, 4, 4) tensors (4s and 5s) must give all 9s."""
    fours = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3
    fives = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 4
    fours.realize()
    fives.realize()
    nines = fours + fives
    nines.realize()
    np.testing.assert_array_equal(nines.numpy(), np.ones((16,4,4))*9)

  def test_padded_conv(self):
    """A 3x3 conv with padding=1 and no bias must realize without error."""
    bs, in_chans, out_chans = 1,12,32
    conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    data = Tensor.ones(bs, in_chans, 64, 128)
    conv(data).realize()

  def test_op_conv(self):
    """A 3x3 conv, relu, then a 1x1 conv must realize without error."""
    bs, in_chans, out_chans = 1,12,32
    conv3x3 = Conv2d(in_chans, out_chans, 3, bias=None, padding=1)
    conv1x1 = Conv2d(out_chans, out_chans, 1, bias=None, padding=0)
    data = Tensor.ones(bs, in_chans, 64, 128)
    out = conv1x1(conv3x3(data).relu())
    out.realize()
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
45
tinygrad_repo/test/external/external_test_jit_on_models.py
vendored
Normal file
45
tinygrad_repo/test/external/external_test_jit_on_models.py
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python
|
||||
import unittest
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.jit import TinyJit, JIT_SUPPORTED_DEVICE
|
||||
from tinygrad.helpers import dtypes, CI
|
||||
from tinygrad.ops import Device
|
||||
from test.helpers import derandomize_model
|
||||
|
||||
from examples.llama import Transformer
|
||||
|
||||
def helper_test_jitted_correctness(gen, train, train_jit):
  """Check that a jitted function agrees with its plain counterpart.

  `gen` is called once per run to produce the positional arguments. `train` runs
  once and `train_jit` runs five times; only the last jitted result is compared
  against the plain result (rtol=1e-3, atol=1e-5).
  """
  expected = train(*gen()).numpy()
  # repeated calls so the jitted function is exercised more than once; earlier results are discarded
  for _ in range(5):
    actual = train_jit(*gen()).numpy()
  np.testing.assert_allclose(expected, actual, rtol=1e-3, atol=1e-5)
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE, "needs JIT")
class TestJittedModels(unittest.TestCase):
  """Compare jitted and non-jitted forward passes of whole models."""

  def test_jitted_tiny_llama(self):
    """Run a small llama Transformer in float16 with and without TinyJit and compare outputs."""
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16
    # restore the global default dtype even when the comparison fails, so a failure
    # here cannot leak float16 into tests that run afterwards
    try:
      args_tiny = {"dim": 1024, "multiple_of": 256, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
      model = Transformer(**args_tiny)
      derandomize_model(model)
      def test(t): return model(t, 0).realize()

      @TinyJit
      def test_jit(t): return model(t, 0).realize()
      helper_test_jitted_correctness(lambda: (Tensor([[1,]]),), test, test_jit)
    finally:
      Tensor.default_type = old_type

  @unittest.skipUnless(not CI, "huge for CI")
  def test_jitted_stable_diffusion(self):
    """Run the stable diffusion UNetModel with and without TinyJit and compare outputs (skipped in CI)."""
    # imported here so the module can be loaded without importing the stable diffusion example
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()

    @TinyJit
    def test_jit(t, t2): return model(t, 801, t2).realize()
    helper_test_jitted_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test, test_jit)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
208
tinygrad_repo/test/external/external_test_onnx_backend.py
vendored
Normal file
208
tinygrad_repo/test/external/external_test_onnx_backend.py
vendored
Normal file
@@ -0,0 +1,208 @@
|
||||
import unittest
|
||||
from onnx.backend.base import Backend, BackendRep
|
||||
import onnx.backend.test
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor
|
||||
from typing import Any, Tuple
|
||||
from tinygrad.helpers import getenv, CI
|
||||
|
||||
# pip3 install tabulate
|
||||
pytest_plugins = 'onnx.backend.test.report',
|
||||
|
||||
from extra.onnx import get_run_onnx
|
||||
|
||||
class TinygradModel(BackendRep):
  """BackendRep that runs an ONNX model through the callable produced by get_run_onnx."""

  def __init__(self, run_onnx, input_names):
    super().__init__()
    self.fxn = run_onnx
    self.input_names = input_names

  def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
    """Pair `inputs` positionally with the stored input names, run the model, and return its outputs as numpy values.

    Tensor outputs are converted with .numpy(), list outputs element by element,
    and anything else is wrapped with np.array.
    """
    def as_numpy(value):
      if isinstance(value, Tensor): return value.numpy()
      if isinstance(value, list): return [item.numpy() for item in value]
      return np.array(value)

    feed = dict(zip(self.input_names, inputs))
    outputs = self.fxn(feed, debug=True)
    return tuple(as_numpy(value) for value in outputs.values())
|
||||
|
||||
class TinygradBackend(Backend):
  """ONNX Backend implementation that builds TinygradModel instances."""

  @classmethod
  def prepare(cls, model, device):
    """Return a TinygradModel for `model`, fed by the graph inputs that are not initializers."""
    initializer_names = [init.name for init in model.graph.initializer]
    # graph inputs that also appear as initializers are not fed by the caller
    net_feed_input = [inp.name for inp in model.graph.input if inp.name not in initializer_names]
    print("prepare", cls, device, net_feed_input)
    return TinygradModel(get_run_onnx(model), net_feed_input)

  @classmethod
  def supports_device(cls, device: str) -> bool:
    """Only the "CPU" device string is reported as supported."""
    return device == "CPU"
|
||||
|
||||
backend_test = onnx.backend.test.BackendTest(TinygradBackend, __name__)

def _exclude_all(*patterns):
  # register each pattern with backend_test.exclude, in the order given
  for pattern in patterns: backend_test.exclude(pattern)

# no support for reduce with multiply (needs llop)
_exclude_all('test_reduce_prod_*')

# TODO figure out why it's returning wrong values, geohotstan's uneducated guess is it's due to imprecision from float64 (double) -> float32
# see Type Constraints: https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Adam.html#type-constraints
_exclude_all('test_adam_multiple_cpu', 'test_nesterov_momentum_cpu')

# we only support float32
_exclude_all('uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'float64', 'string')

_exclude_all('test_pow_types_int*', 'test_cast_*', 'test_castlike_*', 'test_convinteger_*', 'test_matmulinteger_*')

_exclude_all(
  'test_reduce_log_sum_exp*', # dependent on actual float64 implementation for backends
  'test_operator_add*', # dependent on float64 math. Without it values default to 0 or inf
)

# we don't support indexes
# backend_test.exclude('test_argmax_*') # Needs more work: select_last_index
# backend_test.exclude('test_argmin_*') # Needs more work: select_last_index
_exclude_all('test_nonzero_*')

# no support for mod
_exclude_all('test_mod_*')

# no boolean ops (2d, 3d, 4d)
_exclude_all('test_bitshift_*')

# no scatternd gathernd
_exclude_all('test_gathernd_*', 'test_scatternd_*')

# no quantize
_exclude_all('test_dynamicquantizelinear_*', 'test_qlinearmatmul_*', 'test_qlinearconv_*', 'test_quantizelinear_*')

# no rnn
_exclude_all('test_gru_*', 'test_rnn_*', 'test_lstm_*', 'test_simple_rnn_*')

# no control flow
_exclude_all(
  'test_if_*',
  'test_loop*',
  'test_range_float_type_positive_delta_expanded_cpu', # requires loop
)

# unsupported (strange) ops
_exclude_all('test_bitwise_*', 'test_blackmanwindow_*', 'test_bernoulli_*', 'test_cumsum_*', 'test_det_*')

_exclude_all(
  'test_tril_zero_cpu', # TODO: zero array support
  'test_triu_zero_cpu', # TODO: zero array support
)

_exclude_all(
  'test_col2im_*',
  'test_hammingwindow_*',
  'test_hannwindow_*',
  'test_hardmax_*',
  'test_gridsample_*',
  'test_dft_*',
  'test_einsum_*',
  'test_strnorm_*',
  'test_unique_*',
  'test_sequence_*',
  'test_nonmaxsuppression_*',
  'test_reversesequence_*',
  'test_roialign_*',
  'test_top_k_*',
  'test_tfidfvectorizer_*',
  'test_stft_*',
  'test_melweightmatrix_*',
)

# more strange ops
_exclude_all(
  'test_basic_deform_conv_*',
  'test_deform_conv_*',
  'test_lppool_*',
  'test_depthtospace_*',
  'test_spacetodepth_*',
  'test_scan*',
  'test_split_to_sequence_*',
  'test_resize_downsample_scales_cubic_*', # unsure how to implement cubic
  'test_resize_downsample_sizes_cubic_*', # unsure how to implement cubic
  'test_resize_upsample_scales_cubic_*', # unsure how to implement cubic
  'test_resize_upsample_sizes_cubic_*', # unsure how to implement cubic
)

# rest of the failing tests
_exclude_all(
  'test_averagepool_2d_dilations_cpu', # dilations != 1 not supported for avgpool
  'test_convtranspose_autopad_same_cpu', # TODO geohotstan has no idea how this is done, autopad requires output_shape but output_shape requires pads from autopad
  'test_optional_has_element_empty_optional_input_cpu', # Attempts to create Tensor from None
  'test_range_int32_type_negative_delta_expanded_cpu', # AttributeProto.GRAPH not implemented
  'test_reshape_allowzero_reordered_cpu', # reshaping to 0 shape
  'test_resize_downsample_scales_linear_antialias_cpu', # antialias not implemented
  'test_resize_downsample_sizes_linear_antialias_cpu', # antialias not implemented
  'test_resize_tf_crop_and_resize_cpu', # unsure about fill value after clip
  'test_operator_addconstant_cpu', # bad data type
)

# issue 1556 https://github.com/tinygrad/tinygrad/issues/1556
_exclude_all('test_isinf_cpu', 'test_isinf_negative_cpu', 'test_isinf_positive_cpu', 'test_isnan_cpu')

# issue 1791 fast math messes with these https://github.com/tinygrad/tinygrad/issues/1791
_exclude_all(
  'test_resize_upsample_sizes_nearest_axes_2_3_cpu',
  'test_resize_upsample_sizes_nearest_axes_3_2_cpu',
  'test_resize_upsample_sizes_nearest_cpu',
)

# issue 2067 potentially also a fastmath issue https://github.com/tinygrad/tinygrad/issues/2067
if getenv('METAL'):
  _exclude_all('test_maxpool_2d_pads_cpu', 'test_maxpool_2d_same_lower_cpu')

# Don't know how to treat special TensorProto like TensorProto.FLOAT8E4M3FN
if getenv("CPU") or getenv("TORCH"):
  _exclude_all('test_dequantizelinear_axis_cpu', 'test_dequantizelinear_cpu')

# compiled backends cannot reshape to and from 0
if getenv('LLVM') or getenv('GPU') or getenv('CLANG') or getenv('METAL') or getenv('CUDA'):
  _exclude_all('test_slice_start_out_of_bounds_cpu', 'test_constantofshape_int_shape_zero_cpu')

if getenv('GPU') or getenv('METAL'):
  _exclude_all(
    'test_mish_cpu', # weird inaccuracy
    'test_mish_expanded_cpu', # weird inaccuracy
    'test_eyelike_with_dtype_cpu', # backend does not support dtype: Double
  )

# Segfaults in CI
if (getenv('LLVM') or getenv('CUDA')) and CI:
  _exclude_all('test_max_float16_cpu', 'test_min_float16_cpu')

# disable model tests for now since they are slow
if getenv("MODELTESTS"):
  # model tests all pass!
  for model_test in ('test_resnet50', 'test_inception_v1', 'test_inception_v2', 'test_densenet121', 'test_shufflenet',
                     'test_squeezenet', 'test_bvlc_alexnet', 'test_zfnet512', 'test_vgg19'):
    backend_test.include(model_test)
else:
  for x in backend_test.test_suite:
    if 'OnnxBackendRealModelTest' in str(type(x)):
      backend_test.exclude(str(x).split(" ")[0])

# expose the generated test cases at module level so unittest/pytest can discover them
globals().update(backend_test.enable_report().test_cases)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
392
tinygrad_repo/test/external/external_test_opt.py
vendored
Normal file
392
tinygrad_repo/test/external/external_test_opt.py
vendored
Normal file
@@ -0,0 +1,392 @@
|
||||
#!/usr/bin/env python
|
||||
import os
|
||||
|
||||
import torch
|
||||
if "OPT" not in os.environ:
|
||||
os.environ["OPT"] = "2"
|
||||
|
||||
import gc
|
||||
import numpy as np
|
||||
|
||||
import unittest
|
||||
from tinygrad.tensor import Tensor, Device
|
||||
from tinygrad import nn
|
||||
from tinygrad.helpers import getenv
|
||||
from tinygrad.nn import optim
|
||||
from tinygrad.helpers import GlobalCounters
|
||||
from tinygrad.lazy import PUSH_PERMUTES
|
||||
from tinygrad.jit import CacheCollector
|
||||
|
||||
class CLCache:
  """Context manager that collects the kernels run inside the `with` body and checks their count.

  allowed:  maximum number of collected entries; None disables the check.
  strict:   when true, the count must equal `allowed` exactly.
  preclear: when true, garbage-collect, realize every live Tensor and reset
            GlobalCounters before collection starts, so earlier work is not counted.
  """
  def __init__(self, allowed=None, strict=False, preclear=True): self.allowed, self.strict, self.preclear = allowed, strict, preclear

  def __enter__(self):
    if self.preclear:
      gc.collect()
      # realize anything still pending so it cannot show up in the collected cache
      for x in [x for x in gc.get_objects() if isinstance(x, Tensor)]:
        x.realize()
      GlobalCounters.reset()
    CacheCollector.start()
    print("cache: entering")

  def __exit__(self, type, value, traceback):
    cache = CacheCollector.finish()
    print(f"cache: exiting with size {len(cache)}", f"allowed {self.allowed}" if self.allowed is not None else "")
    if self.allowed is not None:
      used = len(cache)
      assert used <= self.allowed, f"used too many kernels! {used} > {self.allowed}"
      # strict mode also rejects runs that use fewer kernels; report that case with its own message
      assert not self.strict or used == self.allowed, f"used too few kernels! {used} < {self.allowed}"
|
||||
|
||||
from models.convnext import ConvNeXt
|
||||
from models.efficientnet import EfficientNet
|
||||
from models.resnet import ResNet18
|
||||
from models.vit import ViT
|
||||
from tinygrad.nn.state import get_parameters
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestInferenceMinKernels(unittest.TestCase):
    """Bound the number of kernels one inference pass of each model may use."""

    def setUp(self):
        Tensor.training = False

    @staticmethod
    def _zero_parameters(model):
        """Assign zeros of matching shape and dtype to every parameter of ``model``."""
        for p in get_parameters(model):
            p.assign(np.zeros(p.shape, dtype=p.dtype.np))

    @unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
    def test_convnext(self):
        model = ConvNeXt()
        self._zero_parameters(model)
        img = Tensor.randn(1, 3, 224, 224)
        with CLCache(129):
            model(img).realize()

    def test_enet(self):
        model = EfficientNet(getenv("ENET_NUM", 0), has_se=False)
        self._zero_parameters(model)
        img = Tensor.randn(1, 3, 224, 224)
        with CLCache(51):
            model.forward(img).realize()

    def test_enet_se(self):
        model = EfficientNet(getenv("ENET_NUM", 0), has_se=True)
        self._zero_parameters(model)
        img = Tensor.randn(1, 3, 224, 224)
        # TODO: this seems very high
        with CLCache(115):
            model.forward(img).realize()

    def test_resnet(self):
        model = ResNet18()
        self._zero_parameters(model)
        img = Tensor.randn(1, 3, 224, 224)
        with CLCache(26):
            model.forward(img).realize()

    def test_vit(self):
        model = ViT(embed_dim=192, num_heads=3)
        self._zero_parameters(model)
        img = Tensor.randn(1, 3, 224, 224)
        with CLCache(222):  # NOTE: this is way too high
            out = model.forward(img)
            # building the graph must not have run any kernel yet
            assert len(CacheCollector.cache) == 0, "ViT prerealized?"
            out.realize()

    def test_llama(self):
        from examples.llama import Transformer
        args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
        model = Transformer(**args_tiny)
        self._zero_parameters(model)
        with CLCache(85):
            model(Tensor([[1,2,3,4]]), 0).realize()
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptBinOp(unittest.TestCase):
    """Check that equivalent binary-op expressions do not run extra kernels."""

    def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
        """Realize ``f1(a, b)`` (and ``f2(a, b)`` if given) and check the kernel count.

        Exactly ``allowed`` kernels must be collected. When ``f2`` is given,
        both results must also match numerically after flattening.
        """
        a = Tensor.randn(16, 16)
        b = Tensor.randn(16, 16)
        with CLCache():
            c = f1(a, b)
            if f2 is not None:
                d = f2(a, b)
            c.realize()
            if f2 is not None:
                d.realize()
            assert len(CacheCollector.cache) == allowed, "binop was rerun!"
        if f2 is not None:
            np.testing.assert_allclose(c.numpy().ravel(), d.numpy().ravel(), rtol=1e-3, atol=1e-5)

    def test_no_binop_rerun(self):
        self._test_no_binop_rerun(lambda a,b: a*b, lambda a,b: (a*b).reshape(16, 16, 1))

    def test_no_binop_rerun_alt(self):
        self._test_no_binop_rerun(lambda a,b: (a*b).reshape(16, 16, 1), lambda a,b: a*b)

    def test_no_binop_rerun_reduce_broadcast(self):
        self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=2)

    @unittest.skip("this test started failing with the new change, based movementop issue")
    def test_no_binop_rerun_transposed(self):
        self._test_no_binop_rerun(lambda a,b: (a.T*b.T).T, lambda a,b: a*b)

    def test_no_binop_rerun_mid_reshape(self):
        self._test_no_binop_rerun(lambda a,b: (a*b).reshape(256)+a.reshape(256))

    # currently non working tests
    #def test_no_binop_rerun_preshape(self): return self._test_no_binop_rerun(lambda a,b: a.reshape(16, 16, 1)*b.reshape(16, 16, 1), lambda a,b: a*b)
    #def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
    #def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptReduceLoop(unittest.TestCase):
    """Kernel-count checks for a reduce whose result feeds a second reduce and an add."""

    @unittest.skip("this is broken")
    def test_loop_left(self):
        a = Tensor.randn(16, 16)
        b = Tensor.randn(16, 16)
        with CLCache():
            t = a.sum(0)
            b = t.reshape(16,1).expand(16,16).sum(0)
            c = (t+b)
            c.realize()
            assert len(CacheCollector.cache) == 2, "loop left fusion broken"

    def test_loop_right(self):
        a = Tensor.randn(16, 16)
        b = Tensor.randn(16, 16)
        with CLCache():
            t = a.sum(0)
            b = t.reshape(16,1).expand(16,16).sum(0)
            # same as test_loop_left but with the operands of the add swapped
            c = (b+t)
            c.realize()
            assert len(CacheCollector.cache) == 2, "loop right fusion broken"
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOptWChild(unittest.TestCase):
    """Kernel-count check for a reduce result that has more than one consumer."""

    def test_unrealized_child(self):
        a = Tensor.randn(16, 16)
        b = Tensor.randn(16, 16)
        with CLCache():
            c = (a*b).sum()
            d = c+1
            # `e` is never realized; it exists only to give `c` a second child
            e = c+2
            d.realize()
            assert len(CacheCollector.cache) == 2, "don't fuse if you have children"
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestOpt(unittest.TestCase):
    """Kernel-count tests for folding of elementwise, reduce, conv, batchnorm and permute ops.

    Counts read from ``CacheCollector.cache`` are taken inside the
    ``with CLCache()`` body, i.e. before ``CLCache.__exit__`` calls
    ``CacheCollector.finish()``.
    """

    def test_muladd(self):
        a,b,c = [Tensor.ones(2,2) for _ in range(3)]
        with CLCache():
            d = a * b + c
            d.realize()
            assert len(CacheCollector.cache) == 1, "optimizer didn't fold muladd"
        np.testing.assert_allclose(d.numpy(), np.ones((2,2))*2, rtol=1e-5)

    def test_fold_reduce_elementwise(self):
        img = Tensor.ones(32)
        addme = Tensor.ones(1)
        with CLCache():
            ret = img.sum() + addme
            ret.realize()
            assert len(CacheCollector.cache) == 1, "optimizer didn't fold reduce/elementwise"
        assert ret.numpy()[0] == 33

    def test_fold_batchnorm(self):
        with Tensor.train():
            img = Tensor.ones(1,32,4,4)
            bn = nn.BatchNorm2d(32, track_running_stats=False)
            with CLCache():
                img_bn = bn(img).realize()
                print(img_bn)
                assert len(CacheCollector.cache) == 3, f"optimizer didn't fold batchnorm, got {len(CacheCollector.cache)}"

    def test_fold_conv_sgd(self):
        with Tensor.train():
            img = Tensor.ones(2,3,4,4)
            c1 = nn.Conv2d(3,32,3)
            opt = optim.SGD(get_parameters(c1))
            with CLCache():
                opt.zero_grad()
                c1(img).relu().sum().backward()
                opt.step()
                # TODO: this should be 4, but the sum output child stays around
                # with pushing_permutes it can be 3
                # TODO: broken with optim fixes
                assert len(CacheCollector.cache) in [4,5,6], f"optimizer didn't fold conv-backward SGD, got {len(CacheCollector.cache)}"

    def test_fold_2convs_sgd(self):
        with Tensor.train():
            img = Tensor.ones(2,3,64,64)
            c1 = nn.Conv2d(3,16,3,bias=False)
            c2 = nn.Conv2d(16,32,3,bias=False)
            opt = optim.SGD(get_parameters([c1, c2]))
            with CLCache(allowed=9):
                opt.zero_grad()
                c2(c1(img).relu()).relu().sum().backward()
                opt.step()

    def test_fold_4convs_sgd(self):
        with Tensor.train():
            img = Tensor.ones(2,3,64,64)
            c1 = nn.Conv2d(3,4,3,bias=False)
            c2 = nn.Conv2d(4,8,3,bias=False)
            c3 = nn.Conv2d(8,16,3,bias=False)
            c4 = nn.Conv2d(16,32,3,bias=False)
            opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
            with CLCache(allowed=19):
                opt.zero_grad()
                c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
                opt.step()

    def test_fold_conv_batchnorm_sgd(self):
        with Tensor.train():
            img = Tensor.ones(1,3,4,4)
            c1 = nn.Conv2d(3,32,3)
            bn = nn.BatchNorm2d(32, track_running_stats=False)
            opt = optim.SGD(get_parameters([c1, bn]))
            with CLCache(allowed=18):  # this is too high
                img_bn = bn(c1(img)).elu().sum()
                opt.zero_grad()
                img_bn.backward()
                opt.step()

    def test_fold_conv_batchnorm_notrain(self):
        img = Tensor.ones(1,3,8,8)
        c1 = nn.Conv2d(3,32,3)
        bn = nn.BatchNorm2d(32, track_running_stats=False)
        # precache the bn
        img_conv = bn(c1(img)).relu().realize()
        with CLCache():
            img_conv = bn(c1(img)).relu().realize()
            assert len(CacheCollector.cache) == 1, f"optimizer didn't fold conv-batchnorm at test time, got {len(CacheCollector.cache)}"

    def test_fold_conv_batchnorm(self):
        with Tensor.train():
            img = Tensor.ones(1,3,8,8)
            c1 = nn.Conv2d(3,32,3)
            bn = nn.BatchNorm2d(32, track_running_stats=False)
            with CLCache():
                img_conv = bn(c1(img)).relu().realize()
                print(img_conv)
                assert len(CacheCollector.cache) == 4, f"optimizer didn't fold conv-batchnorm, got {len(CacheCollector.cache)}"

    def test_fold_conv_elu(self):
        img = Tensor.ones(1,4,8,8)
        c1 = nn.Conv2d(4, 4, kernel_size=3)
        c2 = nn.Conv2d(4, 4, kernel_size=3)
        with CLCache():
            img_conv = img.sequential([c1, Tensor.elu, c2, Tensor.elu]).realize()
            print(img_conv)
            assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/elu"

    def test_fold_conv_relu(self):
        img = Tensor.ones(1,4,8,8)
        c1 = nn.Conv2d(4, 4, kernel_size=3)
        c2 = nn.Conv2d(4, 4, kernel_size=3)
        with CLCache():
            img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
            print(img_conv)
            assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"

    def test_fold_conv_relu_nobias(self):
        img = Tensor.ones(1,4,8,8)
        c1 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
        c2 = nn.Conv2d(4, 4, kernel_size=3, bias=False)
        with CLCache():
            img_conv = img.sequential([c1, Tensor.relu, c2, Tensor.relu]).realize()
            print(img_conv)
            assert len(CacheCollector.cache) == 2, "optimizer didn't fold conv/relu"

    def test_permute_was_pushed(self):
        a = Tensor.randn(16, 16, 16)
        with CLCache():
            c = a.sum(2)
            d = c.permute(1,0).contiguous()
            d.realize()
            cache_len = len(CacheCollector.cache)
        np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
        if PUSH_PERMUTES:
            assert cache_len == 1, "permute wasn't pushed!"

    def test_permute_was_pushed_through_contract_reshape(self):
        a = Tensor.randn(4, 4, 4, 4, 4)
        with CLCache():
            c = a.sum(-1)
            d = c.reshape(16,16).permute(1,0).contiguous()
            d.realize()
            cache_len = len(CacheCollector.cache)
        np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,16).transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
        if PUSH_PERMUTES:
            assert cache_len == 1, "permute wasn't pushed!"

    def test_permute_was_pushed_through_contractw1s_reshape(self):
        a = Tensor.randn(4, 4, 4, 4, 4)
        with CLCache():
            c = a.sum(-1)
            d = c.reshape(16,1,16).permute(2,1,0).contiguous()
            d.realize()
            cache_len = len(CacheCollector.cache)
        np.testing.assert_allclose(a.numpy().sum(-1).reshape(16,1,16).transpose(2,1,0), d.numpy(), rtol=1e-3, atol=1e-5)
        if PUSH_PERMUTES:
            assert cache_len == 1, "permute wasn't pushed!"

    # TODO: push permute through expansion reshape
    @unittest.skip("expansion can't push expand permute yet")
    @unittest.skipIf(not PUSH_PERMUTES, "this test requires PUSH_PERMUTES")
    def test_permute_was_pushed_through_expand_reshape(self):
        a = Tensor.randn(16, 16, 16)
        with CLCache():
            c = a.sum(2)
            d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
            d.realize()
            cache_len = len(CacheCollector.cache)
        np.testing.assert_allclose(a.numpy().sum(2).transpose(1,0).reshape(4,4,4,4), d.numpy(), rtol=1e-3, atol=1e-5)
        if PUSH_PERMUTES:
            assert cache_len == 1, "permute wasn't pushed!"

    @unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
    def test_no_reduceop_rerun(self):
        a = Tensor.randn(16, 16, 16)
        with CLCache():
            c = a.sum(2)
            d = a.sum(2).permute(1,0)
            c.realize()
            d.realize()
            cache_len = len(CacheCollector.cache)
        np.testing.assert_allclose(c.numpy().transpose(1,0), d.numpy(), rtol=1e-3, atol=1e-5)
        assert cache_len == 1, "reduceop was rerun!"

    @unittest.skipIf(PUSH_PERMUTES, "this test is broken with PUSH_PERMUTES")
    def test_no_reduceop_rerun_alt(self):
        a = Tensor.randn(16, 16, 16)
        with CLCache():
            c = a.sum(2).permute(1,0)
            d = a.sum(2)
            c.realize()
            d.realize()
            cache_len = len(CacheCollector.cache)
        np.testing.assert_allclose(c.numpy(), d.numpy().transpose(1,0), rtol=1e-3, atol=1e-5)
        assert cache_len == 1, "reduceop was rerun!"

    def test_fold_with_contiguous(self):
        a = Tensor.randn(16, 16, 16)
        b = Tensor.randn(16, 16)
        with CLCache():
            c = (a.sum(2).contiguous() + b).contiguous()
            c.realize()
            cache_len = len(CacheCollector.cache)
        assert cache_len == 1, "contiguous wasn't folded"

    def _test_fold_expand_reduce_helper(self, n, m, axis, allowed):
        """Run sum -> reshape -> expand -> sum on ones(n, m) under a kernel budget.

        The result is compared against the same expression evaluated with
        torch. Returns the number of kernels collected.
        """
        b = torch.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
        with CLCache(allowed=allowed):
            a = Tensor.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis)
            a.realize()
            cache_len = len(CacheCollector.cache)
        np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
        return cache_len

    def test_expand_reduce_is_folded_on_same_axis(self):
        # no `return` here: every (axis, n) combination has to run,
        # and unittest test methods are expected to return None
        for axis in [0, 1]:
            for n in [4, 8, 16]:
                self._test_fold_expand_reduce_helper(n, n, axis, allowed=2)

    def test_expand_reduce_is_not_folded_on_different_axes(self):
        axis1, axis2 = 0, 1
        for n in [4, 8, 16]:
            b = torch.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
            with CLCache(allowed=3):
                a = Tensor.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2)
                a.realize()
            np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5)
|
||||
|
||||
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
75
tinygrad_repo/test/external/external_test_optim.py
vendored
Normal file
75
tinygrad_repo/test/external/external_test_optim.py
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python
|
||||
import unittest
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import tensorflow_addons as tfa
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.nn.optim import LAMB
|
||||
|
||||
np.random.seed(1337)
|
||||
x_init = np.random.randn(1,4).astype(np.float32)
|
||||
W_init = np.random.randn(4,4).astype(np.float32)
|
||||
m_init = np.random.randn(1,4).astype(np.float32)
|
||||
|
||||
class TinyNet:
    """Small tinygrad model built from the shared module-level initial arrays."""

    def __init__(self):
        # copies keep the module-level arrays untouched between runs
        self.x = Tensor(x_init.copy(), requires_grad=True)
        self.W = Tensor(W_init.copy(), requires_grad=True)
        self.m = Tensor(m_init.copy())

    def forward(self):
        """Return sum(log_softmax(relu(x @ W), axis 1) * m + m)."""
        out = self.x.matmul(self.W).relu()
        out = out.log_softmax(1)
        return out.mul(self.m).add(self.m).sum()
|
||||
|
||||
class TinyNetTF:
    """TensorFlow counterpart of TinyNet, built from the same initial arrays."""

    def __init__(self):
        self.x = tf.Variable(x_init.copy(), trainable=True)
        self.W = tf.Variable(W_init.copy(), trainable=True)
        self.m = tf.constant(m_init.copy())

    def forward(self):
        """Return reduce_sum(log_softmax(relu(x @ W), axis=1) * m + m)."""
        out = tf.nn.relu(tf.matmul(self.x, self.W))
        out = tf.nn.log_softmax(out, axis=1)
        out = tf.multiply(out, self.m) + self.m
        return tf.reduce_sum(out)
|
||||
|
||||
def step(optim, steps=1, kwargs=None):
    """Train a fresh TinyNet for ``steps`` iterations with a tinygrad optimizer.

    ``optim`` is called as ``optim([net.x, net.W], **kwargs)``. Returns the
    final ``(x, W)`` as numpy arrays.
    """
    # None instead of a shared mutable {} default
    kwargs = {} if kwargs is None else kwargs
    net = TinyNet()
    optim = optim([net.x, net.W], **kwargs)
    for _ in range(steps):
        out = net.forward()
        optim.zero_grad()
        out.backward()
        optim.step()
    return net.x.detach().numpy(), net.W.detach().numpy()
|
||||
|
||||
def step_tf(optim, steps=1, kwargs=None):
    """Train a fresh TinyNetTF for ``steps`` iterations with a TensorFlow optimizer.

    ``optim`` is called as ``optim(**kwargs)``. Returns the final ``(x, W)``
    as numpy arrays.
    """
    # None instead of a shared mutable {} default
    kwargs = {} if kwargs is None else kwargs
    net = TinyNetTF()
    optim = optim(**kwargs)
    for _ in range(steps):
        with tf.GradientTape() as tape:
            out = net.forward()
        grads = tape.gradient(out, [net.x, net.W])
        optim.apply_gradients(zip(grads, [net.x, net.W]))
    return net.x.numpy(), net.W.numpy()
|
||||
|
||||
class ExternalTestOptim(unittest.TestCase):
    """Compare tinygrad's LAMB against the tensorflow_addons implementation."""

    def _test_optim(self, tinygrad_optim, tensorflow_optim, steps, opts, atol, rtol):
        """Run both optimizers for ``steps`` with ``opts`` and compare the resulting x and W."""
        tiny_result = step(tinygrad_optim, steps, kwargs=opts)
        tf_result = step_tf(tensorflow_optim, steps, kwargs=opts)
        for x, y in zip(tiny_result, tf_result):
            np.testing.assert_allclose(x, y, atol=atol, rtol=rtol)

    def _test_lamb(self, steps, opts, atol, rtol):
        self._test_optim(LAMB, tfa.optimizers.LAMB, steps, opts, atol, rtol)

    def test_lamb(self):
        self._test_lamb(1, {'lr': 0.001}, 1e-5, 0)

    def test_lamb_high_lr(self):
        self._test_lamb(1, {'lr': 10}, 1e-5, 1e-5)

    def test_multistep_lamb(self):
        self._test_lamb(10, {'lr': 0.001}, 1e-5, 0)

    def test_multistep_lamb_high_lr(self):
        self._test_lamb(10, {'lr': 10}, 1e-5, 3e-4)
|
||||
|
||||
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
57
tinygrad_repo/test/external/external_test_speed_llama.py
vendored
Normal file
57
tinygrad_repo/test/external/external_test_speed_llama.py
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
|
||||
import unittest, time
|
||||
import numpy as np
|
||||
from examples.llama import Transformer, MODEL_PARAMS
|
||||
from test.test_net_speed import start_profile, stop_profile
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.ops import Device
|
||||
from tinygrad.nn.state import get_state_dict
|
||||
from tinygrad.ops import Compiled
|
||||
from tinygrad.helpers import dtypes, prod
|
||||
from tinygrad.runtime.lib import RawBuffer
|
||||
|
||||
class FakeProgram:
    """Stand-in program object: accepts a name and source, and does nothing when called."""

    def __init__(self, name:str, prg:str):
        pass

    def __call__(self, *bufs, global_size, local_size, wait=False):
        pass
|
||||
|
||||
class RawFakeBuffer(RawBuffer):
    """RawBuffer subclass that never copies data in either direction."""

    @classmethod
    def fromCPU(cls, x:np.ndarray, **kwargs):
        # only the element count and dtype of `x` are used; its contents are ignored
        return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)

    def toCPU(self):
        # uninitialized array of the buffer's size and dtype
        return np.empty(self.size, dtype=self.dtype.np)
|
||||
|
||||
class TestLLaMASpeed(unittest.TestCase):
    """Time the Python-side cost of running LLaMA with a fake runtime and fake buffers."""

    @unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
    def test_llama_compile(self):
        backup_program = Device[Device.DEFAULT].runtime
        backup_buffer = Device[Device.DEFAULT].buffer
        Device[Device.DEFAULT].runtime = FakeProgram
        Device[Device.DEFAULT].buffer = RawFakeBuffer

        # restore the real runtime/buffer even if anything below raises,
        # so a failure here cannot leave the device patched
        try:
            print("testing llama python run time")
            model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
            print("built model")
            # assign fake tensors to the values
            for v in get_state_dict(model).values():
                v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
            print("assigned empty tensors, doing warmup")

            def run_llama(st, empty_method_cache=True):
                """Run 10 model calls and print per-call and mean wall time in ms."""
                if empty_method_cache:
                    Device[Device.DEFAULT].method_cache.clear()
                tms = [time.perf_counter()]
                for i in range(10):
                    model(Tensor([[2]]), i).realize()
                    tms.append(time.perf_counter())
                timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
                print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))

            run_llama("codegen")
            run_llama("methodcache", False)

            pr = start_profile()
            run_llama("profile")
            stop_profile(pr, sort='time', frac=0.1)
        finally:
            Device[Device.DEFAULT].runtime = backup_program
            Device[Device.DEFAULT].buffer = backup_buffer
|
||||
|
||||
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
44
tinygrad_repo/test/external/external_test_uops_graphing.py
vendored
Normal file
44
tinygrad_repo/test/external/external_test_uops_graphing.py
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env python
|
||||
import unittest
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.codegen.linearizer import Linearizer
|
||||
from tinygrad.renderer.opencl import OpenCLRenderer
|
||||
from tinygrad.graph import graph_uops
|
||||
from tinygrad.nn import Conv2d
|
||||
|
||||
class TestUopsGraph(unittest.TestCase):
    """Linearize the last scheduled kernel of a few expressions and graph its uops."""

    @staticmethod
    def _optimized_linearizer(t):
        """Build a Linearizer for the last schedule item of ``t`` and apply hand-coded optimizations."""
        si = t.lazydata.schedule()[-1]
        lin = Linearizer(si.ast)
        lin.hand_coded_optimizations()
        return lin

    def test_matmul(self):
        N = 1024
        a = Tensor.rand(N,N)
        b = Tensor.rand(N,N)
        lin = self._optimized_linearizer(a@b)
        print(lin.colored_shape())
        uops = lin.linearize().uops
        graph_uops(uops)
        for u in uops:
            print(u)
        print(OpenCLRenderer("matmul", uops)[0])

    def test_reduce(self):
        a = Tensor.rand(1024*1024)
        lin = self._optimized_linearizer(a.sum())
        uops = lin.linearize().uops
        graph_uops(uops)
        #print(OpenCLRenderer("reduce", uops)[0])

    def test_conv(self):
        x = Tensor.rand(1,3,16,16)
        c = Conv2d(3, 16, (3,3))
        lin = self._optimized_linearizer(c(x).elu())
        uops = lin.linearize().uops
        graph_uops(uops)
        print(lin.colored_shape())
        print(OpenCLRenderer("conv", uops)[0])
|
||||
|
||||
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
36
tinygrad_repo/test/external/external_test_yolo.py
vendored
Normal file
36
tinygrad_repo/test/external/external_test_yolo.py
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
import io
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
import requests # type: ignore
|
||||
import numpy as np
|
||||
|
||||
from tinygrad.tensor import Tensor
|
||||
from examples.yolov3 import Darknet, infer, show_labels
|
||||
from extra.utils import fetch
|
||||
|
||||
chicken_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/Chicken.jpg'))
|
||||
car_img = cv2.imread(str(Path(__file__).parent / 'efficientnet/car.jpg'))
|
||||
|
||||
class TestYOLO(unittest.TestCase):
    """Check YOLOv3 labels on the two module-level test images."""

    @classmethod
    def setUpClass(cls):
        # one model instance is shared by every test in the class
        cls.model = Darknet(fetch("https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"))
        print("Loading weights file (237MB). This might take a while…")
        cls.model.load_weights("https://pjreddie.com/media/files/yolov3.weights")

    @classmethod
    def tearDownClass(cls):
        del cls.model

    def test_chicken(self):
        labels = show_labels(infer(self.model, chicken_img), confidence=0.56)
        self.assertEqual(labels, ["bird"])

    def test_car(self):
        labels = show_labels(infer(self.model, car_img))
        self.assertEqual(labels, ["car"])
|
||||
|
||||
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
76
tinygrad_repo/test/external/external_test_yolov8.py
vendored
Normal file
76
tinygrad_repo/test/external/external_test_yolov8.py
vendored
Normal file
@@ -0,0 +1,76 @@
|
||||
import numpy as np
|
||||
from extra.utils import fetch, download_file, get_child
|
||||
from examples.yolov8 import YOLOv8, get_variant_multiples, preprocess, postprocess, label_predictions
|
||||
from pathlib import Path
|
||||
import unittest
|
||||
import io, cv2, os
|
||||
import onnxruntime as ort
|
||||
import ultralytics
|
||||
from tinygrad.nn.state import safe_load, load_state_dict
|
||||
|
||||
class TestYOLOv8(unittest.TestCase):
    """Weight loading, label predictions and ONNX parity checks for YOLOv8."""

    @staticmethod
    def _weights_path(variant, suffix):
        """Return ``<two directories above this file>/weights/yolov8<variant>.<suffix>``."""
        return Path(__file__).parents[2] / "weights" / f'yolov8{variant}.{suffix}'

    def _load_tiny_yolov8(self, variant):
        """Download the safetensors weights for ``variant`` and return a loaded YOLOv8.

        Every test fetches its own weights through this helper, so no test
        depends on another test having run first.
        NOTE(review): assumes download_file is cheap when the file already exists - confirm.
        """
        weights_location = self._weights_path(variant, "safetensors")
        download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{variant}.safetensors', weights_location)
        depth, width, ratio = get_variant_multiples(variant)
        model = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
        state_dict = safe_load(weights_location)
        load_state_dict(model, state_dict)
        return model

    def test_all_load_weights(self):
        for variant in ['n', 's', 'm', 'l', 'x']:
            self._load_tiny_yolov8(variant)
            print(f'successfully loaded weights for yolov8{variant}')

    def test_predictions(self):
        test_image_urls = ['https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg', 'https://www.aljazeera.com/wp-content/uploads/2022/10/2022-04-28T192650Z_1186456067_UP1EI4S1I0P14_RTRMADP_3_SOCCER-ENGLAND-MUN-CHE-REPORT.jpg']
        # expected label counts, in the same order as test_image_urls
        expected_labels = [{5: 1, 0: 4, 11: 1}, {0: 13, 29: 1, 32: 1}]
        TinyYolov8 = self._load_tiny_yolov8('n')

        for url, expected in zip(test_image_urls, expected_labels):
            img_stream = io.BytesIO(fetch(url))
            img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)
            test_image = preprocess([img])
            predictions = TinyYolov8(test_image)
            post_predictions = postprocess(preds=predictions, img=test_image, orig_imgs=[img])
            labels = label_predictions(post_predictions)
            self.assertEqual(labels, expected)

    def test_forward_pass_torch_onnx(self):
        variant = 'n'
        weights_location_onnx = self._weights_path(variant, "onnx")
        weights_location_pt = self._weights_path(variant, "pt")

        download_file(f'https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8{variant}.pt', weights_location_pt)
        # the ultralytics export prints a lot of unnecessary things
        if not weights_location_onnx.is_file():
            model = ultralytics.YOLO(model=weights_location_pt, task='Detect')
            model.export(format="onnx",imgsz=[640, 480])

        TinyYolov8 = self._load_tiny_yolov8(variant)

        image_location = [np.frombuffer(io.BytesIO(fetch('https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg')).read(), np.uint8)]
        orig_image = [cv2.imdecode(image_location[0], 1)]

        input_image = preprocess(orig_image)

        onnx_session = ort.InferenceSession(weights_location_onnx)
        onnx_input_name = onnx_session.get_inputs()[0].name
        onnx_output_name = onnx_session.get_outputs()[0].name
        onnx_output = onnx_session.run([onnx_output_name], {onnx_input_name: input_image.numpy()})

        tiny_output = TinyYolov8(input_image)

        # currently rtol is 0.025 because there is a 1-2% difference in our predictions
        # because of the zero padding in SPPF module (line 280) maxpooling layers rather than the -infinity in torch.
        # This difference does not make a difference "visually".
        np.testing.assert_allclose(onnx_output[0], tiny_output.numpy(), atol=5e-4, rtol=0.025)
|
||||
|
||||
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
61
tinygrad_repo/test/external/fuzz_shapetracker.py
vendored
Normal file
61
tinygrad_repo/test/external/fuzz_shapetracker.py
vendored
Normal file
@@ -0,0 +1,61 @@
|
||||
import random
|
||||
from tinygrad.helpers import DEBUG
|
||||
from test.unit.test_shapetracker import CheckingShapeTracker
|
||||
random.seed(42)
|
||||
|
||||
def do_permute(st):
    """Apply a random permutation of all axes to ``st``."""
    order = list(range(len(st.shape)))
    random.shuffle(order)
    perm = tuple(order)
    if DEBUG >= 1:
        print("st.permute(", perm, ")")
    st.permute(perm)
|
||||
|
||||
def do_pad(st):
    """Pad one randomly chosen axis of ``st`` by 0..2 on each side; other axes get (0, 0)."""
    c = random.randint(0, len(st.shape)-1)
    pad = tuple((random.randint(0,2), random.randint(0,2)) if i==c else (0,0) for i in range(len(st.shape)))
    if DEBUG >= 1:
        print("st.pad(", pad, ")")
    st.pad(pad)
|
||||
|
||||
def do_reshape_split_one(st):
    """Reshape ``st`` by splitting one random axis into ``(size // k, k)``.

    ``k`` is drawn from the values in 1..5 that divide the axis size.
    """
    c = random.randint(0, len(st.shape)-1)
    # 1 always divides, so there is always at least one candidate
    poss = [n for n in [1,2,3,4,5] if st.shape[c]%n == 0]
    spl = random.choice(poss)
    shp = st.shape[0:c] + (st.shape[c]//spl, spl) + st.shape[c+1:]
    if DEBUG >= 1:
        print("st.reshape(", shp, ")")
    st.reshape(shp)
|
||||
|
||||
def do_reshape_combine_two(st):
    """Reshape ``st`` by merging two adjacent axes; does nothing for fewer than two axes."""
    if len(st.shape) < 2:
        return
    c = random.randint(0, len(st.shape)-2)
    shp = st.shape[:c] + (st.shape[c] * st.shape[c+1], ) + st.shape[c+2:]
    if DEBUG >= 1:
        print("st.reshape(", shp, ")")
    st.reshape(shp)
|
||||
|
||||
def do_shrink(st):
    """Shrink one random axis of ``st`` to a random non-empty range; other axes keep (0, size)."""
    c = random.randint(0, len(st.shape)-1)
    # redraw until every (start, end) pair has start < end
    while True:
        shrink = tuple((random.randint(0,s), random.randint(0,s)) if i == c else (0,s) for i,s in enumerate(st.shape))
        if all(x<y for (x,y) in shrink):
            break
    if DEBUG >= 1:
        print("st.shrink(", shrink, ")")
    st.shrink(shrink)
|
||||
|
||||
def do_stride(st):
    """Apply a stride of -2, -1 or 2 to one random axis of ``st``; other axes get stride 1."""
    c = random.randint(0, len(st.shape)-1)
    stride = tuple(random.choice([-2,-1,2]) if i==c else 1 for i in range(len(st.shape)))
    if DEBUG >= 1:
        print("st.stride(", stride, ")")
    st.stride(stride)
|
||||
|
||||
def do_expand(st):
    """Expand one random size-1 axis of ``st`` to 2, 3 or 4; does nothing if no axis has size 1."""
    unit_axes = [i for i,s in enumerate(st.shape) if s==1]
    if len(unit_axes) == 0:
        return
    c = random.choice(unit_axes)
    expand = tuple(random.choice([2,3,4]) if i==c else s for i,s in enumerate(st.shape))
    if DEBUG >= 1:
        print("st.expand(", expand, ")")
    st.expand(expand)
|
||||
|
||||
# Fuzz loop: 200 random 3-D trackers, 8 random movement ops each, then a consistency check.
if __name__ == "__main__":
    ops = [do_permute, do_pad, do_shrink, do_reshape_split_one, do_reshape_combine_two, do_stride, do_expand]
    for _ in range(200):
        st = CheckingShapeTracker((random.randint(2, 10), random.randint(2, 10), random.randint(2, 10)))
        for _i in range(8):
            random.choice(ops)(st)
        st.assert_same()
|
||||
69
tinygrad_repo/test/external/fuzz_symbolic.py
vendored
Normal file
69
tinygrad_repo/test/external/fuzz_symbolic.py
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
import itertools
|
||||
import random
|
||||
from tinygrad.helpers import DEBUG
|
||||
from tinygrad.shape.symbolic import Variable
|
||||
random.seed(42)
|
||||
|
||||
def add_v(expr, rng=None):
    """Add ``v[rng]`` to ``expr``; draw the index ``rng`` from 0..2 when omitted.

    ``v`` is the module-level list that the main loop assigns.
    Returns ``(result, rng)`` so the same index can be replayed.
    """
    if rng is None:
        rng = random.randint(0,2)
    return expr + v[rng], rng
|
||||
|
||||
def div(expr, rng=None):
    """Floor-divide ``expr`` by ``rng``; draw ``rng`` from 1..9 when omitted.

    Returns ``(result, rng)`` so the same divisor can be replayed.
    """
    if rng is None:
        rng = random.randint(1,9)
    return expr // rng, rng
|
||||
|
||||
def mul(expr, rng=None):
    """Multiply ``expr`` by ``rng``; draw ``rng`` from -4..4 when omitted.

    Returns ``(result, rng)`` so the same factor can be replayed.
    """
    if rng is None:
        rng = random.randint(-4,4)
    return expr * rng, rng
|
||||
|
||||
def mod(expr, rng=None):
    """Take ``expr`` modulo ``rng``; draw ``rng`` from 1..9 when omitted.

    Returns ``(result, rng)`` so the same modulus can be replayed.
    """
    if rng is None:
        rng = random.randint(1,9)
    return expr % rng, rng
|
||||
|
||||
def add_num(expr, rng=None):
    """Add the constant ``rng`` to ``expr``; draw ``rng`` from -4..4 when omitted.

    Returns ``(result, rng)`` so the same constant can be replayed.
    """
    if rng is None:
        rng = random.randint(-4,4)
    return expr + rng, rng
|
||||
|
||||
def lt(expr, rng=None):
  """Compare `expr < rng`.

  When `rng` is None a bound is drawn with random.randint(-4, 4).
  Returns the pair (expr < rng, rng) so the same bound can be replayed.
  """
  rng = random.randint(-4,4) if rng is None else rng
  return (expr < rng, rng)
|
||||
|
||||
def ge(expr, rng=None):
  """Compare `expr >= rng`.

  When `rng` is None a bound is drawn with random.randint(-4, 4).
  Returns the pair (expr >= rng, rng) so the same bound can be replayed.
  """
  rng = random.randint(-4,4) if rng is None else rng
  return (expr >= rng, rng)
|
||||
|
||||
def le(expr, rng=None):
  """Compare `expr <= rng`.

  When `rng` is None a bound is drawn with random.randint(-4, 4).
  Returns the pair (expr <= rng, rng) so the same bound can be replayed.
  """
  rng = random.randint(-4,4) if rng is None else rng
  return (expr <= rng, rng)
|
||||
|
||||
def gt(expr, rng=None):
  """Compare `expr > rng`.

  When `rng` is None a bound is drawn with random.randint(-4, 4).
  Returns the pair (expr > rng, rng) so the same bound can be replayed.
  """
  rng = random.randint(-4,4) if rng is None else rng
  return (expr > rng, rng)
|
||||
|
||||
if __name__ == "__main__":
  # Fuzz loop: build a random symbolic expression from a "tape" of operations,
  # then replay the same tape on plain integers at sampled variable values and
  # check that evaluating the rendered expression gives the same number.
  ops = [add_v, div, mul, add_num, mod]
  # candidate upper bounds never change, so build the list once
  upper_bounds = [*range(1, 10), 16, 32, 64, 128, 256]
  for _ in range(1000):
    u1 = Variable("v1", 0, random.choice(upper_bounds))
    u2 = Variable("v2", 0, random.choice(upper_bounds))
    u3 = Variable("v3", 0, random.choice(upper_bounds))
    # add_v reads this module-level list; it holds the symbolic variables while the expression is built
    v = [u1,u2,u3]
    tape = [random.choice(ops) for _ in range(random.randint(2, 30))]
    # 10% of the time, add one of lt, le, gt, ge
    if random.random() < 0.1: tape.append(random.choice([lt, le, gt, ge]))
    expr = Variable.num(0)
    rngs = []
    for t in tape:
      expr, rng = t(expr)
      if DEBUG >= 1: print(t.__name__, rng)
      rngs.append(rng)
    if DEBUG >=1: print(expr)
    # Sample flat indices instead of materializing the whole value space: with
    # bounds up to 256 the full product can reach 257**3 tuples, of which at
    # most 100 are ever used.
    n1, n2, n3 = (u.max - u.min + 1 for u in (u1, u2, u3))
    volume = n1 * n2 * n3
    # NOTE(review): in CPython, sampling range(volume) should pick the same
    # points as sampling the materialized product list for a given seed --
    # confirm if exact reproducibility of old runs matters.
    for idx in random.sample(range(volume), min(100, volume)):
      # decode in itertools.product order: the last variable varies fastest
      rest, i3 = divmod(idx, n3)
      i1, i2 = divmod(rest, n2)
      v1, v2, v3 = u1.min + i1, u2.min + i2, u3.min + i3
      # now point add_v at the concrete integers for the replay
      v = [v1,v2,v3]
      rn = 0
      for t,r in zip(tape, rngs): rn, _ = t(rn, r)
      # eval runs on a string this script generated itself; presumably render()
      # refers to the variables by their names v1/v2/v3, which are in scope here
      num = eval(expr.render())
      assert num == rn, f"mismatched {expr.render()} at {v1=} {v2=} {v3=} = {num} != {rn}"
      if DEBUG >= 1: print(f"matched {expr.render()} at {v1=} {v2=} {v3=} = {num} == {rn}")
|
||||
61
tinygrad_repo/test/external/graph_batchnorm.py
vendored
Normal file
61
tinygrad_repo/test/external/graph_batchnorm.py
vendored
Normal file
@@ -0,0 +1,61 @@
|
||||
import unittest
|
||||
from tinygrad.nn.state import get_parameters
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.nn import Conv2d, BatchNorm2d, optim
|
||||
|
||||
def model_step(lm):
  """Run a single SGD training step of `lm` on an all-ones input.

  Inside `Tensor.train()`: builds an (8, 12, 128, 256) input of ones, creates an
  SGD optimizer (lr=0.001) over `get_parameters(lm)`, sums `lm.forward(...)`
  into a scalar loss, backpropagates, and applies the optimizer step.
  """
  with Tensor.train():
    inputs = Tensor.ones(8,12,128,256, requires_grad=False)
    sgd = optim.SGD(get_parameters(lm), lr=0.001)
    total = lm.forward(inputs).sum()
    sgd.zero_grad()
    total.backward()
    # drop the local references to the input and loss before stepping, as the original did
    del inputs,total
    sgd.step()
|
||||
|
||||
class TestBatchnorm(unittest.TestCase):
  """Each test defines a tiny conv (and optionally batchnorm) model and runs model_step on it."""

  def test_conv(self):
    """Single bias-free 3x3 conv (12 -> 32 channels) followed by relu."""
    class LilModel:
      def __init__(self):
        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
      def forward(self, x):
        out = self.c(x)
        return out.relu()
    model_step(LilModel())

  def test_two_conv(self):
    """Two stacked bias-free 3x3 convs (12 -> 32 -> 32) with one relu at the end."""
    class LilModel:
      def __init__(self):
        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
        self.c2 = Conv2d(32, 32, 3, padding=1, bias=False)
      def forward(self, x):
        hidden = self.c(x)
        return self.c2(hidden).relu()
    model_step(LilModel())

  def test_two_conv_bn(self):
    """Two conv + batchnorm + relu stages (12 -> 24 -> 32), batchnorm without running stats."""
    class LilModel:
      def __init__(self):
        self.c = Conv2d(12, 24, 3, padding=1, bias=False)
        self.bn = BatchNorm2d(24, track_running_stats=False)
        self.c2 = Conv2d(24, 32, 3, padding=1, bias=False)
        self.bn2 = BatchNorm2d(32, track_running_stats=False)
      def forward(self, x):
        first = self.bn(self.c(x)).relu()
        second = self.bn2(self.c2(first))
        return second.relu()
    model_step(LilModel())

  def test_conv_bn(self):
    """One conv + batchnorm + relu stage (12 -> 32), batchnorm without running stats."""
    class LilModel:
      def __init__(self):
        self.c = Conv2d(12, 32, 3, padding=1, bias=False)
        self.bn = BatchNorm2d(32, track_running_stats=False)
      def forward(self, x):
        normed = self.bn(self.c(x))
        return normed.relu()
    model_step(LilModel())
|
||||
|
||||
|
||||
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
  unittest.main()
|
||||
74
tinygrad_repo/test/external/test_example.py
vendored
Normal file
74
tinygrad_repo/test/external/test_example.py
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
import unittest
|
||||
import numpy as np
|
||||
from tinygrad.ops import Device
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.helpers import getenv, CI
|
||||
|
||||
def multidevice_test(fxn):
  """Decorator that runs `fxn(self, device)` once per usable device.

  The returned method iterates over `Device._buffers` and, for each name:
    - skips "DISK", "SHM" and "FAKE" outright;
    - skips names listed in the comma-separated EXCLUDE_DEVICES env var
      (read once, when the decorator is applied);
    - inside a `self.subTest(device=...)`, looks up `Device[device]` and skips
      the device if that lookup raises, otherwise calls `fxn(self, device)`.
  Progress and warning messages are printed only when CI is falsy.
  """
  exclude_devices = getenv("EXCLUDE_DEVICES", "").split(",")
  never_tested = ("DISK", "SHM", "FAKE")
  def ret(self):
    for device in Device._buffers:
      if device in never_tested: continue
      if not CI: print(device)
      if device in exclude_devices:
        if not CI: print(f"WARNING: {device} test is excluded")
        continue
      with self.subTest(device=device):
        try:
          # probe the device; any failure here means it is not available
          Device[device]
        except Exception:
          if not CI: print(f"WARNING: {device} test isn't running")
        else:
          fxn(self, device)
  return ret
|
||||
|
||||
class TestExample(unittest.TestCase):
  """Small end-to-end tensor examples; multidevice_test supplies the `device` argument."""

  @multidevice_test
  def test_convert_to_cpu(self, device):
    """A 2x2 tensor keeps its shape on `device` and after `.cpu()`."""
    on_device = Tensor([[1,2],[3,4]], device=device)
    assert on_device.numpy().shape == (2,2)
    on_cpu = on_device.cpu()
    assert on_cpu.numpy().shape == (2,2)

  @multidevice_test
  def test_2_plus_3(self, device):
    """Adding [2] and [3] on `device` yields 5."""
    a, b = Tensor([2], device=device), Tensor([3], device=device)
    result = a + b
    print(f"{a.numpy()} + {b.numpy()} = {result.numpy()}")
    assert result.numpy()[0] == 5.

  @multidevice_test
  def test_example_readme(self, device):
    """Backprop through a 1x3 by 3x3 matmul; both gradients must live on `device`."""
    x = Tensor.eye(3, device=device, requires_grad=True)
    y = Tensor([[2.0,0,-2.0]], device=device, requires_grad=True)
    y.matmul(x).sum().backward()

    # pull both gradients to numpy first (dz/dx, then dz/dy), then check where they live
    for leaf in (x, y): leaf.grad.numpy()
    for leaf in (x, y): assert leaf.grad.device == device

  @multidevice_test
  def test_example_matmul(self, device):
    """Backprop through a 64x64 identity matmul; both gradients must live on `device`."""
    # the test probes the device itself as well and returns quietly if the lookup raises
    try:
      Device[device]
    except Exception:
      print(f"WARNING: {device} test isn't running")
      return

    x = Tensor.eye(64, device=device, requires_grad=True)
    y = Tensor.eye(64, device=device, requires_grad=True)
    y.matmul(x).sum().backward()

    # pull both gradients to numpy first (dz/dx, then dz/dy), then check where they live
    for leaf in (x, y): leaf.grad.numpy()
    for leaf in (x, y): assert leaf.grad.device == device
|
||||
|
||||
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
  unittest.main()
|
||||
Reference in New Issue
Block a user