This commit is contained in:
Your Name
2024-04-27 03:27:27 -05:00
parent 886a019ad5
commit c22b4866eb
55 changed files with 49557 additions and 116523 deletions

View File

@@ -0,0 +1,72 @@
Import('env', 'envCython', 'arch', 'cereal', 'messaging', 'common', 'gpucommon', 'visionipc', 'transformations')
lenv = env.Clone()
lenvCython = envCython.Clone()
libs = [cereal, messaging, visionipc, gpucommon, common, 'capnp', 'zmq', 'kj', 'pthread']
frameworks = []
common_src = [
"models/commonmodel.cc",
"transforms/loadyuv.cc",
"transforms/transform.cc",
]
thneed_src_common = [
"thneed/thneed_common.cc",
"thneed/serialize.cc",
]
thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
# SNPE except on Mac and ARM Linux
snpe_lib = []
if arch != "Darwin" and arch != "aarch64":
common_src += ['runners/snpemodel.cc']
snpe_lib += ['SNPE']
# OpenCL is a framework on Mac
if arch == "Darwin":
frameworks += ['OpenCL']
else:
libs += ['OpenCL']
# Set path definitions
for pathdef, fn in {'TRANSFORM': 'transforms/transform.cl', 'LOADYUV': 'transforms/loadyuv.cl'}.items():
for xenv in (lenv, lenvCython):
xenv['CXXFLAGS'].append(f'-D{pathdef}_PATH=\\"{File(fn).abspath}\\"')
# Compile cython
snpe_rpath_qcom = "/data/pythonpath/third_party/snpe/larch64"
snpe_rpath_pc = f"{Dir('#').abspath}/third_party/snpe/x86_64-linux-clang"
snpe_rpath = lenvCython['RPATH'] + [snpe_rpath_qcom if arch == "larch64" else snpe_rpath_pc]
cython_libs = envCython["LIBS"] + libs
snpemodel_lib = lenv.Library('snpemodel', ['runners/snpemodel.cc'])
commonmodel_lib = lenv.Library('commonmodel', common_src)
lenvCython.Program('runners/runmodel_pyx.so', 'runners/runmodel_pyx.pyx', LIBS=cython_libs, FRAMEWORKS=frameworks)
lenvCython.Program('runners/snpemodel_pyx.so', 'runners/snpemodel_pyx.pyx', LIBS=[snpemodel_lib, snpe_lib, *cython_libs], FRAMEWORKS=frameworks, RPATH=snpe_rpath)
lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks)
# Get model metadata
fn = File("models/supercombo").abspath
cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
files = sum([lenv.Glob("#"+x) for x in open(File("#release/files_common").abspath).read().split("\n") if x.endswith("get_model_metadata.py")], [])
lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"]+files, cmd)
# Build thneed model
if arch == "larch64" or GetOption('pc_thneed'):
tinygrad_opts = []
if not GetOption('pc_thneed'):
# use FLOAT16 on device for speed + don't cache the CL kernels for space
tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"
tinygrad_files = sum([lenv.Glob("#"+x) for x in open(File("#release/files_common").abspath).read().split("\n") if x.startswith("tinygrad_repo/")], [])
lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'zmq', 'OpenCL', 'dl'])
thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'zmq', 'OpenCL'])

View File

@@ -24,6 +24,7 @@ class ModelConstants:
LAT_PLANNER_STATE_LEN = 4
LATERAL_CONTROL_PARAMS_LEN = 2
PREV_DESIRED_CURV_LEN = 1
+RADAR_TRACKS_LEN = 64
# model outputs constants
FCW_THRESHOLDS_5MS2 = np.array([.05, .05, .15, .15, .15], dtype=np.float32)
@@ -42,6 +43,7 @@ class ModelConstants:
DESIRE_PRED_WIDTH = 8
LAT_PLANNER_SOLUTION_WIDTH = 4
DESIRED_CURV_WIDTH = 1
+RADAR_TRACKS_WIDTH = 3
NUM_LANE_LINES = 4
NUM_ROAD_EDGES = 2

View File

@@ -6,7 +6,6 @@ import time
import ctypes
import numpy as np
from pathlib import Path
-from typing import Tuple, Dict
from cereal import messaging
from cereal.messaging import PubMaster, SubMaster
@@ -53,7 +52,7 @@ class DMonitoringModelResult(ctypes.Structure):
("wheel_on_right_prob", ctypes.c_float)]
class ModelState:
-inputs: Dict[str, np.ndarray]
+inputs: dict[str, np.ndarray]
output: np.ndarray
model: ModelRunner
@@ -68,7 +67,7 @@ class ModelState:
self.model.addInput("input_img", None)
self.model.addInput("calib", self.inputs['calib'])
-def run(self, buf:VisionBuf, calib:np.ndarray) -> Tuple[np.ndarray, float]:
+def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
self.inputs['calib'][:] = calib
v_offset = buf.height - MODEL_HEIGHT

View File

@@ -1,7 +1,6 @@
import os
import capnp
import numpy as np
-from typing import Dict
from cereal import log
from openpilot.selfdrive.modeld.constants import ModelConstants, Plan, Meta
@@ -42,7 +41,7 @@ def fill_xyvat(builder, t, x, y, v, a, x_std=None, y_std=None, v_std=None, a_std
if a_std is not None:
builder.aStd = a_std.tolist()
-def fill_model_msg(msg: capnp._DynamicStructBuilder, net_output_data: Dict[str, np.ndarray], publish_state: PublishState,
+def fill_model_msg(msg: capnp._DynamicStructBuilder, net_output_data: dict[str, np.ndarray], publish_state: PublishState,
vipc_frame_id: int, vipc_frame_id_extra: int, frame_id: int, frame_drop: float,
timestamp_eof: int, timestamp_llk: int, model_execution_time: float,
nav_enabled: bool, valid: bool) -> None:
@@ -104,11 +103,12 @@ def fill_model_msg(msg: capnp._DynamicStructBuilder, net_output_data: Dict[str,
lane_line = modelV2.laneLines[i]
far_lane, near_lane, road_edge = (0, 1, 0) if i == 4 else (3, 2, 1)
-y_min = net_output_data['lane_lines'][0, near_lane,:,0]
-z_min = net_output_data['lane_lines'][0, near_lane,:,1]
lane_diff = np.abs(net_output_data['lane_lines'][0,near_lane] - net_output_data['lane_lines'][0,far_lane])
road_edge_diff = np.abs(net_output_data['lane_lines'][0,near_lane] - net_output_data['road_edges'][0,road_edge])
+y_min = net_output_data['lane_lines'][0, near_lane,:,0]
+z_min = net_output_data['lane_lines'][0, near_lane,:,1]
y_min += np.where(lane_diff[:,0] < road_edge_diff[:,0], net_output_data['lane_lines'][0,far_lane,:,0], net_output_data['road_edges'][0,road_edge,:,0])
z_min += np.where(lane_diff[:,1] < road_edge_diff[:,1], net_output_data['lane_lines'][0,far_lane,:,1], net_output_data['road_edges'][0,road_edge,:,1])
@@ -192,7 +192,7 @@ def fill_model_msg(msg: capnp._DynamicStructBuilder, net_output_data: Dict[str,
if SEND_RAW_PRED:
modelV2.rawPredictions = net_output_data['raw_pred'].tobytes()
-def fill_pose_msg(msg: capnp._DynamicStructBuilder, net_output_data: Dict[str, np.ndarray],
+def fill_pose_msg(msg: capnp._DynamicStructBuilder, net_output_data: dict[str, np.ndarray],
vipc_frame_id: int, vipc_dropped_frames: int, timestamp_eof: int, live_calib_seen: bool) -> None:
msg.valid = live_calib_seen & (vipc_dropped_frames < 1)
cameraOdometry = msg.cameraOdometry

View File

@@ -4,9 +4,8 @@ import pathlib
import onnx
import codecs
import pickle
-from typing import Tuple
-def get_name_and_shape(value_info:onnx.ValueInfoProto) -> Tuple[str, Tuple[int,...]]:
+def get_name_and_shape(value_info:onnx.ValueInfoProto) -> tuple[str, tuple[int,...]]:
shape = tuple([int(dim.dim_value) for dim in value_info.type.tensor_type.shape.dim])
name = value_info.name
return name, shape

Binary file not shown.

View File

@@ -6,7 +6,6 @@ import numpy as np
import cereal.messaging as messaging
from cereal import car, log
from pathlib import Path
-from typing import Dict, Optional
from setproctitle import setproctitle
from cereal.messaging import PubMaster, SubMaster
from cereal.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
@@ -14,6 +13,7 @@ from openpilot.common.swaglog import cloudlog
from openpilot.common.params import Params
from openpilot.common.filter_simple import FirstOrderFilter
from openpilot.common.realtime import config_realtime_process
+from openpilot.common.transformations.camera import DEVICE_CAMERAS
from openpilot.common.transformations.model import get_warp_matrix
from openpilot.selfdrive import sentry
from openpilot.selfdrive.car.car_helpers import get_demo_car_params
@@ -24,15 +24,18 @@ from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_
from openpilot.selfdrive.modeld.constants import ModelConstants
from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext
-from openpilot.selfdrive.frogpilot.functions.frogpilot_functions import DEFAULT_MODEL
+from openpilot.selfdrive.frogpilot.controls.lib.model_manager import DEFAULT_MODEL, MODELS_PATH, NAVIGATIONLESS_MODELS, RADARLESS_MODELS
PROCESS_NAME = "selfdrive.modeld.modeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
MODEL_NAME = Params().get("Model", encoding='utf-8')
+DISABLE_NAV = MODEL_NAME in NAVIGATIONLESS_MODELS
+DISABLE_RADAR = MODEL_NAME in RADARLESS_MODELS
MODEL_PATHS = {
-ModelRunner.THNEED: Path(__file__).parent / ('models/supercombo.thneed' if MODEL_NAME == DEFAULT_MODEL else f'models/models/{MODEL_NAME}.thneed'),
+ModelRunner.THNEED: Path(__file__).parent / ('models/supercombo.thneed' if MODEL_NAME == DEFAULT_MODEL else f'{MODELS_PATH}/{MODEL_NAME}.thneed'),
ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
@@ -49,7 +52,7 @@ class FrameMeta:
class ModelState:
frame: ModelFrame
wide_frame: ModelFrame
-inputs: Dict[str, np.ndarray]
+inputs: dict[str, np.ndarray]
output: np.ndarray
prev_desire: np.ndarray # for tracking the rising edge of the pulse
model: ModelRunner
@@ -63,9 +66,10 @@ class ModelState:
'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
-'nav_features': np.zeros(ModelConstants.NAV_FEATURE_LEN, dtype=np.float32),
-'nav_instructions': np.zeros(ModelConstants.NAV_INSTRUCTION_LEN, dtype=np.float32),
+**({'nav_features': np.zeros(ModelConstants.NAV_FEATURE_LEN, dtype=np.float32),
+'nav_instructions': np.zeros(ModelConstants.NAV_INSTRUCTION_LEN, dtype=np.float32)} if not DISABLE_NAV else {}),
'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
+**({'radar_tracks': np.zeros(ModelConstants.RADAR_TRACKS_LEN * ModelConstants.RADAR_TRACKS_WIDTH, dtype=np.float32)} if DISABLE_RADAR else {}),
}
with open(METADATA_PATH, 'rb') as f:
@@ -82,14 +86,14 @@ class ModelState:
for k,v in self.inputs.items():
self.model.addInput(k, v)
-def slice_outputs(self, model_outputs: np.ndarray) -> Dict[str, np.ndarray]:
+def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
if SEND_RAW_PRED:
parsed_model_outputs['raw_pred'] = model_outputs.copy()
return parsed_model_outputs
def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_wide: np.ndarray,
-inputs: Dict[str, np.ndarray], prepare_only: bool) -> Optional[Dict[str, np.ndarray]]:
+inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
# Model decides when action is completed, so desire input is just a pulse triggered on rising edge
inputs['desire'][0] = 0
self.inputs['desire'][:-ModelConstants.DESIRE_LEN] = self.inputs['desire'][ModelConstants.DESIRE_LEN:]
@@ -98,8 +102,11 @@ class ModelState:
self.inputs['traffic_convention'][:] = inputs['traffic_convention']
self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']
+if not DISABLE_NAV:
self.inputs['nav_features'][:] = inputs['nav_features']
self.inputs['nav_instructions'][:] = inputs['nav_instructions']
+if DISABLE_RADAR:
+self.inputs['radar_tracks'][:] = inputs['radar_tracks']
# if getCLBuffer is not None, frame will be None
self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
@@ -158,7 +165,7 @@ def main(demo=False):
# messaging
pm = PubMaster(["modelV2", "cameraOdometry"])
-sm = SubMaster(["carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "navModel", "navInstruction", "carControl", "frogpilotPlan"])
+sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "navModel", "navInstruction", "carControl", "liveTracks", "frogpilotPlan"])
publish_state = PublishState()
params = Params()
@@ -230,10 +237,11 @@ def main(demo=False):
is_rhd = sm["driverMonitoringState"].isRHD
frame_id = sm["roadCameraState"].frameId
lateral_control_params = np.array([sm["carState"].vEgo, steer_delay], dtype=np.float32)
-if sm.updated["liveCalibration"]:
+if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
-model_transform_main = get_warp_matrix(device_from_calib_euler, main_wide_camera, False).astype(np.float32)
-model_transform_extra = get_warp_matrix(device_from_calib_euler, True, True).astype(np.float32)
+dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
+model_transform_main = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics if main_wide_camera else dc.fcam.intrinsics, False).astype(np.float32)
+model_transform_extra = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics, True).astype(np.float32)
live_calib_seen = True
traffic_convention = np.zeros(2)
@@ -246,7 +254,7 @@ def main(demo=False):
# Enable/disable nav features
timestamp_llk = sm["navModel"].locationMonoTime
nav_valid = sm.valid["navModel"] # and (nanos_since_boot() - timestamp_llk < 1e9)
-nav_enabled = nav_valid and (params.get_bool("ExperimentalMode") or params.get_bool("NavChill"))
+nav_enabled = nav_valid and not DISABLE_NAV
if not nav_enabled:
nav_features[:] = 0
@@ -267,6 +275,14 @@ def main(demo=False):
if 0 <= distance_idx < 50:
nav_instructions[distance_idx*3 + direction_idx] = 1
+radar_tracks = np.zeros(ModelConstants.RADAR_TRACKS_LEN * ModelConstants.RADAR_TRACKS_WIDTH, dtype=np.float32)
+if sm.updated["liveTracks"]:
+for i, track in enumerate(sm["liveTracks"]):
+if i >= ModelConstants.RADAR_TRACKS_LEN:
+break
+vec_index = i * ModelConstants.RADAR_TRACKS_WIDTH
+radar_tracks[vec_index:vec_index+ModelConstants.RADAR_TRACKS_WIDTH] = [track.dRel, track.yRel, track.vRel]
# tracked dropped frames
vipc_dropped_frames = max(0, meta_main.frame_id - last_vipc_frame_id - 1)
frames_dropped = frame_dropped_filter.update(min(vipc_dropped_frames, 10))
@@ -280,12 +296,13 @@ def main(demo=False):
if prepare_only:
cloudlog.error(f"skipping model eval. Dropped {vipc_dropped_frames} frames")
-inputs:Dict[str, np.ndarray] = {
+inputs:dict[str, np.ndarray] = {
'desire': vec_desire,
'traffic_convention': traffic_convention,
'lateral_control_params': lateral_control_params,
-'nav_features': nav_features,
-'nav_instructions': nav_instructions}
+**({'nav_features': nav_features, 'nav_instructions': nav_instructions} if not DISABLE_NAV else {}),
+**({'radar_tracks': radar_tracks,} if DISABLE_RADAR else {}),
+}
mt1 = time.perf_counter()
model_output = model.run(buf_main, buf_extra, model_transform_main, model_transform_extra, inputs, prepare_only)

View File

@@ -0,0 +1,66 @@
## Neural networks in openpilot
To view the architecture of the ONNX networks, you can use [netron](https://netron.app/)
## Supercombo
### Supercombo input format (Full size: 799906 x float32)
* **image stream**
  * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
    * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
      * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2] (see the packing sketch after this list)
      * Channel 4 represents the half-res U channel
      * Channel 5 represents the half-res V channel
* **wide image stream**
  * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
    * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
      * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
      * Channel 4 represents the half-res U channel
      * Channel 5 represents the half-res V channel
* **desire**
  * one-hot encoded buffer to command the model to execute certain actions; the bit needs to be sent for the past 5 seconds (at 20 FPS) : 100 * 8
* **traffic convention**
  * one-hot encoded vector to tell the model whether traffic is right-hand or left-hand : 2
* **feature buffer**
  * A buffer of intermediate features that gets appended to the current feature to form a 5 second temporal context (at 20 FPS) : 99 * 512
* **nav features**
  * 1 * 150
* **nav instructions**
  * 1 * 256
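
As a rough illustration of the image stream channel packing above, here is a minimal numpy sketch. It is illustrative only; the real preprocessing runs on the GPU via `ModelFrame` and `transforms/loadyuv.cl`.

```python
import numpy as np

def yuv420_to_6ch(y: np.ndarray, u: np.ndarray, v: np.ndarray) -> np.ndarray:
  # y: (256, 512) full-res luma plane; u, v: (128, 256) half-res chroma planes
  assert y.shape == (256, 512) and u.shape == v.shape == (128, 256)
  return np.stack([
    y[::2, ::2], y[::2, 1::2],    # channels 0, 1
    y[1::2, ::2], y[1::2, 1::2],  # channels 2, 3
    u, v,                         # channels 4, 5
  ])  # -> (6, 128, 256)

# two consecutive frames per image stream: (2, 6, 128, 256) = 393216 values
y = np.zeros((256, 512), dtype=np.uint8)
u = v = np.zeros((128, 256), dtype=np.uint8)
frames = np.stack([yuv420_to_6ch(y, u, v), yuv420_to_6ch(y, u, v)])
assert frames.size == 2 * 6 * 128 * 256
```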
### Supercombo output format (Full size: XXX x float32)
Read [here](https://github.com/commaai/openpilot/blob/90af436a121164a51da9fa48d093c29f738adf6a/selfdrive/modeld/models/driving.h#L236) for more.
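
A hedged sketch of how the flat output vector gets carved up by name, mirroring `slice_outputs` in modeld.py above. The `output_slices`/`output_shapes` metadata keys and the relative path are assumptions based on how modeld.py reads the file; check `get_model_metadata.py` for the exact schema.

```python
import pickle
import numpy as np
from openpilot.selfdrive.modeld.parse_model_outputs import Parser

# key names here are assumptions based on how modeld.py uses this pickle
with open('models/supercombo_metadata.pkl', 'rb') as f:
  metadata = pickle.load(f)

raw = np.zeros(metadata['output_shapes']['outputs'][1], dtype=np.float32)  # one flat output vector
sliced = {name: raw[np.newaxis, sl] for name, sl in metadata['output_slices'].items()}
parsed = Parser().parse_outputs(sliced)  # MDN selection and softmax/sigmoid post-processing
print({k: v.shape for k, v in parsed.items()})
```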
## Driver Monitoring Model
* .onnx model can be run with onnx runtimes
* .dlc file is a pre-quantized model and only runs on qualcomm DSPs
### input format
* single image W = 1440 H = 960 luminance channel (Y) from the planar YUV420 format:
  * full input size is 1440 * 960 = 1382400
  * normalized, ranging from 0.0 to 1.0 in float32 (onnx runner) or ranging from 0 to 255 in uint8 (snpe runner)
* camera calibration angles (roll, pitch, yaw) from liveCalibration: 3 x float32 inputs
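
A minimal sketch of building these two inputs in numpy. It is illustrative only: the shapes follow the list above, and any row-stride padding in the real camera buffer is ignored.

```python
import numpy as np

H, W = 960, 1440                                    # model input image size from the list above
yuv420 = np.zeros((H * 3 // 2, W), dtype=np.uint8)  # planar YUV420 buffer, Y plane first
y_plane = yuv420[:H, :]                             # only the luminance channel is used

calib = np.zeros(3, dtype=np.float32)               # roll, pitch, yaw from liveCalibration

input_img_onnx = (y_plane.astype(np.float32) / 255.0).ravel()  # 1382400 floats in [0.0, 1.0]
input_img_snpe = y_plane.ravel()                               # 1382400 uint8 in [0, 255]
assert input_img_onnx.size == 1440 * 960 == 1382400
```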
### output format
* 84 x float32 outputs = 2 + 41 * 2 ([parsing example](https://github.com/commaai/openpilot/blob/22ce4e17ba0d3bfcf37f8255a4dd1dc683fe0c38/selfdrive/modeld/models/dmonitoring.cc#L33))
  * for each person in the front seats (2 * 41)
    * face pose: 12 = 6 + 6
      * face orientation [pitch, yaw, roll] in camera frame: 3
      * face position [dx, dy] relative to image center: 2
      * normalized face size: 1
      * standard deviations for above outputs: 6
    * face visible probability: 1
    * eyes: 20 = (8 + 1) + (8 + 1) + 1 + 1
      * eye position and size, and their standard deviations: 8
      * eye visible probability: 1
      * eye closed probability: 1
    * wearing sunglasses probability: 1
    * face occluded probability: 1
    * touching wheel probability: 1
    * paying attention probability: 1
    * (deprecated) distracted probabilities: 2
    * using phone probability: 1
    * distracted probability: 1
  * common outputs 2
    * poor camera vision probability: 1
    * left hand drive probability: 1
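
A rough indexing sketch of those 84 floats. The block order (two 41-value driver blocks, then the two common outputs) matches the `DMonitoringModelResult` ctypes struct shown earlier, which ends with `wheel_on_right_prob`; the offsets inside a driver block simply follow the list above, so treat them as illustrative rather than authoritative.

```python
import numpy as np

raw = np.zeros(84, dtype=np.float32)          # one driver monitoring output vector

driver_lhd, driver_rhd, common = raw[:41], raw[41:82], raw[82:]

def split_driver_block(d):
  # offsets follow the per-driver breakdown listed above
  return {
    'face_pose': d[0:12],        # orientation (3) + position (2) + size (1) + stds (6)
    'face_prob': d[12],
    'eyes': d[13:33],            # (8 + 1) + (8 + 1) + 1 + 1
    'sunglasses_prob': d[33],
    'occluded_prob': d[34],
    'touching_wheel_prob': d[35],
    'paying_attention_prob': d[36],
    'deprecated_distracted': d[37:39],
    'using_phone_prob': d[39],
    'distracted_prob': d[40],
  }

poor_vision_prob, wheel_on_right_prob = common
print(list(split_driver_block(driver_lhd).keys()))
```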

View File

@@ -0,0 +1,72 @@
#include "selfdrive/modeld/models/commonmodel.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include "common/clutil.h"
#include "common/mat.h"
#include "common/timing.h"
ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
input_frames = std::make_unique<float[]>(buf_size);
q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
net_input_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_FRAME_SIZE * sizeof(float), NULL, &err));
transform_init(&transform, context, device_id);
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
}
float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
transform_queue(&this->transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
if (output == NULL) {
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} else {
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true);
// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return NULL;
}
}
ModelFrame::~ModelFrame() {
transform_destroy(&transform);
loadyuv_destroy(&loadyuv);
CL_CHECK(clReleaseMemObject(net_input_cl));
CL_CHECK(clReleaseMemObject(v_cl));
CL_CHECK(clReleaseMemObject(u_cl));
CL_CHECK(clReleaseMemObject(y_cl));
CL_CHECK(clReleaseCommandQueue(q));
}
void softmax(const float* input, float* output, size_t len) {
const float max_val = *std::max_element(input, input + len);
float denominator = 0;
for (int i = 0; i < len; i++) {
float const v_exp = expf(input[i] - max_val);
denominator += v_exp;
output[i] = v_exp;
}
const float inv_denominator = 1. / denominator;
for (int i = 0; i < len; i++) {
output[i] *= inv_denominator;
}
}
float sigmoid(float input) {
return 1 / (1 + expf(-input));
}

View File

@@ -0,0 +1,47 @@
#pragma once
#include <cfloat>
#include <cstdlib>
#include <memory>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include "common/mat.h"
#include "cereal/messaging/messaging.h"
#include "selfdrive/modeld/transforms/loadyuv.h"
#include "selfdrive/modeld/transforms/transform.h"
const bool send_raw_pred = getenv("SEND_RAW_PRED") != NULL;
void softmax(const float* input, float* output, size_t len);
float sigmoid(float input);
template<class T, size_t size>
constexpr const kj::ArrayPtr<const T> to_kj_array_ptr(const std::array<T, size> &arr) {
return kj::ArrayPtr(arr.data(), arr.size());
}
class ModelFrame {
public:
ModelFrame(cl_device_id device_id, cl_context context);
~ModelFrame();
float* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
const int MODEL_WIDTH = 512;
const int MODEL_HEIGHT = 256;
const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
const int buf_size = MODEL_FRAME_SIZE * 2;
private:
Transform transform;
LoadYUVState loadyuv;
cl_command_queue q;
cl_mem y_cl, u_cl, v_cl, net_input_cl;
std::unique_ptr<float[]> input_frames;
};

File diff suppressed because it is too large

View File

@@ -0,0 +1,2 @@
60abed33-9f25-4e34-9937-aaf918d41dfc
50887963963a3022e85ac0c94b0801aef955a608

Binary file not shown.

File diff suppressed because it is too large

View File

@@ -5,7 +5,6 @@ import time
import ctypes
import numpy as np
from pathlib import Path
-from typing import Tuple, Dict
from cereal import messaging
from cereal.messaging import PubMaster, SubMaster
@@ -41,7 +40,7 @@ class NavModelResult(ctypes.Structure):
("features", ctypes.c_float*NAV_FEATURE_LEN)]
class ModelState:
-inputs: Dict[str, np.ndarray]
+inputs: dict[str, np.ndarray]
output: np.ndarray
model: ModelRunner
@@ -52,7 +51,7 @@ class ModelState:
self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.DSP, True, None)
self.model.addInput("input_img", None)
-def run(self, buf:np.ndarray) -> Tuple[np.ndarray, float]:
+def run(self, buf:np.ndarray) -> tuple[np.ndarray, float]:
self.inputs['input_img'][:] = buf
t1 = time.perf_counter()

View File

@@ -1,5 +1,4 @@
import numpy as np
-from typing import Dict
from openpilot.selfdrive.modeld.constants import ModelConstants
def sigmoid(x):
@@ -82,7 +81,7 @@ class Parser:
outs[name] = pred_mu_final.reshape(final_shape)
outs[name + '_stds'] = pred_std_final.reshape(final_shape)
-def parse_outputs(self, outs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+def parse_outputs(self, outs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
self.parse_mdn('plan', outs, in_N=ModelConstants.PLAN_MHP_N, out_N=ModelConstants.PLAN_MHP_SELECTION,
out_shape=(ModelConstants.IDX_N,ModelConstants.PLAN_WIDTH))
self.parse_mdn('lane_lines', outs, in_N=0, out_N=0, out_shape=(ModelConstants.NUM_LANE_LINES,ModelConstants.IDX_N,ModelConstants.LANE_LINES_WIDTH))

View File

@@ -3,7 +3,7 @@ import itertools
import os
import sys
import numpy as np
-from typing import Tuple, Dict, Union, Any
+from typing import Any
from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel
@@ -38,7 +38,7 @@ def create_ort_session(path, fp16_to_fp32):
options = ort.SessionOptions()
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
-provider: Union[str, Tuple[str, Dict[Any, Any]]]
+provider: str | tuple[str, dict[Any, Any]]
if 'OpenVINOExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
provider = 'OpenVINOExecutionProvider'
elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:

View File

@@ -0,0 +1,4 @@
#pragma once
#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/runners/snpemodel.h"

View File

@@ -0,0 +1,49 @@
#pragma once
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include "common/clutil.h"
#include "common/swaglog.h"
#define USE_CPU_RUNTIME 0
#define USE_GPU_RUNTIME 1
#define USE_DSP_RUNTIME 2
struct ModelInput {
const std::string name;
float *buffer;
int size;
ModelInput(const std::string _name, float *_buffer, int _size) : name(_name), buffer(_buffer), size(_size) {}
virtual void setBuffer(float *_buffer, int _size) {
assert(size == _size || size == 0);
buffer = _buffer;
size = _size;
}
};
class RunModel {
public:
std::vector<std::unique_ptr<ModelInput>> inputs;
virtual ~RunModel() {}
virtual void execute() {}
virtual void* getCLBuffer(const std::string name) { return nullptr; }
virtual void addInput(const std::string name, float *buffer, int size) {
inputs.push_back(std::unique_ptr<ModelInput>(new ModelInput(name, buffer, size)));
}
virtual void setInputBuffer(const std::string name, float *buffer, int size) {
for (auto &input : inputs) {
if (name == input->name) {
input->setBuffer(buffer, size);
return;
}
}
LOGE("Tried to update input `%s` but no input with this name exists", name.c_str());
assert(false);
}
};

File diff suppressed because it is too large

View File

@@ -0,0 +1,116 @@
#pragma clang diagnostic ignored "-Wexceptions"
#include "selfdrive/modeld/runners/snpemodel.h"
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "common/util.h"
#include "common/timing.h"
void PrintErrorStringAndExit() {
std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
std::exit(EXIT_FAILURE);
}
SNPEModel::SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool _use_tf8, cl_context context) {
output = _output;
output_size = _output_size;
use_tf8 = _use_tf8;
#ifdef QCOM2
if (runtime == USE_GPU_RUNTIME) {
snpe_runtime = zdl::DlSystem::Runtime_t::GPU;
} else if (runtime == USE_DSP_RUNTIME) {
snpe_runtime = zdl::DlSystem::Runtime_t::DSP;
} else {
snpe_runtime = zdl::DlSystem::Runtime_t::CPU;
}
assert(zdl::SNPE::SNPEFactory::isRuntimeAvailable(snpe_runtime));
#endif
model_data = util::read_file(path);
assert(model_data.size() > 0);
// load model
std::unique_ptr<zdl::DlContainer::IDlContainer> container = zdl::DlContainer::IDlContainer::open((uint8_t*)model_data.data(), model_data.size());
if (!container) { PrintErrorStringAndExit(); }
LOGW("loaded model with size: %lu", model_data.size());
// create model runner
zdl::SNPE::SNPEBuilder snpe_builder(container.get());
while (!snpe) {
#ifdef QCOM2
snpe = snpe_builder.setOutputLayers({})
.setRuntimeProcessor(snpe_runtime)
.setUseUserSuppliedBuffers(true)
.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
.build();
#else
snpe = snpe_builder.setOutputLayers({})
.setUseUserSuppliedBuffers(true)
.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
.build();
#endif
if (!snpe) std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
}
// create output buffer
zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
const auto &output_tensor_names_opt = snpe->getOutputTensorNames();
if (!output_tensor_names_opt) throw std::runtime_error("Error obtaining output tensor names");
const auto &output_tensor_names = *output_tensor_names_opt;
assert(output_tensor_names.size() == 1);
const char *output_tensor_name = output_tensor_names.at(0);
const zdl::DlSystem::TensorShape &buffer_shape = snpe->getInputOutputBufferAttributes(output_tensor_name)->getDims();
if (output_size != 0) {
assert(output_size == buffer_shape[1]);
} else {
output_size = buffer_shape[1];
}
std::vector<size_t> output_strides = {output_size * sizeof(float), sizeof(float)};
output_buffer = ub_factory.createUserBuffer(output, output_size * sizeof(float), output_strides, &ub_encoding_float);
output_map.add(output_tensor_name, output_buffer.get());
}
void SNPEModel::addInput(const std::string name, float *buffer, int size) {
const int idx = inputs.size();
const auto &input_tensor_names_opt = snpe->getInputTensorNames();
if (!input_tensor_names_opt) throw std::runtime_error("Error obtaining input tensor names");
const auto &input_tensor_names = *input_tensor_names_opt;
const char *input_tensor_name = input_tensor_names.at(idx);
const bool input_tf8 = use_tf8 && strcmp(input_tensor_name, "input_img") == 0; // TODO: This is a terrible hack, get rid of this name check both here and in onnx_runner.py
LOGW("adding index %d: %s", idx, input_tensor_name);
zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
zdl::DlSystem::UserBufferEncodingTf8 ub_encoding_tf8(0, 1./255); // network takes 0-1
zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
zdl::DlSystem::UserBufferEncoding *input_encoding = input_tf8 ? (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_tf8 : (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_float;
const auto &buffer_shape_opt = snpe->getInputDimensions(input_tensor_name);
const zdl::DlSystem::TensorShape &buffer_shape = *buffer_shape_opt;
size_t size_of_input = input_tf8 ? sizeof(uint8_t) : sizeof(float);
std::vector<size_t> strides(buffer_shape.rank());
strides[strides.size() - 1] = size_of_input;
size_t product = 1;
for (size_t i = 0; i < buffer_shape.rank(); i++) product *= buffer_shape[i];
size_t stride = strides[strides.size() - 1];
for (size_t i = buffer_shape.rank() - 1; i > 0; i--) {
stride *= buffer_shape[i];
strides[i-1] = stride;
}
auto input_buffer = ub_factory.createUserBuffer(buffer, product*size_of_input, strides, input_encoding);
input_map.add(input_tensor_name, input_buffer.get());
inputs.push_back(std::unique_ptr<SNPEModelInput>(new SNPEModelInput(name, buffer, size, std::move(input_buffer))));
}
void SNPEModel::execute() {
if (!snpe->execute(input_map, output_map)) {
PrintErrorStringAndExit();
}
}

View File

@@ -0,0 +1,52 @@
#pragma once
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#include <memory>
#include <string>
#include <utility>
#include <DlContainer/IDlContainer.hpp>
#include <DlSystem/DlError.hpp>
#include <DlSystem/ITensor.hpp>
#include <DlSystem/ITensorFactory.hpp>
#include <DlSystem/IUserBuffer.hpp>
#include <DlSystem/IUserBufferFactory.hpp>
#include <SNPE/SNPE.hpp>
#include <SNPE/SNPEBuilder.hpp>
#include <SNPE/SNPEFactory.hpp>
#include "selfdrive/modeld/runners/runmodel.h"
struct SNPEModelInput : public ModelInput {
std::unique_ptr<zdl::DlSystem::IUserBuffer> snpe_buffer;
SNPEModelInput(const std::string _name, float *_buffer, int _size, std::unique_ptr<zdl::DlSystem::IUserBuffer> _snpe_buffer) : ModelInput(_name, _buffer, _size), snpe_buffer(std::move(_snpe_buffer)) {}
void setBuffer(float *_buffer, int _size) {
ModelInput::setBuffer(_buffer, _size);
assert(snpe_buffer->setBufferAddress(_buffer) == true);
}
};
class SNPEModel : public RunModel {
public:
SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
void addInput(const std::string name, float *buffer, int size);
void execute();
private:
std::string model_data;
#ifdef QCOM2
zdl::DlSystem::Runtime_t snpe_runtime;
#endif
// snpe model stuff
std::unique_ptr<zdl::SNPE::SNPE> snpe;
zdl::DlSystem::UserBufferMap input_map;
zdl::DlSystem::UserBufferMap output_map;
std::unique_ptr<zdl::DlSystem::IUserBuffer> output_buffer;
bool use_tf8;
float *output;
size_t output_size;
};

File diff suppressed because it is too large

View File

@@ -0,0 +1,58 @@
#include "selfdrive/modeld/runners/thneedmodel.h"
#include <string>
#include "common/swaglog.h"
ThneedModel::ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool luse_tf8, cl_context context) {
thneed = new Thneed(true, context);
thneed->load(path.c_str());
thneed->clexec();
recorded = false;
output = _output;
}
void* ThneedModel::getCLBuffer(const std::string name) {
int index = -1;
for (int i = 0; i < inputs.size(); i++) {
if (name == inputs[i]->name) {
index = i;
break;
}
}
if (index == -1) {
LOGE("Tried to get CL buffer for input `%s` but no input with this name exists", name.c_str());
assert(false);
}
if (thneed->input_clmem.size() >= inputs.size()) {
return &thneed->input_clmem[inputs.size() - index - 1];
} else {
return nullptr;
}
}
void ThneedModel::execute() {
if (!recorded) {
thneed->record = true;
float *input_buffers[inputs.size()];
for (int i = 0; i < inputs.size(); i++) {
input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
}
thneed->copy_inputs(input_buffers);
thneed->clexec();
thneed->copy_output(output);
thneed->stop();
recorded = true;
} else {
float *input_buffers[inputs.size()];
for (int i = 0; i < inputs.size(); i++) {
input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
}
thneed->execute(input_buffers, output);
}
}

View File

@@ -0,0 +1,17 @@
#pragma once
#include <string>
#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/thneed/thneed.h"
class ThneedModel : public RunModel {
public:
ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
void *getCLBuffer(const std::string name);
void execute();
private:
Thneed *thneed = NULL;
bool recorded;
float *output;
};

File diff suppressed because it is too large

View File

View File

@@ -0,0 +1,101 @@
// clang++ -O2 repro.cc && ./a.out
#include <sched.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
static inline double millis_since_boot() {
struct timespec t;
clock_gettime(CLOCK_BOOTTIME, &t);
return t.tv_sec * 1000.0 + t.tv_nsec * 1e-6;
}
#define MODEL_WIDTH 320
#define MODEL_HEIGHT 640
// null function still breaks it
#define input_lambda(x) x
// this is copied from models/dmonitoring.cc, and is the code that triggers the issue
void inner(uint8_t *resized_buf, float *net_input_buf) {
int resized_width = MODEL_WIDTH;
int resized_height = MODEL_HEIGHT;
// one shot conversion, O(n) anyway
// yuvframe2tensor, normalize
for (int r = 0; r < MODEL_HEIGHT/2; r++) {
for (int c = 0; c < MODEL_WIDTH/2; c++) {
// Y_ul
net_input_buf[(c*MODEL_HEIGHT/2) + r] = input_lambda(resized_buf[(2*r*resized_width) + (2*c)]);
// Y_ur
net_input_buf[(c*MODEL_HEIGHT/2) + r + (2*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width) + (2*c+1)]);
// Y_dl
net_input_buf[(c*MODEL_HEIGHT/2) + r + ((MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width+1) + (2*c)]);
// Y_dr
net_input_buf[(c*MODEL_HEIGHT/2) + r + (3*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width+1) + (2*c+1)]);
// U
net_input_buf[(c*MODEL_HEIGHT/2) + r + (4*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(resized_width*resized_height) + (r*resized_width/2) + c]);
// V
net_input_buf[(c*MODEL_HEIGHT/2) + r + (5*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(resized_width*resized_height) + ((resized_width/2)*(resized_height/2)) + (r*resized_width/2) + c]);
}
}
}
float trial() {
int resized_width = MODEL_WIDTH;
int resized_height = MODEL_HEIGHT;
int yuv_buf_len = (MODEL_WIDTH/2) * (MODEL_HEIGHT/2) * 6; // Y|u|v -> y|y|y|y|u|v
// allocate the buffers
uint8_t *resized_buf = (uint8_t*)malloc(resized_width*resized_height*3/2);
float *net_input_buf = (float*)malloc(yuv_buf_len*sizeof(float));
printf("allocate -- %p 0x%x -- %p 0x%lx\n", resized_buf, resized_width*resized_height*3/2, net_input_buf, yuv_buf_len*sizeof(float));
// test for bad buffers
static int CNT = 20;
float avg = 0.0;
for (int i = 0; i < CNT; i++) {
double s4 = millis_since_boot();
inner(resized_buf, net_input_buf);
double s5 = millis_since_boot();
avg += s5-s4;
}
avg /= CNT;
// once it's bad, it's reliably bad
if (avg > 10) {
printf("HIT %f\n", avg);
printf("BAD\n");
for (int i = 0; i < 200; i++) {
double s4 = millis_since_boot();
inner(resized_buf, net_input_buf);
double s5 = millis_since_boot();
printf("%.2f ", s5-s4);
}
printf("\n");
exit(0);
}
// don't free so we get a different buffer each time
//free(resized_buf);
//free(net_input_buf);
return avg;
}
int main() {
while (true) {
float ret = trial();
printf("got %f\n", ret);
}
}

View File

@@ -0,0 +1 @@
benchmark

View File

@@ -0,0 +1,192 @@
#include <SNPE/SNPE.hpp>
#include <SNPE/SNPEBuilder.hpp>
#include <SNPE/SNPEFactory.hpp>
#include <DlContainer/IDlContainer.hpp>
#include <DlSystem/DlError.hpp>
#include <DlSystem/ITensor.hpp>
#include <DlSystem/ITensorFactory.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;
int64_t timespecDiff(struct timespec *timeA_p, struct timespec *timeB_p) {
return ((timeA_p->tv_sec * 1000000000) + timeA_p->tv_nsec) - ((timeB_p->tv_sec * 1000000000) + timeB_p->tv_nsec);
}
void PrintErrorStringAndExit() {
cout << "ERROR!" << endl;
const char* const errStr = zdl::DlSystem::getLastErrorString();
std::cerr << errStr << std::endl;
std::exit(EXIT_FAILURE);
}
zdl::DlSystem::Runtime_t checkRuntime() {
static zdl::DlSystem::Version_t Version = zdl::SNPE::SNPEFactory::getLibraryVersion();
static zdl::DlSystem::Runtime_t Runtime;
std::cout << "SNPE Version: " << Version.asString().c_str() << std::endl; //Print Version number
if (zdl::SNPE::SNPEFactory::isRuntimeAvailable(zdl::DlSystem::Runtime_t::DSP)) {
std::cout << "Using DSP runtime" << std::endl;
Runtime = zdl::DlSystem::Runtime_t::DSP;
} else if (zdl::SNPE::SNPEFactory::isRuntimeAvailable(zdl::DlSystem::Runtime_t::GPU)) {
std::cout << "Using GPU runtime" << std::endl;
Runtime = zdl::DlSystem::Runtime_t::GPU;
} else {
std::cout << "Using cpu runtime" << std::endl;
Runtime = zdl::DlSystem::Runtime_t::CPU;
}
return Runtime;
}
void test(char *filename) {
static zdl::DlSystem::Runtime_t runtime = checkRuntime();
std::unique_ptr<zdl::DlContainer::IDlContainer> container;
container = zdl::DlContainer::IDlContainer::open(filename);
if (!container) { PrintErrorStringAndExit(); }
cout << "start build" << endl;
std::unique_ptr<zdl::SNPE::SNPE> snpe;
{
snpe = NULL;
zdl::SNPE::SNPEBuilder snpeBuilder(container.get());
snpe = snpeBuilder.setOutputLayers({})
.setRuntimeProcessor(runtime)
.setUseUserSuppliedBuffers(false)
//.setDebugMode(true)
.build();
if (!snpe) {
cout << "ERROR!" << endl;
const char* const errStr = zdl::DlSystem::getLastErrorString();
std::cerr << errStr << std::endl;
}
cout << "ran snpeBuilder" << endl;
}
const auto &strList_opt = snpe->getInputTensorNames();
if (!strList_opt) throw std::runtime_error("Error obtaining input tensor names");
cout << "get input tensor names done" << endl;
const auto &strList = *strList_opt;
static zdl::DlSystem::TensorMap inputTensorMap;
static zdl::DlSystem::TensorMap outputTensorMap;
vector<std::unique_ptr<zdl::DlSystem::ITensor> > inputs;
for (int i = 0; i < strList.size(); i++) {
cout << "input name: " << strList.at(i) << endl;
const auto &inputDims_opt = snpe->getInputDimensions(strList.at(i));
const auto &inputShape = *inputDims_opt;
inputs.push_back(zdl::SNPE::SNPEFactory::getTensorFactory().createTensor(inputShape));
inputTensorMap.add(strList.at(i), inputs[i].get());
}
struct timespec start, end;
cout << "**** starting benchmark ****" << endl;
for (int i = 0; i < 50; i++) {
clock_gettime(CLOCK_MONOTONIC, &start);
int err = snpe->execute(inputTensorMap, outputTensorMap);
assert(err == true);
clock_gettime(CLOCK_MONOTONIC, &end);
uint64_t timeElapsed = timespecDiff(&end, &start);
printf("time: %f ms\n", timeElapsed*1.0/1e6);
}
}
void get_testframe(int index, std::unique_ptr<zdl::DlSystem::ITensor> &input) {
FILE * pFile;
string filepath="/data/ipt/quantize_samples/sample_input_"+std::to_string(index);
pFile = fopen(filepath.c_str(), "rb");
int length = 1*6*160*320*4;
float * frame_buffer = new float[length/4]; // 32/8
fread(frame_buffer, length, 1, pFile);
// std::cout << *(frame_buffer+length/4-1) << std::endl;
std::copy(frame_buffer, frame_buffer+(length/4), input->begin());
fclose(pFile);
}
void SaveITensor(const std::string& path, const zdl::DlSystem::ITensor* tensor)
{
std::ofstream os(path, std::ofstream::binary);
if (!os)
{
std::cerr << "Failed to open output file for writing: " << path << "\n";
std::exit(EXIT_FAILURE);
}
for ( auto it = tensor->cbegin(); it != tensor->cend(); ++it )
{
float f = *it;
if (!os.write(reinterpret_cast<char*>(&f), sizeof(float)))
{
std::cerr << "Failed to write data to: " << path << "\n";
std::exit(EXIT_FAILURE);
}
}
}
void testrun(char* modelfile) {
static zdl::DlSystem::Runtime_t runtime = checkRuntime();
std::unique_ptr<zdl::DlContainer::IDlContainer> container;
container = zdl::DlContainer::IDlContainer::open(modelfile);
if (!container) { PrintErrorStringAndExit(); }
cout << "start build" << endl;
std::unique_ptr<zdl::SNPE::SNPE> snpe;
{
snpe = NULL;
zdl::SNPE::SNPEBuilder snpeBuilder(container.get());
snpe = snpeBuilder.setOutputLayers({})
.setRuntimeProcessor(runtime)
.setUseUserSuppliedBuffers(false)
//.setDebugMode(true)
.build();
if (!snpe) {
cout << "ERROR!" << endl;
const char* const errStr = zdl::DlSystem::getLastErrorString();
std::cerr << errStr << std::endl;
}
cout << "ran snpeBuilder" << endl;
}
const auto &strList_opt = snpe->getInputTensorNames();
if (!strList_opt) throw std::runtime_error("Error obtaining input tensor names");
cout << "get input tensor names done" << endl;
const auto &strList = *strList_opt;
static zdl::DlSystem::TensorMap inputTensorMap;
static zdl::DlSystem::TensorMap outputTensorMap;
assert(strList.size() == 1);
const auto &inputDims_opt = snpe->getInputDimensions(strList.at(0));
const auto &inputShape = *inputDims_opt;
std::cout << "winkwink" << std::endl;
for (int i=0; i<10000; i++) {
std::unique_ptr<zdl::DlSystem::ITensor> input;
input = zdl::SNPE::SNPEFactory::getTensorFactory().createTensor(inputShape);
get_testframe(i, input);
snpe->execute(input.get(), outputTensorMap);
zdl::DlSystem::StringList tensorNames = outputTensorMap.getTensorNames();
std::for_each(tensorNames.begin(), tensorNames.end(), [&](const char* name) {
std::ostringstream path;
path << "/data/opt/Result_" << std::to_string(i) << ".raw";
auto tensorPtr = outputTensorMap.getTensor(name);
SaveITensor(path.str(), tensorPtr);
});
}
}
int main(int argc, char* argv[]) {
if (argc < 2) {
printf("usage: %s <filename>\n", argv[0]);
return -1;
}
if (argc == 2) {
while (true) test(argv[1]);
} else if (argc == 3) {
testrun(argv[1]);
}
return 0;
}

View File

@@ -0,0 +1,4 @@
#!/bin/sh -e
clang++ -I /data/openpilot/third_party/snpe/include/ -L/data/pythonpath/third_party/snpe/aarch64 -lSNPE benchmark.cc -o benchmark
export LD_LIBRARY_PATH="/data/pythonpath/third_party/snpe/aarch64/:$HOME/openpilot/third_party/snpe/x86_64/:$LD_LIBRARY_PATH"
exec ./benchmark $1

View File

@@ -0,0 +1,107 @@
#!/usr/bin/env python3
import unittest
import numpy as np
import random
import cereal.messaging as messaging
from cereal.visionipc import VisionIpcServer, VisionStreamType
from openpilot.common.transformations.camera import DEVICE_CAMERAS
from openpilot.common.realtime import DT_MDL
from openpilot.selfdrive.car.car_helpers import write_car_param
from openpilot.selfdrive.manager.process_config import managed_processes
from openpilot.selfdrive.test.process_replay.vision_meta import meta_from_camera_state
CAM = DEVICE_CAMERAS[("tici", "ar0231")].fcam
IMG = np.zeros(int(CAM.width*CAM.height*(3/2)), dtype=np.uint8)
IMG_BYTES = IMG.flatten().tobytes()
class TestModeld(unittest.TestCase):
def setUp(self):
self.vipc_server = VisionIpcServer("camerad")
self.vipc_server.create_buffers(VisionStreamType.VISION_STREAM_ROAD, 40, False, CAM.width, CAM.height)
self.vipc_server.create_buffers(VisionStreamType.VISION_STREAM_DRIVER, 40, False, CAM.width, CAM.height)
self.vipc_server.create_buffers(VisionStreamType.VISION_STREAM_WIDE_ROAD, 40, False, CAM.width, CAM.height)
self.vipc_server.start_listener()
write_car_param()
self.sm = messaging.SubMaster(['modelV2', 'cameraOdometry'])
self.pm = messaging.PubMaster(['roadCameraState', 'wideRoadCameraState', 'liveCalibration'])
managed_processes['modeld'].start()
self.pm.wait_for_readers_to_update("roadCameraState", 10)
def tearDown(self):
managed_processes['modeld'].stop()
del self.vipc_server
def _send_frames(self, frame_id, cams=None):
if cams is None:
cams = ('roadCameraState', 'wideRoadCameraState')
cs = None
for cam in cams:
msg = messaging.new_message(cam)
cs = getattr(msg, cam)
cs.frameId = frame_id
cs.timestampSof = int((frame_id * DT_MDL) * 1e9)
cs.timestampEof = int(cs.timestampSof + (DT_MDL * 1e9))
cam_meta = meta_from_camera_state(cam)
self.pm.send(msg.which(), msg)
self.vipc_server.send(cam_meta.stream, IMG_BYTES, cs.frameId,
cs.timestampSof, cs.timestampEof)
return cs
def _wait(self):
self.sm.update(5000)
if self.sm['modelV2'].frameId != self.sm['cameraOdometry'].frameId:
self.sm.update(1000)
def test_modeld(self):
for n in range(1, 500):
cs = self._send_frames(n)
self._wait()
mdl = self.sm['modelV2']
self.assertEqual(mdl.frameId, n)
self.assertEqual(mdl.frameIdExtra, n)
self.assertEqual(mdl.timestampEof, cs.timestampEof)
self.assertEqual(mdl.frameAge, 0)
self.assertEqual(mdl.frameDropPerc, 0)
odo = self.sm['cameraOdometry']
self.assertEqual(odo.frameId, n)
self.assertEqual(odo.timestampEof, cs.timestampEof)
def test_dropped_frames(self):
"""
modeld should only run on consecutive road frames
"""
frame_id = -1
road_frames = list()
for n in range(1, 50):
if (random.random() < 0.1) and n > 3:
cams = random.choice([(), ('wideRoadCameraState', )])
self._send_frames(n, cams)
else:
self._send_frames(n)
road_frames.append(n)
self._wait()
if len(road_frames) < 3 or road_frames[-1] - road_frames[-2] == 1:
frame_id = road_frames[-1]
mdl = self.sm['modelV2']
odo = self.sm['cameraOdometry']
self.assertEqual(mdl.frameId, frame_id)
self.assertEqual(mdl.frameIdExtra, frame_id)
self.assertEqual(odo.frameId, frame_id)
if n != frame_id:
self.assertFalse(self.sm.updated['modelV2'])
self.assertFalse(self.sm.updated['cameraOdometry'])
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,2 @@
#!/bin/bash
clang++ -I /home/batman/one/external/tensorflow/include/ -L /home/batman/one/external/tensorflow/lib -Wl,-rpath=/home/batman/one/external/tensorflow/lib main.cc -ltensorflow

View File

@@ -0,0 +1,69 @@
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include "tensorflow/c/c_api.h"
void* read_file(const char* path, size_t* out_len) {
FILE* f = fopen(path, "r");
if (!f) {
return NULL;
}
fseek(f, 0, SEEK_END);
long f_len = ftell(f);
rewind(f);
char* buf = (char*)calloc(f_len, 1);
assert(buf);
size_t num_read = fread(buf, f_len, 1, f);
fclose(f);
if (num_read != 1) {
free(buf);
return NULL;
}
if (out_len) {
*out_len = f_len;
}
return buf;
}
static void DeallocateBuffer(void* data, size_t) {
free(data);
}
int main(int argc, char* argv[]) {
TF_Buffer* buf;
TF_Graph* graph;
TF_Status* status;
char *path = argv[1];
// load model
{
size_t model_size;
char tmp[1024];
snprintf(tmp, sizeof(tmp), "%s.pb", path);
printf("loading model %s\n", tmp);
uint8_t *model_data = (uint8_t *)read_file(tmp, &model_size);
buf = TF_NewBuffer();
buf->data = model_data;
buf->length = model_size;
buf->data_deallocator = DeallocateBuffer;
printf("loaded model of size %d\n", model_size);
}
// import graph
status = TF_NewStatus();
graph = TF_NewGraph();
TF_ImportGraphDefOptions *opts = TF_NewImportGraphDefOptions();
TF_GraphImportGraphDef(graph, buf, opts, status);
TF_DeleteImportGraphDefOptions(opts);
TF_DeleteBuffer(buf);
if (TF_GetCode(status) != TF_OK) {
printf("FAIL: %s\n", TF_Message(status));
} else {
printf("SUCCESS\n");
}
}

View File

@@ -0,0 +1,8 @@
#!/usr/bin/env python3
import sys
import tensorflow as tf
with open(sys.argv[1], "rb") as f:
graph_def = tf.compat.v1.GraphDef()
graph_def.ParseFromString(f.read())
#tf.io.write_graph(graph_def, '', sys.argv[1]+".try")

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# type: ignore
import os
import time
import numpy as np
import cereal.messaging as messaging
from openpilot.selfdrive.manager.process_config import managed_processes
N = int(os.getenv("N", "5"))
TIME = int(os.getenv("TIME", "30"))
if __name__ == "__main__":
  sock = messaging.sub_sock('modelV2', conflate=False, timeout=1000)

  execution_times = []

  for _ in range(N):
    os.environ['LOGPRINT'] = 'debug'
    managed_processes['modeld'].start()
    time.sleep(5)

    t = []
    start = time.monotonic()
    while time.monotonic() - start < TIME:
      msgs = messaging.drain_sock(sock, wait_for_one=True)
      for m in msgs:
        t.append(m.modelV2.modelExecutionTime)

    execution_times.append(np.array(t[10:]) * 1000)
    managed_processes['modeld'].stop()

  print("\n\n")
  print(f"ran modeld {N} times for {TIME}s each")
  for t in execution_times:
    print(f"\tavg: {sum(t)/len(t):0.2f}ms, min: {min(t):0.2f}ms, max: {max(t):0.2f}ms")
  print("\n\n")

View File

@@ -0,0 +1,8 @@
thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster.
It runs on the local device and caches a single model run. Then it replays it, but fast.
thneed slices through abstraction layers like a fish.
You need a thneed.
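A rough sketch of that flow (not part of the original note; it follows the Thneed interface declared in thneed.h later in this diff, and the first-run/replay split mirrors runners/thneedmodel.cc, which is not shown here):

#include "selfdrive/modeld/thneed/thneed.h"

// first run: execute the queued CL kernels once with recording on, so the
// driver-level GPU commands get cached; every later run just replays that cache
void run_model(Thneed *t, float **inputs, float *output, bool first_run) {
  if (first_run) {
    t->record = true;
    t->copy_inputs(inputs);
    t->clexec();                 // real CL execution, commands are captured along the way
    t->copy_output(output);
    t->stop();                   // recording done
  } else {
    t->execute(inputs, output);  // replay the cached commands, but fast
  }
}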

View File

@@ -0,0 +1,154 @@
#include <cassert>
#include <set>
#include "third_party/json11/json11.hpp"
#include "common/util.h"
#include "common/clutil.h"
#include "common/swaglog.h"
#include "selfdrive/modeld/thneed/thneed.h"
using namespace json11;
extern map<cl_program, string> g_program_source;
void Thneed::load(const char *filename) {
LOGD("Thneed::load: loading from %s\n", filename);
string buf = util::read_file(filename);
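// file layout: a 4-byte JSON length, the JSON metadata (objects, programs, inputs,
// outputs, binaries, kernels), then the raw buffer contents and program binaries it references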
int jsz = *(int *)buf.data();
string jsonerr;
string jj(buf.data() + sizeof(int), jsz);
Json jdat = Json::parse(jj, jsonerr);
map<cl_mem, cl_mem> real_mem;
real_mem[NULL] = NULL;
int ptr = sizeof(int)+jsz;
for (auto &obj : jdat["objects"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem clbuf = NULL;
if (mobj["buffer_id"].string_value().size() > 0) {
// image buffer must already be allocated
clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(mobj["needs_load"].bool_value() == false);
} else {
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
ptr += sz;
} else {
// TODO: is there a faster way to init zeroed out buffers?
void *host_zeros = calloc(sz, 1);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
free(host_zeros);
}
}
assert(clbuf != NULL);
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
cl_image_desc desc = {0};
desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
desc.image_width = mobj["width"].int_value();
desc.image_height = mobj["height"].int_value();
desc.image_row_pitch = mobj["row_pitch"].int_value();
assert(sz == desc.image_height*desc.image_row_pitch);
#ifdef QCOM2
desc.buffer = clbuf;
#else
// TODO: we are creating unused buffers on PC
clReleaseMemObject(clbuf);
#endif
cl_image_format format = {0};
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = mobj["float32"].bool_value() ? CL_FLOAT : CL_HALF_FLOAT;
cl_int errcode;
#ifndef QCOM2
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateImage(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, &format, &desc, &buf[ptr-sz], &errcode);
} else {
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
}
#else
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
#endif
if (clbuf == NULL) {
LOGE("clError: %s create image %zux%zu rp %zu with buffer %p\n", cl_get_error_string(errcode),
desc.image_width, desc.image_height, desc.image_row_pitch, desc.buffer);
}
assert(clbuf != NULL);
}
real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
}
map<string, cl_program> g_programs;
for (const auto &[name, source] : jdat["programs"].object_items()) {
if (debug >= 1) printf("building %s with size %zu\n", name.c_str(), source.string_value().size());
g_programs[name] = cl_program_from_source(context, device_id, source.string_value());
}
for (auto &obj : jdat["inputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem aa = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
input_clmem.push_back(aa);
input_sizes.push_back(sz);
LOGD("Thneed::load: adding input %s with size %d\n", mobj["name"].string_value().data(), sz);
cl_int cl_err;
void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &cl_err);
if (cl_err != CL_SUCCESS) LOGE("clError: %s map %p %d\n", cl_get_error_string(cl_err), aa, sz);
assert(cl_err == CL_SUCCESS);
inputs.push_back(ret);
}
for (auto &obj : jdat["outputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
LOGD("Thneed::save: adding output with size %d\n", sz);
// TODO: support multiple outputs
output = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(output != NULL);
}
for (auto &obj : jdat["binaries"].array_items()) {
string name = obj["name"].string_value();
size_t length = obj["length"].int_value();
if (debug >= 1) printf("binary %s with size %zu\n", name.c_str(), length);
g_programs[name] = cl_program_from_binary(context, device_id, (const uint8_t*)&buf[ptr], length);
ptr += length;
}
for (auto &obj : jdat["kernels"].array_items()) {
auto gws = obj["global_work_size"];
auto lws = obj["local_work_size"];
auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
kk->name = obj["name"].string_value();
kk->program = g_programs[kk->name];
kk->work_dim = obj["work_dim"].int_value();
for (int i = 0; i < kk->work_dim; i++) {
kk->global_work_size[i] = gws[i].int_value();
kk->local_work_size[i] = lws[i].int_value();
}
kk->num_args = obj["num_args"].int_value();
for (int i = 0; i < kk->num_args; i++) {
string arg = obj["args"].array_items()[i].string_value();
int arg_size = obj["args_size"].array_items()[i].int_value();
kk->args_size.push_back(arg_size);
if (arg_size == 8) {
cl_mem val = *(cl_mem*)(arg.data());
val = real_mem[val];
kk->args.push_back(string((char*)&val, sizeof(val)));
} else {
kk->args.push_back(arg);
}
}
kq.push_back(kk);
}
clFinish(command_queue);
}

View File

@@ -0,0 +1,133 @@
#pragma once
#ifndef __user
#define __user __attribute__(())
#endif
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
#include <CL/cl.h>
#include "third_party/linux/include/msm_kgsl.h"
using namespace std;
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
namespace json11 {
class Json;
}
class Thneed;
class GPUMalloc {
public:
GPUMalloc(int size, int fd);
~GPUMalloc();
void *alloc(int size);
private:
uint64_t base;
int remaining;
};
class CLQueuedKernel {
public:
CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size);
cl_int exec();
void debug_print(bool verbose);
int get_arg_num(const char *search_arg_name);
cl_program program;
string name;
cl_uint num_args;
vector<string> arg_names;
vector<string> arg_types;
vector<string> args;
vector<int> args_size;
cl_kernel kernel = NULL;
json11::Json to_json() const;
cl_uint work_dim;
size_t global_work_size[3] = {0};
size_t local_work_size[3] = {0};
private:
Thneed *thneed;
};
class CachedIoctl {
public:
virtual void exec() {}
};
class CachedSync: public CachedIoctl {
public:
CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; }
void exec();
private:
Thneed *thneed;
string data;
};
class CachedCommand: public CachedIoctl {
public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec();
private:
void disassemble(int cmd_index);
struct kgsl_gpu_command cache;
unique_ptr<kgsl_command_object[]> cmds;
unique_ptr<kgsl_command_object[]> objs;
Thneed *thneed;
vector<shared_ptr<CLQueuedKernel> > kq;
};
class Thneed {
public:
Thneed(bool do_clinit=false, cl_context _context = NULL);
void stop();
void execute(float **finputs, float *foutput, bool slow=false);
void wait();
vector<cl_mem> input_clmem;
vector<void *> inputs;
vector<size_t> input_sizes;
cl_mem output = NULL;
cl_context context = NULL;
cl_command_queue command_queue;
cl_device_id device_id;
int context_id;
// protected?
bool record = false;
int debug;
int timestamp;
#ifdef QCOM2
unique_ptr<GPUMalloc> ram;
vector<unique_ptr<CachedIoctl> > cmds;
int fd;
#endif
// all CL kernels
void copy_inputs(float **finputs, bool internal=false);
void copy_output(float *foutput);
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;
// pending CL kernels
vector<shared_ptr<CLQueuedKernel> > ckq;
// loading
void load(const char *filename);
private:
void clinit();
};

View File

@@ -0,0 +1,216 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include <cstring>
#include <map>
#include "common/clutil.h"
#include "common/timing.h"
map<pair<cl_kernel, int>, string> g_args;
map<pair<cl_kernel, int>, int> g_args_size;
map<cl_program, string> g_program_source;
void Thneed::stop() {
//printf("Thneed::stop: recorded %lu commands\n", cmds.size());
record = false;
}
void Thneed::clinit() {
device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err));
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
command_queue = CL_CHECK_ERR(clCreateCommandQueueWithProperties(context, device_id, props, &err));
printf("Thneed::clinit done\n");
}
cl_int Thneed::clexec() {
if (debug >= 1) printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
for (auto &k : kq) {
if (record) ckq.push_back(k);
cl_int ret = k->exec();
assert(ret == CL_SUCCESS);
}
return clFinish(command_queue);
}
void Thneed::copy_inputs(float **finputs, bool internal) {
for (int idx = 0; idx < inputs.size(); ++idx) {
if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
if (internal) {
// if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
} else {
if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
}
}
}
void Thneed::copy_output(float *foutput) {
if (output != NULL) {
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (debug >= 1) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
CL_CHECK(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL));
} else {
printf("CAUTION: model output is NULL, does it have no outputs?\n");
}
}
// *********** CLQueuedKernel ***********
CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size) {
thneed = lthneed;
kernel = _kernel;
work_dim = _work_dim;
assert(work_dim <= 3);
for (int i = 0; i < work_dim; i++) {
global_work_size[i] = _global_work_size[i];
local_work_size[i] = _local_work_size[i];
}
char _name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
name = string(_name);
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
// get args
for (int i = 0; i < num_args; i++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
args.push_back(g_args[make_pair(kernel, i)]);
args_size.push_back(g_args_size[make_pair(kernel, i)]);
}
// get program
clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
}
int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
for (int i = 0; i < num_args; i++) {
if (arg_names[i] == search_arg_name) return i;
}
printf("failed to find %s in %s\n", search_arg_name, name.c_str());
assert(false);
}
cl_int CLQueuedKernel::exec() {
if (kernel == NULL) {
kernel = clCreateKernel(program, name.c_str(), NULL);
arg_names.clear();
arg_types.clear();
for (int j = 0; j < num_args; j++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
cl_int ret;
if (args[j].size() != 0) {
assert(args[j].size() == args_size[j]);
ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
} else {
ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
}
assert(ret == CL_SUCCESS);
}
}
if (thneed->debug >= 1) {
debug_print(thneed->debug >= 2);
}
return clEnqueueNDRangeKernel(thneed->command_queue,
kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
}
void CLQueuedKernel::debug_print(bool verbose) {
printf("%p %56s -- ", kernel, name.c_str());
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
if (verbose) {
for (int i = 0; i < num_args; i++) {
string arg = args[i];
printf(" %s %s", arg_types[i].c_str(), arg_names[i].c_str());
void *arg_value = (void*)arg.data();
int arg_size = arg.size();
if (arg_size == 0) {
printf(" (size) %d", args_size[i]);
} else if (arg_size == 1) {
printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
} else if (arg_size == 4) {
if (arg_types[i] == "float") {
printf(" = %f", *((float*)arg_value));
} else {
printf(" = %d", *((int*)arg_value));
}
} else if (arg_size == 8) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (val != NULL) {
cl_mem_object_type obj_type;
clGetMemObjectInfo(val, CL_MEM_TYPE, sizeof(obj_type), &obj_type, NULL);
if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t" || obj_type == CL_MEM_OBJECT_IMAGE2D) {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
cl_mem buf;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_order == CL_RGBA);
assert(format.image_channel_data_type == CL_HALF_FLOAT || format.image_channel_data_type == CL_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0);
assert(array_size == 0);
assert(slice_pitch == 0);
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz = 0;
if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else {
size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" buffer %zu", sz);
}
}
}
printf("\n");
}
}
}
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
g_args_size[make_pair(kernel, arg_index)] = arg_size;
if (arg_value != NULL) {
g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
} else {
g_args[make_pair(kernel, arg_index)] = string("");
}
cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret;
}

View File

@@ -0,0 +1,32 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include "common/clutil.h"
#include "common/timing.h"
Thneed::Thneed(bool do_clinit, cl_context _context) {
context = _context;
if (do_clinit) clinit();
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs);
// ****** run commands
clexec();
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

View File

@@ -0,0 +1,258 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <dlfcn.h>
#include <sys/mman.h>
#include <cassert>
#include <cerrno>
#include <cstring>
#include <map>
#include <string>
#include "common/clutil.h"
#include "common/timing.h"
Thneed *g_thneed = NULL;
int g_fd = -1;
void hexdump(uint8_t *d, int len) {
assert((len%4) == 0);
printf(" dumping %p len 0x%x\n", d, len);
for (int i = 0; i < len/4; i++) {
if (i != 0 && (i%0x10) == 0) printf("\n");
printf("%8x ", d[i]);
}
printf("\n");
}
// *********** ioctl interceptor ***********
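// this file defines its own ioctl(), so KGSL calls made by the OpenCL/SNPE stack land here;
// the real libc ioctl is resolved lazily via dlsym(RTLD_NEXT, "ioctl"), and GPU commands are
// captured for later replay while a Thneed object is recording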
extern "C" {
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
#undef ioctl
int ioctl(int filedes, unsigned long request, void *argp) {
request &= 0xFFFFFFFF; // needed on QCOM2
if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
Thneed *thneed = g_thneed;
// save the fd
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
// note that this runs always, even without a thneed object
if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
create->flags |= 6 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority
printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
}
if (thneed != NULL) {
if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
if (thneed->record) {
thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id;
thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
}
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d\n",
thneed->cmds.size(),
cmd->flags,
cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
}
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
if (thneed->debug >= 2) {
printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
}
printf("\n");
}
if (thneed->record) {
thneed->cmds.push_back(unique_ptr<CachedSync>(new
CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
}
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
cmd->context_id, cmd->timestamp, cmd->timeout);
}
} else if (request == IOCTL_KGSL_SETPROPERTY) {
if (thneed->debug >= 1) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
if (thneed->debug >= 2) {
hexdump((uint8_t *)prop->value, prop->sizebytes);
if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
hexdump((uint8_t *)constraint->data, constraint->size);
}
}
}
} else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
// this happens
} else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
// this happens
} else {
if (thneed->debug >= 1) {
printf("other ioctl %lx\n", request);
}
}
}
int ret = my_ioctl(filedes, request, argp);
// NOTE: This error message goes into stdout and messes up pyenv
// if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
return ret;
}
}
// *********** GPUMalloc ***********
GPUMalloc::GPUMalloc(int size, int fd) {
struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc));
alloc.size = size;
alloc.flags = 0x10000a00;
ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
assert(addr != MAP_FAILED);
base = (uint64_t)addr;
remaining = size;
}
GPUMalloc::~GPUMalloc() {
// TODO: free the GPU malloced area
}
void *GPUMalloc::alloc(int size) {
void *ret = (void*)base;
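// round the allocation up to a 0x100-byte boundary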
size = (size+0xff) & (~0xFF);
assert(size <= remaining);
remaining -= size;
base += size;
return ret;
}
// *********** CachedSync, at the ioctl layer ***********
void CachedSync::exec() {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)data.data();
cmd.obj_len = data.length();
cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
}
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed = lthneed;
assert(cmd->numsyncs == 0);
memcpy(&cache, cmd, sizeof(cache));
if (cmd->numcmds > 0) {
cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
cache.cmdlist = (uint64_t)cmds.get();
for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn;
}
}
if (cmd->numobjs > 0) {
objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
cache.objlist = (uint64_t)objs.get();
for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size);
objs[i].gpuaddr = (uint64_t)nn;
}
}
kq = thneed->ckq;
thneed->ckq.clear();
}
void CachedCommand::exec() {
cache.timestamp = ++thneed->timestamp;
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
if (thneed->debug >= 1) printf("CachedCommand::exec got %d\n", ret);
if (thneed->debug >= 2) {
for (auto &it : kq) {
it->debug_print(false);
}
}
assert(ret == 0);
}
// *********** Thneed ***********
Thneed::Thneed(bool do_clinit, cl_context _context) {
// TODO: QCOM2 actually requires a different context
//context = _context;
if (do_clinit) clinit();
assert(g_fd != -1);
fd = g_fd;
ram = make_unique<GPUMalloc>(0x80000, fd);
timestamp = -1;
g_thneed = this;
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::wait() {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = context_id;
wait.timestamp = timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (debug >= 1) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs, true);
// ****** run commands
int i = 0;
for (auto &it : cmds) {
++i;
if (debug >= 1) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
it->exec();
if ((i == cmds.size()) || slow) wait();
}
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

View File

@@ -0,0 +1,74 @@
#include "selfdrive/modeld/transforms/loadyuv.h"
#include <cassert>
#include <cstdio>
#include <cstring>
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height) {
memset(s, 0, sizeof(*s));
s->width = width;
s->height = height;
char args[1024];
snprintf(args, sizeof(args),
"-cl-fast-relaxed-math -cl-denorms-are-zero "
"-DTRANSFORMED_WIDTH=%d -DTRANSFORMED_HEIGHT=%d",
width, height);
cl_program prg = cl_program_from_file(ctx, device_id, LOADYUV_PATH, args);
s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err));
s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err));
s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err));
// done with this
CL_CHECK(clReleaseProgram(prg));
}
void loadyuv_destroy(LoadYUVState* s) {
CL_CHECK(clReleaseKernel(s->loadys_krnl));
CL_CHECK(clReleaseKernel(s->loaduv_krnl));
CL_CHECK(clReleaseKernel(s->copy_krnl));
}
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl, bool do_shift) {
cl_int global_out_off = 0;
if (do_shift) {
// shift the image in slot 1 to slot 0, then place the new image in slot 1
global_out_off += (s->width*s->height) + (s->width/2)*(s->height/2)*2;
CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_int), &global_out_off));
const size_t copy_work_size = global_out_off/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
&copy_work_size, NULL, 0, 0, NULL));
}
CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off));
const size_t loadys_work_size = (s->width*s->height)/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL,
&loadys_work_size, NULL, 0, 0, NULL));
const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8;
global_out_off += (s->width*s->height);
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));
global_out_off += (s->width/2)*(s->height/2);
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));
}

View File

@@ -0,0 +1,16 @@
#pragma once
#include "common/clutil.h"
typedef struct {
int width, height;
cl_kernel loadys_krnl, loaduv_krnl, copy_krnl;
} LoadYUVState;
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height);
void loadyuv_destroy(LoadYUVState* s);
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl, bool do_shift = false);

View File

@@ -0,0 +1,97 @@
#include "selfdrive/modeld/transforms/transform.h"
#include <cassert>
#include <cstring>
#include "common/clutil.h"
void transform_init(Transform* s, cl_context ctx, cl_device_id device_id) {
memset(s, 0, sizeof(*s));
cl_program prg = cl_program_from_file(ctx, device_id, TRANSFORM_PATH, "");
s->krnl = CL_CHECK_ERR(clCreateKernel(prg, "warpPerspective", &err));
// done with this
CL_CHECK(clReleaseProgram(prg));
s->m_y_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
s->m_uv_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
}
void transform_destroy(Transform* s) {
CL_CHECK(clReleaseMemObject(s->m_y_cl));
CL_CHECK(clReleaseMemObject(s->m_uv_cl));
CL_CHECK(clReleaseKernel(s->krnl));
}
void transform_queue(Transform* s,
cl_command_queue q,
cl_mem in_yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
cl_mem out_y, cl_mem out_u, cl_mem out_v,
int out_width, int out_height,
const mat3& projection) {
const int zero = 0;
// sampled using pixel center origin
// (because that's how fastcv and opencv do it)
mat3 projection_y = projection;
// in and out uv is half the size of y.
mat3 projection_uv = transform_scale_buffer(projection, 0.5);
CL_CHECK(clEnqueueWriteBuffer(q, s->m_y_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_y.v, 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(q, s->m_uv_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_uv.v, 0, NULL, NULL));
const int in_y_width = in_width;
const int in_y_height = in_height;
const int in_y_px_stride = 1;
const int in_uv_width = in_width/2;
const int in_uv_height = in_height/2;
const int in_uv_px_stride = 2;
const int in_u_offset = in_uv_offset;
const int in_v_offset = in_uv_offset + 1;
const int out_y_width = out_width;
const int out_y_height = out_height;
const int out_uv_width = out_width/2;
const int out_uv_height = out_height/2;
CL_CHECK(clSetKernelArg(s->krnl, 0, sizeof(cl_mem), &in_yuv)); // src
CL_CHECK(clSetKernelArg(s->krnl, 1, sizeof(cl_int), &in_stride)); // src_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_y_px_stride)); // src_px_stride
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &zero)); // src_offset
CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_y_height)); // src_rows
CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_y_width)); // src_cols
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_y)); // dst
CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_y_width)); // dst_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset
CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_y_height)); // dst_rows
CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_y_width)); // dst_cols
CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_y_cl)); // M
const size_t work_size_y[2] = {(size_t)out_y_width, (size_t)out_y_height};
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_y, NULL, 0, 0, NULL));
const size_t work_size_uv[2] = {(size_t)out_uv_width, (size_t)out_uv_height};
CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_uv_px_stride)); // src_px_stride
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_u_offset)); // src_offset
CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_uv_height)); // src_rows
CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_uv_width)); // src_cols
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_u)); // dst
CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_uv_width)); // dst_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset
CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_uv_height)); // dst_rows
CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_uv_width)); // dst_cols
CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_uv_cl)); // M
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_uv, NULL, 0, 0, NULL));
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_v_offset)); // src_offset
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_v)); // dst
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_uv, NULL, 0, 0, NULL));
}

View File

@@ -0,0 +1,25 @@
#pragma once
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include "common/mat.h"
typedef struct {
cl_kernel krnl;
cl_mem m_y_cl, m_uv_cl;
} Transform;
void transform_init(Transform* s, cl_context ctx, cl_device_id device_id);
void transform_destroy(Transform* transform);
void transform_queue(Transform* s, cl_command_queue q,
cl_mem yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
cl_mem out_y, cl_mem out_u, cl_mem out_v,
int out_width, int out_height,
const mat3& projection);
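
For context, a minimal sketch of how the two stages declared above are typically chained: warp the full-resolution NV12 frame to model resolution, then pack it into the network input buffer. In openpilot this wiring lives in models/commonmodel.cc (not part of this hunk); the buffer handles, frame geometry, and model dimensions below are placeholders, and in real use the init/destroy calls happen once rather than per frame.

#include "selfdrive/modeld/transforms/transform.h"
#include "selfdrive/modeld/transforms/loadyuv.h"

void prepare_frame(cl_context ctx, cl_device_id device_id, cl_command_queue q,
                   cl_mem frame_yuv, int frame_w, int frame_h, int frame_stride, int frame_uv_offset,
                   cl_mem y_cl, cl_mem u_cl, cl_mem v_cl, cl_mem net_input_cl,
                   const mat3 &projection, int model_w, int model_h) {
  Transform transform;
  LoadYUVState loadyuv;
  transform_init(&transform, ctx, device_id);
  loadyuv_init(&loadyuv, ctx, device_id, model_w, model_h);

  // warp the camera frame into model-sized Y/U/V planes
  transform_queue(&transform, q, frame_yuv, frame_w, frame_h, frame_stride, frame_uv_offset,
                  y_cl, u_cl, v_cl, model_w, model_h, projection);
  // pack the planes into the input tensor; do_shift=true moves the previous frame into slot 0
  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl, true);

  transform_destroy(&transform);
  loadyuv_destroy(&loadyuv);
}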