This commit is contained in:
Your Name
2024-04-27 03:27:27 -05:00
parent 886a019ad5
commit c22b4866eb
55 changed files with 49557 additions and 116523 deletions

View File

@@ -0,0 +1,72 @@
Import('env', 'envCython', 'arch', 'cereal', 'messaging', 'common', 'gpucommon', 'visionipc', 'transformations')
lenv = env.Clone()
lenvCython = envCython.Clone()
libs = [cereal, messaging, visionipc, gpucommon, common, 'capnp', 'zmq', 'kj', 'pthread']
frameworks = []
common_src = [
"models/commonmodel.cc",
"transforms/loadyuv.cc",
"transforms/transform.cc",
]
thneed_src_common = [
"thneed/thneed_common.cc",
"thneed/serialize.cc",
]
thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
# SNPE except on Mac and ARM Linux
snpe_lib = []
if arch != "Darwin" and arch != "aarch64":
common_src += ['runners/snpemodel.cc']
snpe_lib += ['SNPE']
# OpenCL is a framework on Mac
if arch == "Darwin":
frameworks += ['OpenCL']
else:
libs += ['OpenCL']
# Set path definitions
for pathdef, fn in {'TRANSFORM': 'transforms/transform.cl', 'LOADYUV': 'transforms/loadyuv.cl'}.items():
for xenv in (lenv, lenvCython):
xenv['CXXFLAGS'].append(f'-D{pathdef}_PATH=\\"{File(fn).abspath}\\"')
# Compile cython
snpe_rpath_qcom = "/data/pythonpath/third_party/snpe/larch64"
snpe_rpath_pc = f"{Dir('#').abspath}/third_party/snpe/x86_64-linux-clang"
snpe_rpath = lenvCython['RPATH'] + [snpe_rpath_qcom if arch == "larch64" else snpe_rpath_pc]
cython_libs = envCython["LIBS"] + libs
snpemodel_lib = lenv.Library('snpemodel', ['runners/snpemodel.cc'])
commonmodel_lib = lenv.Library('commonmodel', common_src)
lenvCython.Program('runners/runmodel_pyx.so', 'runners/runmodel_pyx.pyx', LIBS=cython_libs, FRAMEWORKS=frameworks)
lenvCython.Program('runners/snpemodel_pyx.so', 'runners/snpemodel_pyx.pyx', LIBS=[snpemodel_lib, snpe_lib, *cython_libs], FRAMEWORKS=frameworks, RPATH=snpe_rpath)
lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks)
# Get model metadata
fn = File("models/supercombo").abspath
cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
files = sum([lenv.Glob("#"+x) for x in open(File("#release/files_common").abspath).read().split("\n") if x.endswith("get_model_metadata.py")], [])
lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"]+files, cmd)
# Build thneed model
if arch == "larch64" or GetOption('pc_thneed'):
tinygrad_opts = []
if not GetOption('pc_thneed'):
# use FLOAT16 on device for speed + don't cache the CL kernels for space
tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"
tinygrad_files = sum([lenv.Glob("#"+x) for x in open(File("#release/files_common").abspath).read().split("\n") if x.startswith("tinygrad_repo/")], [])
lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'zmq', 'OpenCL', 'dl'])
thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'zmq', 'OpenCL'])

View File

@@ -24,6 +24,7 @@ class ModelConstants:
LAT_PLANNER_STATE_LEN = 4
LATERAL_CONTROL_PARAMS_LEN = 2
PREV_DESIRED_CURV_LEN = 1
+RADAR_TRACKS_LEN = 64
# model outputs constants
FCW_THRESHOLDS_5MS2 = np.array([.05, .05, .15, .15, .15], dtype=np.float32)
@@ -42,6 +43,7 @@ class ModelConstants:
DESIRE_PRED_WIDTH = 8
LAT_PLANNER_SOLUTION_WIDTH = 4
DESIRED_CURV_WIDTH = 1
+RADAR_TRACKS_WIDTH = 3
NUM_LANE_LINES = 4
NUM_ROAD_EDGES = 2

View File

@@ -6,7 +6,6 @@ import time
import ctypes
import numpy as np
from pathlib import Path
-from typing import Tuple, Dict
from cereal import messaging
from cereal.messaging import PubMaster, SubMaster
@@ -53,7 +52,7 @@ class DMonitoringModelResult(ctypes.Structure):
("wheel_on_right_prob", ctypes.c_float)]
class ModelState:
-inputs: Dict[str, np.ndarray]
+inputs: dict[str, np.ndarray]
output: np.ndarray
model: ModelRunner
@@ -68,7 +67,7 @@ class ModelState:
self.model.addInput("input_img", None)
self.model.addInput("calib", self.inputs['calib'])
-def run(self, buf:VisionBuf, calib:np.ndarray) -> Tuple[np.ndarray, float]:
+def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
self.inputs['calib'][:] = calib
v_offset = buf.height - MODEL_HEIGHT

View File

@@ -1,7 +1,6 @@
import os
import capnp
import numpy as np
-from typing import Dict
from cereal import log
from openpilot.selfdrive.modeld.constants import ModelConstants, Plan, Meta
@@ -42,7 +41,7 @@ def fill_xyvat(builder, t, x, y, v, a, x_std=None, y_std=None, v_std=None, a_std
if a_std is not None:
builder.aStd = a_std.tolist()
-def fill_model_msg(msg: capnp._DynamicStructBuilder, net_output_data: Dict[str, np.ndarray], publish_state: PublishState,
+def fill_model_msg(msg: capnp._DynamicStructBuilder, net_output_data: dict[str, np.ndarray], publish_state: PublishState,
vipc_frame_id: int, vipc_frame_id_extra: int, frame_id: int, frame_drop: float,
timestamp_eof: int, timestamp_llk: int, model_execution_time: float,
nav_enabled: bool, valid: bool) -> None:
@@ -104,11 +103,12 @@ def fill_model_msg(msg: capnp._DynamicStructBuilder, net_output_data: Dict[str,
lane_line = modelV2.laneLines[i]
far_lane, near_lane, road_edge = (0, 1, 0) if i == 4 else (3, 2, 1)
-y_min = net_output_data['lane_lines'][0, near_lane,:,0]
-z_min = net_output_data['lane_lines'][0, near_lane,:,1]
lane_diff = np.abs(net_output_data['lane_lines'][0,near_lane] - net_output_data['lane_lines'][0,far_lane])
road_edge_diff = np.abs(net_output_data['lane_lines'][0,near_lane] - net_output_data['road_edges'][0,road_edge])
+y_min = net_output_data['lane_lines'][0, near_lane,:,0]
+z_min = net_output_data['lane_lines'][0, near_lane,:,1]
y_min += np.where(lane_diff[:,0] < road_edge_diff[:,0], net_output_data['lane_lines'][0,far_lane,:,0], net_output_data['road_edges'][0,road_edge,:,0])
z_min += np.where(lane_diff[:,1] < road_edge_diff[:,1], net_output_data['lane_lines'][0,far_lane,:,1], net_output_data['road_edges'][0,road_edge,:,1])
@@ -192,7 +192,7 @@ def fill_model_msg(msg: capnp._DynamicStructBuilder, net_output_data: Dict[str,
if SEND_RAW_PRED:
modelV2.rawPredictions = net_output_data['raw_pred'].tobytes()
-def fill_pose_msg(msg: capnp._DynamicStructBuilder, net_output_data: Dict[str, np.ndarray],
+def fill_pose_msg(msg: capnp._DynamicStructBuilder, net_output_data: dict[str, np.ndarray],
vipc_frame_id: int, vipc_dropped_frames: int, timestamp_eof: int, live_calib_seen: bool) -> None:
msg.valid = live_calib_seen & (vipc_dropped_frames < 1)
cameraOdometry = msg.cameraOdometry

View File

@@ -4,9 +4,8 @@ import pathlib
import onnx
import codecs
import pickle
-from typing import Tuple
-def get_name_and_shape(value_info:onnx.ValueInfoProto) -> Tuple[str, Tuple[int,...]]:
+def get_name_and_shape(value_info:onnx.ValueInfoProto) -> tuple[str, tuple[int,...]]:
shape = tuple([int(dim.dim_value) for dim in value_info.type.tensor_type.shape.dim])
name = value_info.name
return name, shape

Binary file not shown.

View File

@@ -6,7 +6,6 @@ import numpy as np
import cereal.messaging as messaging
from cereal import car, log
from pathlib import Path
-from typing import Dict, Optional
from setproctitle import setproctitle
from cereal.messaging import PubMaster, SubMaster
from cereal.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
@@ -14,6 +13,7 @@ from openpilot.common.swaglog import cloudlog
from openpilot.common.params import Params
from openpilot.common.filter_simple import FirstOrderFilter
from openpilot.common.realtime import config_realtime_process
+from openpilot.common.transformations.camera import DEVICE_CAMERAS
from openpilot.common.transformations.model import get_warp_matrix
from openpilot.selfdrive import sentry
from openpilot.selfdrive.car.car_helpers import get_demo_car_params
@@ -24,15 +24,18 @@ from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_
from openpilot.selfdrive.modeld.constants import ModelConstants
from openpilot.selfdrive.modeld.models.commonmodel_pyx import ModelFrame, CLContext
-from openpilot.selfdrive.frogpilot.functions.frogpilot_functions import DEFAULT_MODEL
+from openpilot.selfdrive.frogpilot.controls.lib.model_manager import DEFAULT_MODEL, MODELS_PATH, NAVIGATIONLESS_MODELS, RADARLESS_MODELS
PROCESS_NAME = "selfdrive.modeld.modeld"
SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
MODEL_NAME = Params().get("Model", encoding='utf-8')
+DISABLE_NAV = MODEL_NAME in NAVIGATIONLESS_MODELS
+DISABLE_RADAR = MODEL_NAME in RADARLESS_MODELS
MODEL_PATHS = {
-ModelRunner.THNEED: Path(__file__).parent / ('models/supercombo.thneed' if MODEL_NAME == DEFAULT_MODEL else f'models/models/{MODEL_NAME}.thneed'),
+ModelRunner.THNEED: Path(__file__).parent / ('models/supercombo.thneed' if MODEL_NAME == DEFAULT_MODEL else f'{MODELS_PATH}/{MODEL_NAME}.thneed'),
ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
@@ -49,7 +52,7 @@ class FrameMeta:
class ModelState:
frame: ModelFrame
wide_frame: ModelFrame
-inputs: Dict[str, np.ndarray]
+inputs: dict[str, np.ndarray]
output: np.ndarray
prev_desire: np.ndarray # for tracking the rising edge of the pulse
model: ModelRunner
@@ -63,9 +66,10 @@ class ModelState:
'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
'lateral_control_params': np.zeros(ModelConstants.LATERAL_CONTROL_PARAMS_LEN, dtype=np.float32),
'prev_desired_curv': np.zeros(ModelConstants.PREV_DESIRED_CURV_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
-'nav_features': np.zeros(ModelConstants.NAV_FEATURE_LEN, dtype=np.float32),
-'nav_instructions': np.zeros(ModelConstants.NAV_INSTRUCTION_LEN, dtype=np.float32),
+**({'nav_features': np.zeros(ModelConstants.NAV_FEATURE_LEN, dtype=np.float32),
+'nav_instructions': np.zeros(ModelConstants.NAV_INSTRUCTION_LEN, dtype=np.float32)} if not DISABLE_NAV else {}),
'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
+**({'radar_tracks': np.zeros(ModelConstants.RADAR_TRACKS_LEN * ModelConstants.RADAR_TRACKS_WIDTH, dtype=np.float32)} if DISABLE_RADAR else {}),
}
with open(METADATA_PATH, 'rb') as f:
@@ -82,14 +86,14 @@ class ModelState:
for k,v in self.inputs.items():
self.model.addInput(k, v)
-def slice_outputs(self, model_outputs: np.ndarray) -> Dict[str, np.ndarray]:
+def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
if SEND_RAW_PRED:
parsed_model_outputs['raw_pred'] = model_outputs.copy()
return parsed_model_outputs
def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_wide: np.ndarray,
-inputs: Dict[str, np.ndarray], prepare_only: bool) -> Optional[Dict[str, np.ndarray]]:
+inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
# Model decides when action is completed, so desire input is just a pulse triggered on rising edge
inputs['desire'][0] = 0
self.inputs['desire'][:-ModelConstants.DESIRE_LEN] = self.inputs['desire'][ModelConstants.DESIRE_LEN:]
@@ -98,8 +102,11 @@ class ModelState:
self.inputs['traffic_convention'][:] = inputs['traffic_convention']
self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']
+if not DISABLE_NAV:
self.inputs['nav_features'][:] = inputs['nav_features']
self.inputs['nav_instructions'][:] = inputs['nav_instructions']
+if DISABLE_RADAR:
+self.inputs['radar_tracks'][:] = inputs['radar_tracks']
# if getCLBuffer is not None, frame will be None
self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
@@ -158,7 +165,7 @@ def main(demo=False):
# messaging
pm = PubMaster(["modelV2", "cameraOdometry"])
-sm = SubMaster(["carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "navModel", "navInstruction", "carControl", "frogpilotPlan"])
+sm = SubMaster(["deviceState", "carState", "roadCameraState", "liveCalibration", "driverMonitoringState", "navModel", "navInstruction", "carControl", "liveTracks", "frogpilotPlan"])
publish_state = PublishState()
params = Params()
@@ -230,10 +237,11 @@ def main(demo=False):
is_rhd = sm["driverMonitoringState"].isRHD
frame_id = sm["roadCameraState"].frameId
lateral_control_params = np.array([sm["carState"].vEgo, steer_delay], dtype=np.float32)
-if sm.updated["liveCalibration"]:
+if sm.updated["liveCalibration"] and sm.seen['roadCameraState'] and sm.seen['deviceState']:
device_from_calib_euler = np.array(sm["liveCalibration"].rpyCalib, dtype=np.float32)
-model_transform_main = get_warp_matrix(device_from_calib_euler, main_wide_camera, False).astype(np.float32)
-model_transform_extra = get_warp_matrix(device_from_calib_euler, True, True).astype(np.float32)
+dc = DEVICE_CAMERAS[(str(sm['deviceState'].deviceType), str(sm['roadCameraState'].sensor))]
+model_transform_main = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics if main_wide_camera else dc.fcam.intrinsics, False).astype(np.float32)
+model_transform_extra = get_warp_matrix(device_from_calib_euler, dc.ecam.intrinsics, True).astype(np.float32)
live_calib_seen = True
traffic_convention = np.zeros(2)
@@ -246,7 +254,7 @@ def main(demo=False):
# Enable/disable nav features
timestamp_llk = sm["navModel"].locationMonoTime
nav_valid = sm.valid["navModel"] # and (nanos_since_boot() - timestamp_llk < 1e9)
-nav_enabled = nav_valid and (params.get_bool("ExperimentalMode") or params.get_bool("NavChill"))
+nav_enabled = nav_valid and not DISABLE_NAV
if not nav_enabled:
nav_features[:] = 0
@@ -267,6 +275,14 @@ def main(demo=False):
if 0 <= distance_idx < 50:
nav_instructions[distance_idx*3 + direction_idx] = 1
+radar_tracks = np.zeros(ModelConstants.RADAR_TRACKS_LEN * ModelConstants.RADAR_TRACKS_WIDTH, dtype=np.float32)
+if sm.updated["liveTracks"]:
+for i, track in enumerate(sm["liveTracks"]):
+if i >= ModelConstants.RADAR_TRACKS_LEN:
+break
+vec_index = i * ModelConstants.RADAR_TRACKS_WIDTH
+radar_tracks[vec_index:vec_index+ModelConstants.RADAR_TRACKS_WIDTH] = [track.dRel, track.yRel, track.vRel]
# tracked dropped frames
vipc_dropped_frames = max(0, meta_main.frame_id - last_vipc_frame_id - 1)
frames_dropped = frame_dropped_filter.update(min(vipc_dropped_frames, 10))
@@ -280,12 +296,13 @@ def main(demo=False):
if prepare_only:
cloudlog.error(f"skipping model eval. Dropped {vipc_dropped_frames} frames")
-inputs:Dict[str, np.ndarray] = {
+inputs:dict[str, np.ndarray] = {
'desire': vec_desire,
'traffic_convention': traffic_convention,
'lateral_control_params': lateral_control_params,
-'nav_features': nav_features,
-'nav_instructions': nav_instructions}
+**({'nav_features': nav_features, 'nav_instructions': nav_instructions} if not DISABLE_NAV else {}),
+**({'radar_tracks': radar_tracks,} if DISABLE_RADAR else {}),
+}
mt1 = time.perf_counter()
model_output = model.run(buf_main, buf_extra, model_transform_main, model_transform_extra, inputs, prepare_only)

View File

@@ -0,0 +1,66 @@
## Neural networks in openpilot
To view the architecture of the ONNX networks, you can use [netron](https://netron.app/)
## Supercombo
### Supercombo input format (Full size: 799906 x float32)
* **image stream**
  * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
    * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
      * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2] (see the packing sketch after this list)
      * Channel 4 represents the half-res U channel
      * Channel 5 represents the half-res V channel
* **wide image stream**
  * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
    * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
      * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
      * Channel 4 represents the half-res U channel
      * Channel 5 represents the half-res V channel
* **desire**
  * one-hot encoded buffer to command the model to execute certain actions; the bit needs to be sent for the past 5 seconds (at 20 FPS) : 100 * 8
* **traffic convention**
  * one-hot encoded vector to tell the model whether traffic is right-hand or left-hand : 2
* **feature buffer**
  * A buffer of intermediate features that gets appended to the current feature to form a 5 second temporal context (at 20 FPS) : 99 * 512
* **nav features**
  * 1 * 150
* **nav instructions**
  * 1 * 256
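
As a rough illustration of the image stream channel packing above, here is a minimal numpy sketch. It is illustrative only; the real preprocessing runs on the GPU via `ModelFrame` and `transforms/loadyuv.cl`.

```python
import numpy as np

def yuv420_to_6ch(y: np.ndarray, u: np.ndarray, v: np.ndarray) -> np.ndarray:
  # y: (256, 512) full-res luma plane; u, v: (128, 256) half-res chroma planes
  assert y.shape == (256, 512) and u.shape == v.shape == (128, 256)
  return np.stack([
    y[::2, ::2], y[::2, 1::2],    # channels 0, 1
    y[1::2, ::2], y[1::2, 1::2],  # channels 2, 3
    u, v,                         # channels 4, 5
  ])  # -> (6, 128, 256)

# two consecutive frames per image stream: (2, 6, 128, 256) = 393216 values
y = np.zeros((256, 512), dtype=np.uint8)
u = v = np.zeros((128, 256), dtype=np.uint8)
frames = np.stack([yuv420_to_6ch(y, u, v), yuv420_to_6ch(y, u, v)])
assert frames.size == 2 * 6 * 128 * 256
```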
### Supercombo output format (Full size: XXX x float32)
Read [here](https://github.com/commaai/openpilot/blob/90af436a121164a51da9fa48d093c29f738adf6a/selfdrive/modeld/models/driving.h#L236) for more.
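
A hedged sketch of how the flat output vector gets carved up by name, mirroring `slice_outputs` in modeld.py above. The `output_slices`/`output_shapes` metadata keys and the relative path are assumptions based on how modeld.py reads the file; check `get_model_metadata.py` for the exact schema.

```python
import pickle
import numpy as np
from openpilot.selfdrive.modeld.parse_model_outputs import Parser

# key names here are assumptions based on how modeld.py uses this pickle
with open('models/supercombo_metadata.pkl', 'rb') as f:
  metadata = pickle.load(f)

raw = np.zeros(metadata['output_shapes']['outputs'][1], dtype=np.float32)  # one flat output vector
sliced = {name: raw[np.newaxis, sl] for name, sl in metadata['output_slices'].items()}
parsed = Parser().parse_outputs(sliced)  # MDN selection and softmax/sigmoid post-processing
print({k: v.shape for k, v in parsed.items()})
```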
## Driver Monitoring Model
* .onnx model can be run with onnx runtimes
* .dlc file is a pre-quantized model and only runs on qualcomm DSPs
### input format
* single image W = 1440 H = 960 luminance channel (Y) from the planar YUV420 format:
  * full input size is 1440 * 960 = 1382400
  * normalized, ranging from 0.0 to 1.0 in float32 (onnx runner) or ranging from 0 to 255 in uint8 (snpe runner)
* camera calibration angles (roll, pitch, yaw) from liveCalibration: 3 x float32 inputs
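
A minimal sketch of building these two inputs in numpy. It is illustrative only: the shapes follow the list above, and any row-stride padding in the real camera buffer is ignored.

```python
import numpy as np

H, W = 960, 1440                                    # model input image size from the list above
yuv420 = np.zeros((H * 3 // 2, W), dtype=np.uint8)  # planar YUV420 buffer, Y plane first
y_plane = yuv420[:H, :]                             # only the luminance channel is used

calib = np.zeros(3, dtype=np.float32)               # roll, pitch, yaw from liveCalibration

input_img_onnx = (y_plane.astype(np.float32) / 255.0).ravel()  # 1382400 floats in [0.0, 1.0]
input_img_snpe = y_plane.ravel()                               # 1382400 uint8 in [0, 255]
assert input_img_onnx.size == 1440 * 960 == 1382400
```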
### output format
* 84 x float32 outputs = 2 + 41 * 2 ([parsing example](https://github.com/commaai/openpilot/blob/22ce4e17ba0d3bfcf37f8255a4dd1dc683fe0c38/selfdrive/modeld/models/dmonitoring.cc#L33))
  * for each person in the front seats (2 * 41)
    * face pose: 12 = 6 + 6
      * face orientation [pitch, yaw, roll] in camera frame: 3
      * face position [dx, dy] relative to image center: 2
      * normalized face size: 1
      * standard deviations for above outputs: 6
    * face visible probability: 1
    * eyes: 20 = (8 + 1) + (8 + 1) + 1 + 1
      * eye position and size, and their standard deviations: 8
      * eye visible probability: 1
      * eye closed probability: 1
    * wearing sunglasses probability: 1
    * face occluded probability: 1
    * touching wheel probability: 1
    * paying attention probability: 1
    * (deprecated) distracted probabilities: 2
    * using phone probability: 1
    * distracted probability: 1
  * common outputs 2
    * poor camera vision probability: 1
    * left hand drive probability: 1
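
A rough indexing sketch of those 84 floats. The block order (two 41-value driver blocks, then the two common outputs) matches the `DMonitoringModelResult` ctypes struct shown earlier, which ends with `wheel_on_right_prob`; the offsets inside a driver block simply follow the list above, so treat them as illustrative rather than authoritative.

```python
import numpy as np

raw = np.zeros(84, dtype=np.float32)          # one driver monitoring output vector

driver_lhd, driver_rhd, common = raw[:41], raw[41:82], raw[82:]

def split_driver_block(d):
  # offsets follow the per-driver breakdown listed above
  return {
    'face_pose': d[0:12],        # orientation (3) + position (2) + size (1) + stds (6)
    'face_prob': d[12],
    'eyes': d[13:33],            # (8 + 1) + (8 + 1) + 1 + 1
    'sunglasses_prob': d[33],
    'occluded_prob': d[34],
    'touching_wheel_prob': d[35],
    'paying_attention_prob': d[36],
    'deprecated_distracted': d[37:39],
    'using_phone_prob': d[39],
    'distracted_prob': d[40],
  }

poor_vision_prob, wheel_on_right_prob = common
print(list(split_driver_block(driver_lhd).keys()))
```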

View File

@@ -0,0 +1,72 @@
#include "selfdrive/modeld/models/commonmodel.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include "common/clutil.h"
#include "common/mat.h"
#include "common/timing.h"
ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
input_frames = std::make_unique<float[]>(buf_size);
q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
net_input_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_FRAME_SIZE * sizeof(float), NULL, &err));
transform_init(&transform, context, device_id);
loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
}
float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
transform_queue(&this->transform, q,
yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
if (output == NULL) {
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
clFinish(q);
return &input_frames[0];
} else {
loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true);
// NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
clFinish(q);
return NULL;
}
}
ModelFrame::~ModelFrame() {
transform_destroy(&transform);
loadyuv_destroy(&loadyuv);
CL_CHECK(clReleaseMemObject(net_input_cl));
CL_CHECK(clReleaseMemObject(v_cl));
CL_CHECK(clReleaseMemObject(u_cl));
CL_CHECK(clReleaseMemObject(y_cl));
CL_CHECK(clReleaseCommandQueue(q));
}
void softmax(const float* input, float* output, size_t len) {
const float max_val = *std::max_element(input, input + len);
float denominator = 0;
for (int i = 0; i < len; i++) {
float const v_exp = expf(input[i] - max_val);
denominator += v_exp;
output[i] = v_exp;
}
const float inv_denominator = 1. / denominator;
for (int i = 0; i < len; i++) {
output[i] *= inv_denominator;
}
}
float sigmoid(float input) {
return 1 / (1 + expf(-input));
}

View File

@@ -0,0 +1,47 @@
#pragma once
#include <cfloat>
#include <cstdlib>
#include <memory>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include "common/mat.h"
#include "cereal/messaging/messaging.h"
#include "selfdrive/modeld/transforms/loadyuv.h"
#include "selfdrive/modeld/transforms/transform.h"
const bool send_raw_pred = getenv("SEND_RAW_PRED") != NULL;
void softmax(const float* input, float* output, size_t len);
float sigmoid(float input);
template<class T, size_t size>
constexpr const kj::ArrayPtr<const T> to_kj_array_ptr(const std::array<T, size> &arr) {
return kj::ArrayPtr(arr.data(), arr.size());
}
class ModelFrame {
public:
ModelFrame(cl_device_id device_id, cl_context context);
~ModelFrame();
float* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
const int MODEL_WIDTH = 512;
const int MODEL_HEIGHT = 256;
const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
const int buf_size = MODEL_FRAME_SIZE * 2;
private:
Transform transform;
LoadYUVState loadyuv;
cl_command_queue q;
cl_mem y_cl, u_cl, v_cl, net_input_cl;
std::unique_ptr<float[]> input_frames;
};

File diff suppressed because it is too large

View File

@@ -0,0 +1,2 @@
60abed33-9f25-4e34-9937-aaf918d41dfc
50887963963a3022e85ac0c94b0801aef955a608

Binary file not shown.

File diff suppressed because it is too large

View File

@@ -5,7 +5,6 @@ import time
import ctypes
import numpy as np
from pathlib import Path
-from typing import Tuple, Dict
from cereal import messaging
from cereal.messaging import PubMaster, SubMaster
@@ -41,7 +40,7 @@ class NavModelResult(ctypes.Structure):
("features", ctypes.c_float*NAV_FEATURE_LEN)]
class ModelState:
-inputs: Dict[str, np.ndarray]
+inputs: dict[str, np.ndarray]
output: np.ndarray
model: ModelRunner
@@ -52,7 +51,7 @@ class ModelState:
self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.DSP, True, None)
self.model.addInput("input_img", None)
-def run(self, buf:np.ndarray) -> Tuple[np.ndarray, float]:
+def run(self, buf:np.ndarray) -> tuple[np.ndarray, float]:
self.inputs['input_img'][:] = buf
t1 = time.perf_counter()

View File

@@ -1,5 +1,4 @@
import numpy as np
-from typing import Dict
from openpilot.selfdrive.modeld.constants import ModelConstants
def sigmoid(x):
@@ -82,7 +81,7 @@ class Parser:
outs[name] = pred_mu_final.reshape(final_shape)
outs[name + '_stds'] = pred_std_final.reshape(final_shape)
-def parse_outputs(self, outs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+def parse_outputs(self, outs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
self.parse_mdn('plan', outs, in_N=ModelConstants.PLAN_MHP_N, out_N=ModelConstants.PLAN_MHP_SELECTION,
out_shape=(ModelConstants.IDX_N,ModelConstants.PLAN_WIDTH))
self.parse_mdn('lane_lines', outs, in_N=0, out_N=0, out_shape=(ModelConstants.NUM_LANE_LINES,ModelConstants.IDX_N,ModelConstants.LANE_LINES_WIDTH))

View File

@@ -3,7 +3,7 @@ import itertools
import os
import sys
import numpy as np
-from typing import Tuple, Dict, Union, Any
+from typing import Any
from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel
@@ -38,7 +38,7 @@ def create_ort_session(path, fp16_to_fp32):
options = ort.SessionOptions()
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
-provider: Union[str, Tuple[str, Dict[Any, Any]]]
+provider: str | tuple[str, dict[Any, Any]]
if 'OpenVINOExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
provider = 'OpenVINOExecutionProvider'
elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:

View File

@@ -0,0 +1,4 @@
#pragma once
#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/runners/snpemodel.h"

View File

@@ -0,0 +1,49 @@
#pragma once
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include "common/clutil.h"
#include "common/swaglog.h"
#define USE_CPU_RUNTIME 0
#define USE_GPU_RUNTIME 1
#define USE_DSP_RUNTIME 2
struct ModelInput {
const std::string name;
float *buffer;
int size;
ModelInput(const std::string _name, float *_buffer, int _size) : name(_name), buffer(_buffer), size(_size) {}
virtual void setBuffer(float *_buffer, int _size) {
assert(size == _size || size == 0);
buffer = _buffer;
size = _size;
}
};
class RunModel {
public:
std::vector<std::unique_ptr<ModelInput>> inputs;
virtual ~RunModel() {}
virtual void execute() {}
virtual void* getCLBuffer(const std::string name) { return nullptr; }
virtual void addInput(const std::string name, float *buffer, int size) {
inputs.push_back(std::unique_ptr<ModelInput>(new ModelInput(name, buffer, size)));
}
virtual void setInputBuffer(const std::string name, float *buffer, int size) {
for (auto &input : inputs) {
if (name == input->name) {
input->setBuffer(buffer, size);
return;
}
}
LOGE("Tried to update input `%s` but no input with this name exists", name.c_str());
assert(false);
}
};

File diff suppressed because it is too large

View File

@@ -0,0 +1,116 @@
#pragma clang diagnostic ignored "-Wexceptions"
#include "selfdrive/modeld/runners/snpemodel.h"
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "common/util.h"
#include "common/timing.h"
void PrintErrorStringAndExit() {
std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
std::exit(EXIT_FAILURE);
}
SNPEModel::SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool _use_tf8, cl_context context) {
output = _output;
output_size = _output_size;
use_tf8 = _use_tf8;
#ifdef QCOM2
if (runtime == USE_GPU_RUNTIME) {
snpe_runtime = zdl::DlSystem::Runtime_t::GPU;
} else if (runtime == USE_DSP_RUNTIME) {
snpe_runtime = zdl::DlSystem::Runtime_t::DSP;
} else {
snpe_runtime = zdl::DlSystem::Runtime_t::CPU;
}
assert(zdl::SNPE::SNPEFactory::isRuntimeAvailable(snpe_runtime));
#endif
model_data = util::read_file(path);
assert(model_data.size() > 0);
// load model
std::unique_ptr<zdl::DlContainer::IDlContainer> container = zdl::DlContainer::IDlContainer::open((uint8_t*)model_data.data(), model_data.size());
if (!container) { PrintErrorStringAndExit(); }
LOGW("loaded model with size: %lu", model_data.size());
// create model runner
zdl::SNPE::SNPEBuilder snpe_builder(container.get());
while (!snpe) {
#ifdef QCOM2
snpe = snpe_builder.setOutputLayers({})
.setRuntimeProcessor(snpe_runtime)
.setUseUserSuppliedBuffers(true)
.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
.build();
#else
snpe = snpe_builder.setOutputLayers({})
.setUseUserSuppliedBuffers(true)
.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
.build();
#endif
if (!snpe) std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
}
// create output buffer
zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
const auto &output_tensor_names_opt = snpe->getOutputTensorNames();
if (!output_tensor_names_opt) throw std::runtime_error("Error obtaining output tensor names");
const auto &output_tensor_names = *output_tensor_names_opt;
assert(output_tensor_names.size() == 1);
const char *output_tensor_name = output_tensor_names.at(0);
const zdl::DlSystem::TensorShape &buffer_shape = snpe->getInputOutputBufferAttributes(output_tensor_name)->getDims();
if (output_size != 0) {
assert(output_size == buffer_shape[1]);
} else {
output_size = buffer_shape[1];
}
std::vector<size_t> output_strides = {output_size * sizeof(float), sizeof(float)};
output_buffer = ub_factory.createUserBuffer(output, output_size * sizeof(float), output_strides, &ub_encoding_float);
output_map.add(output_tensor_name, output_buffer.get());
}
void SNPEModel::addInput(const std::string name, float *buffer, int size) {
const int idx = inputs.size();
const auto &input_tensor_names_opt = snpe->getInputTensorNames();
if (!input_tensor_names_opt) throw std::runtime_error("Error obtaining input tensor names");
const auto &input_tensor_names = *input_tensor_names_opt;
const char *input_tensor_name = input_tensor_names.at(idx);
const bool input_tf8 = use_tf8 && strcmp(input_tensor_name, "input_img") == 0; // TODO: This is a terrible hack, get rid of this name check both here and in onnx_runner.py
LOGW("adding index %d: %s", idx, input_tensor_name);
zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
zdl::DlSystem::UserBufferEncodingTf8 ub_encoding_tf8(0, 1./255); // network takes 0-1
zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
zdl::DlSystem::UserBufferEncoding *input_encoding = input_tf8 ? (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_tf8 : (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_float;
const auto &buffer_shape_opt = snpe->getInputDimensions(input_tensor_name);
const zdl::DlSystem::TensorShape &buffer_shape = *buffer_shape_opt;
size_t size_of_input = input_tf8 ? sizeof(uint8_t) : sizeof(float);
std::vector<size_t> strides(buffer_shape.rank());
strides[strides.size() - 1] = size_of_input;
size_t product = 1;
for (size_t i = 0; i < buffer_shape.rank(); i++) product *= buffer_shape[i];
size_t stride = strides[strides.size() - 1];
for (size_t i = buffer_shape.rank() - 1; i > 0; i--) {
stride *= buffer_shape[i];
strides[i-1] = stride;
}
auto input_buffer = ub_factory.createUserBuffer(buffer, product*size_of_input, strides, input_encoding);
input_map.add(input_tensor_name, input_buffer.get());
inputs.push_back(std::unique_ptr<SNPEModelInput>(new SNPEModelInput(name, buffer, size, std::move(input_buffer))));
}
void SNPEModel::execute() {
if (!snpe->execute(input_map, output_map)) {
PrintErrorStringAndExit();
}
}

View File

@@ -0,0 +1,52 @@
#pragma once
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#include <memory>
#include <string>
#include <utility>
#include <DlContainer/IDlContainer.hpp>
#include <DlSystem/DlError.hpp>
#include <DlSystem/ITensor.hpp>
#include <DlSystem/ITensorFactory.hpp>
#include <DlSystem/IUserBuffer.hpp>
#include <DlSystem/IUserBufferFactory.hpp>
#include <SNPE/SNPE.hpp>
#include <SNPE/SNPEBuilder.hpp>
#include <SNPE/SNPEFactory.hpp>
#include "selfdrive/modeld/runners/runmodel.h"
struct SNPEModelInput : public ModelInput {
std::unique_ptr<zdl::DlSystem::IUserBuffer> snpe_buffer;
SNPEModelInput(const std::string _name, float *_buffer, int _size, std::unique_ptr<zdl::DlSystem::IUserBuffer> _snpe_buffer) : ModelInput(_name, _buffer, _size), snpe_buffer(std::move(_snpe_buffer)) {}
void setBuffer(float *_buffer, int _size) {
ModelInput::setBuffer(_buffer, _size);
assert(snpe_buffer->setBufferAddress(_buffer) == true);
}
};
class SNPEModel : public RunModel {
public:
SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
void addInput(const std::string name, float *buffer, int size);
void execute();
private:
std::string model_data;
#ifdef QCOM2
zdl::DlSystem::Runtime_t snpe_runtime;
#endif
// snpe model stuff
std::unique_ptr<zdl::SNPE::SNPE> snpe;
zdl::DlSystem::UserBufferMap input_map;
zdl::DlSystem::UserBufferMap output_map;
std::unique_ptr<zdl::DlSystem::IUserBuffer> output_buffer;
bool use_tf8;
float *output;
size_t output_size;
};

File diff suppressed because it is too large

View File

@@ -0,0 +1,58 @@
#include "selfdrive/modeld/runners/thneedmodel.h"
#include <string>
#include "common/swaglog.h"
ThneedModel::ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool luse_tf8, cl_context context) {
thneed = new Thneed(true, context);
thneed->load(path.c_str());
thneed->clexec();
recorded = false;
output = _output;
}
void* ThneedModel::getCLBuffer(const std::string name) {
int index = -1;
for (int i = 0; i < inputs.size(); i++) {
if (name == inputs[i]->name) {
index = i;
break;
}
}
if (index == -1) {
LOGE("Tried to get CL buffer for input `%s` but no input with this name exists", name.c_str());
assert(false);
}
if (thneed->input_clmem.size() >= inputs.size()) {
return &thneed->input_clmem[inputs.size() - index - 1];
} else {
return nullptr;
}
}
void ThneedModel::execute() {
if (!recorded) {
thneed->record = true;
float *input_buffers[inputs.size()];
for (int i = 0; i < inputs.size(); i++) {
input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
}
thneed->copy_inputs(input_buffers);
thneed->clexec();
thneed->copy_output(output);
thneed->stop();
recorded = true;
} else {
float *input_buffers[inputs.size()];
for (int i = 0; i < inputs.size(); i++) {
input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
}
thneed->execute(input_buffers, output);
}
}

View File

@@ -0,0 +1,17 @@
#pragma once
#include <string>
#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/thneed/thneed.h"
class ThneedModel : public RunModel {
public:
ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
void *getCLBuffer(const std::string name);
void execute();
private:
Thneed *thneed = NULL;
bool recorded;
float *output;
};

File diff suppressed because it is too large

View File

View File

@@ -0,0 +1,101 @@
// clang++ -O2 repro.cc && ./a.out
#include <sched.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
static inline double millis_since_boot() {
struct timespec t;
clock_gettime(CLOCK_BOOTTIME, &t);
return t.tv_sec * 1000.0 + t.tv_nsec * 1e-6;
}
#define MODEL_WIDTH 320
#define MODEL_HEIGHT 640
// null function still breaks it
#define input_lambda(x) x
// this is copied from models/dmonitoring.cc, and is the code that triggers the issue
void inner(uint8_t *resized_buf, float *net_input_buf) {
int resized_width = MODEL_WIDTH;
int resized_height = MODEL_HEIGHT;
// one shot conversion, O(n) anyway
// yuvframe2tensor, normalize
for (int r = 0; r < MODEL_HEIGHT/2; r++) {
for (int c = 0; c < MODEL_WIDTH/2; c++) {
// Y_ul
net_input_buf[(c*MODEL_HEIGHT/2) + r] = input_lambda(resized_buf[(2*r*resized_width) + (2*c)]);
// Y_ur
net_input_buf[(c*MODEL_HEIGHT/2) + r + (2*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width) + (2*c+1)]);
// Y_dl
net_input_buf[(c*MODEL_HEIGHT/2) + r + ((MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width+1) + (2*c)]);
// Y_dr
net_input_buf[(c*MODEL_HEIGHT/2) + r + (3*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width+1) + (2*c+1)]);
// U
net_input_buf[(c*MODEL_HEIGHT/2) + r + (4*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(resized_width*resized_height) + (r*resized_width/2) + c]);
// V
net_input_buf[(c*MODEL_HEIGHT/2) + r + (5*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(resized_width*resized_height) + ((resized_width/2)*(resized_height/2)) + (r*resized_width/2) + c]);
}
}
}
float trial() {
int resized_width = MODEL_WIDTH;
int resized_height = MODEL_HEIGHT;
int yuv_buf_len = (MODEL_WIDTH/2) * (MODEL_HEIGHT/2) * 6; // Y|u|v -> y|y|y|y|u|v
// allocate the buffers
uint8_t *resized_buf = (uint8_t*)malloc(resized_width*resized_height*3/2);
float *net_input_buf = (float*)malloc(yuv_buf_len*sizeof(float));
printf("allocate -- %p 0x%x -- %p 0x%lx\n", resized_buf, resized_width*resized_height*3/2, net_input_buf, yuv_buf_len*sizeof(float));
// test for bad buffers
static int CNT = 20;
float avg = 0.0;
for (int i = 0; i < CNT; i++) {
double s4 = millis_since_boot();
inner(resized_buf, net_input_buf);
double s5 = millis_since_boot();
avg += s5-s4;
}
avg /= CNT;
// once it's bad, it's reliably bad
if (avg > 10) {
printf("HIT %f\n", avg);
printf("BAD\n");
for (int i = 0; i < 200; i++) {
double s4 = millis_since_boot();
inner(resized_buf, net_input_buf);
double s5 = millis_since_boot();
printf("%.2f ", s5-s4);
}
printf("\n");
exit(0);
}
// don't free so we get a different buffer each time
//free(resized_buf);
//free(net_input_buf);
return avg;
}
int main() {
while (true) {
float ret = trial();
printf("got %f\n", ret);
}
}

View File

@@ -0,0 +1 @@
benchmark

View File

@@ -0,0 +1,192 @@
#include <SNPE/SNPE.hpp>
#include <SNPE/SNPEBuilder.hpp>
#include <SNPE/SNPEFactory.hpp>
#include <DlContainer/IDlContainer.hpp>
#include <DlSystem/DlError.hpp>
#include <DlSystem/ITensor.hpp>
#include <DlSystem/ITensorFactory.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;
int64_t timespecDiff(struct timespec *timeA_p, struct timespec *timeB_p) {
return ((timeA_p->tv_sec * 1000000000) + timeA_p->tv_nsec) - ((timeB_p->tv_sec * 1000000000) + timeB_p->tv_nsec);
}
void PrintErrorStringAndExit() {
cout << "ERROR!" << endl;
const char* const errStr = zdl::DlSystem::getLastErrorString();
std::cerr << errStr << std::endl;
std::exit(EXIT_FAILURE);
}
zdl::DlSystem::Runtime_t checkRuntime() {
static zdl::DlSystem::Version_t Version = zdl::SNPE::SNPEFactory::getLibraryVersion();
static zdl::DlSystem::Runtime_t Runtime;
std::cout << "SNPE Version: " << Version.asString().c_str() << std::endl; //Print Version number
if (zdl::SNPE::SNPEFactory::isRuntimeAvailable(zdl::DlSystem::Runtime_t::DSP)) {
std::cout << "Using DSP runtime" << std::endl;
Runtime = zdl::DlSystem::Runtime_t::DSP;
} else if (zdl::SNPE::SNPEFactory::isRuntimeAvailable(zdl::DlSystem::Runtime_t::GPU)) {
std::cout << "Using GPU runtime" << std::endl;
Runtime = zdl::DlSystem::Runtime_t::GPU;
} else {
std::cout << "Using cpu runtime" << std::endl;
Runtime = zdl::DlSystem::Runtime_t::CPU;
}
return Runtime;
}
void test(char *filename) {
static zdl::DlSystem::Runtime_t runtime = checkRuntime();
std::unique_ptr<zdl::DlContainer::IDlContainer> container;
container = zdl::DlContainer::IDlContainer::open(filename);
if (!container) { PrintErrorStringAndExit(); }
cout << "start build" << endl;
std::unique_ptr<zdl::SNPE::SNPE> snpe;
{
snpe = NULL;
zdl::SNPE::SNPEBuilder snpeBuilder(container.get());
snpe = snpeBuilder.setOutputLayers({})
.setRuntimeProcessor(runtime)
.setUseUserSuppliedBuffers(false)
//.setDebugMode(true)
.build();
if (!snpe) {
cout << "ERROR!" << endl;
const char* const errStr = zdl::DlSystem::getLastErrorString();
std::cerr << errStr << std::endl;
}
cout << "ran snpeBuilder" << endl;
}
const auto &strList_opt = snpe->getInputTensorNames();
if (!strList_opt) throw std::runtime_error("Error obtaining input tensor names");
cout << "get input tensor names done" << endl;
const auto &strList = *strList_opt;
static zdl::DlSystem::TensorMap inputTensorMap;
static zdl::DlSystem::TensorMap outputTensorMap;
vector<std::unique_ptr<zdl::DlSystem::ITensor> > inputs;
for (int i = 0; i < strList.size(); i++) {
cout << "input name: " << strList.at(i) << endl;
const auto &inputDims_opt = snpe->getInputDimensions(strList.at(i));
const auto &inputShape = *inputDims_opt;
inputs.push_back(zdl::SNPE::SNPEFactory::getTensorFactory().createTensor(inputShape));
inputTensorMap.add(strList.at(i), inputs[i].get());
}
struct timespec start, end;
cout << "**** starting benchmark ****" << endl;
for (int i = 0; i < 50; i++) {
clock_gettime(CLOCK_MONOTONIC, &start);
int err = snpe->execute(inputTensorMap, outputTensorMap);
assert(err == true);
clock_gettime(CLOCK_MONOTONIC, &end);
uint64_t timeElapsed = timespecDiff(&end, &start);
printf("time: %f ms\n", timeElapsed*1.0/1e6);
}
}
void get_testframe(int index, std::unique_ptr<zdl::DlSystem::ITensor> &input) {
FILE * pFile;
string filepath="/data/ipt/quantize_samples/sample_input_"+std::to_string(index);
pFile = fopen(filepath.c_str(), "rb");
int length = 1*6*160*320*4;
float * frame_buffer = new float[length/4]; // 32/8
fread(frame_buffer, length, 1, pFile);
// std::cout << *(frame_buffer+length/4-1) << std::endl;
std::copy(frame_buffer, frame_buffer+(length/4), input->begin());
fclose(pFile);
}
void SaveITensor(const std::string& path, const zdl::DlSystem::ITensor* tensor)
{
std::ofstream os(path, std::ofstream::binary);
if (!os)
{
std::cerr << "Failed to open output file for writing: " << path << "\n";
std::exit(EXIT_FAILURE);
}
for ( auto it = tensor->cbegin(); it != tensor->cend(); ++it )
{
float f = *it;
if (!os.write(reinterpret_cast<char*>(&f), sizeof(float)))
{
std::cerr << "Failed to write data to: " << path << "\n";
std::exit(EXIT_FAILURE);
}
}
}
void testrun(char* modelfile) {
static zdl::DlSystem::Runtime_t runtime = checkRuntime();
std::unique_ptr<zdl::DlContainer::IDlContainer> container;
container = zdl::DlContainer::IDlContainer::open(modelfile);
if (!container) { PrintErrorStringAndExit(); }
cout << "start build" << endl;
std::unique_ptr<zdl::SNPE::SNPE> snpe;
{
snpe = NULL;
zdl::SNPE::SNPEBuilder snpeBuilder(container.get());
snpe = snpeBuilder.setOutputLayers({})
.setRuntimeProcessor(runtime)
.setUseUserSuppliedBuffers(false)
//.setDebugMode(true)
.build();
if (!snpe) {
cout << "ERROR!" << endl;
const char* const errStr = zdl::DlSystem::getLastErrorString();
std::cerr << errStr << std::endl;
}
cout << "ran snpeBuilder" << endl;
}
const auto &strList_opt = snpe->getInputTensorNames();
if (!strList_opt) throw std::runtime_error("Error obtaining input tensor names");
cout << "get input tensor names done" << endl;
const auto &strList = *strList_opt;
static zdl::DlSystem::TensorMap inputTensorMap;
static zdl::DlSystem::TensorMap outputTensorMap;
assert(strList.size() == 1);
const auto &inputDims_opt = snpe->getInputDimensions(strList.at(0));
const auto &inputShape = *inputDims_opt;
std::cout << "winkwink" << std::endl;
for (int i=0; i<10000; i++) {
std::unique_ptr<zdl::DlSystem::ITensor> input;
input = zdl::SNPE::SNPEFactory::getTensorFactory().createTensor(inputShape);
get_testframe(i, input);
snpe->execute(input.get(), outputTensorMap);
zdl::DlSystem::StringList tensorNames = outputTensorMap.getTensorNames();
std::for_each(tensorNames.begin(), tensorNames.end(), [&](const char* name) {
std::ostringstream path;
path << "/data/opt/Result_" << std::to_string(i) << ".raw";
auto tensorPtr = outputTensorMap.getTensor(name);
SaveITensor(path.str(), tensorPtr);
});
}
}
int main(int argc, char* argv[]) {
if (argc < 2) {
printf("usage: %s <filename>\n", argv[0]);
return -1;
}
if (argc == 2) {
while (true) test(argv[1]);
} else if (argc == 3) {
testrun(argv[1]);
}
return 0;
}

View File

@@ -0,0 +1,4 @@
#!/bin/sh -e
clang++ -I /data/openpilot/third_party/snpe/include/ -L/data/pythonpath/third_party/snpe/aarch64 -lSNPE benchmark.cc -o benchmark
export LD_LIBRARY_PATH="/data/pythonpath/third_party/snpe/aarch64/:$HOME/openpilot/third_party/snpe/x86_64/:$LD_LIBRARY_PATH"
exec ./benchmark $1

View File

@@ -0,0 +1,107 @@
#!/usr/bin/env python3
import unittest
import numpy as np
import random
import cereal.messaging as messaging
from cereal.visionipc import VisionIpcServer, VisionStreamType
from openpilot.common.transformations.camera import DEVICE_CAMERAS
from openpilot.common.realtime import DT_MDL
from openpilot.selfdrive.car.car_helpers import write_car_param
from openpilot.selfdrive.manager.process_config import managed_processes
from openpilot.selfdrive.test.process_replay.vision_meta import meta_from_camera_state
CAM = DEVICE_CAMERAS[("tici", "ar0231")].fcam
IMG = np.zeros(int(CAM.width*CAM.height*(3/2)), dtype=np.uint8)
IMG_BYTES = IMG.flatten().tobytes()
class TestModeld(unittest.TestCase):
def setUp(self):
self.vipc_server = VisionIpcServer("camerad")
self.vipc_server.create_buffers(VisionStreamType.VISION_STREAM_ROAD, 40, False, CAM.width, CAM.height)
self.vipc_server.create_buffers(VisionStreamType.VISION_STREAM_DRIVER, 40, False, CAM.width, CAM.height)
self.vipc_server.create_buffers(VisionStreamType.VISION_STREAM_WIDE_ROAD, 40, False, CAM.width, CAM.height)
self.vipc_server.start_listener()
write_car_param()
self.sm = messaging.SubMaster(['modelV2', 'cameraOdometry'])
self.pm = messaging.PubMaster(['roadCameraState', 'wideRoadCameraState', 'liveCalibration'])
managed_processes['modeld'].start()
self.pm.wait_for_readers_to_update("roadCameraState", 10)
def tearDown(self):
managed_processes['modeld'].stop()
del self.vipc_server
def _send_frames(self, frame_id, cams=None):
if cams is None:
cams = ('roadCameraState', 'wideRoadCameraState')
cs = None
for cam in cams:
msg = messaging.new_message(cam)
cs = getattr(msg, cam)
cs.frameId = frame_id
cs.timestampSof = int((frame_id * DT_MDL) * 1e9)
cs.timestampEof = int(cs.timestampSof + (DT_MDL * 1e9))
cam_meta = meta_from_camera_state(cam)
self.pm.send(msg.which(), msg)
self.vipc_server.send(cam_meta.stream, IMG_BYTES, cs.frameId,
cs.timestampSof, cs.timestampEof)
return cs
def _wait(self):
self.sm.update(5000)
if self.sm['modelV2'].frameId != self.sm['cameraOdometry'].frameId:
self.sm.update(1000)
def test_modeld(self):
for n in range(1, 500):
cs = self._send_frames(n)
self._wait()
mdl = self.sm['modelV2']
self.assertEqual(mdl.frameId, n)
self.assertEqual(mdl.frameIdExtra, n)
self.assertEqual(mdl.timestampEof, cs.timestampEof)
self.assertEqual(mdl.frameAge, 0)
self.assertEqual(mdl.frameDropPerc, 0)
odo = self.sm['cameraOdometry']
self.assertEqual(odo.frameId, n)
self.assertEqual(odo.timestampEof, cs.timestampEof)
def test_dropped_frames(self):
"""
modeld should only run on consecutive road frames
"""
frame_id = -1
road_frames = list()
for n in range(1, 50):
if (random.random() < 0.1) and n > 3:
cams = random.choice([(), ('wideRoadCameraState', )])
self._send_frames(n, cams)
else:
self._send_frames(n)
road_frames.append(n)
self._wait()
if len(road_frames) < 3 or road_frames[-1] - road_frames[-2] == 1:
frame_id = road_frames[-1]
mdl = self.sm['modelV2']
odo = self.sm['cameraOdometry']
self.assertEqual(mdl.frameId, frame_id)
self.assertEqual(mdl.frameIdExtra, frame_id)
self.assertEqual(odo.frameId, frame_id)
if n != frame_id:
self.assertFalse(self.sm.updated['modelV2'])
self.assertFalse(self.sm.updated['cameraOdometry'])
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,2 @@
#!/bin/bash
clang++ -I /home/batman/one/external/tensorflow/include/ -L /home/batman/one/external/tensorflow/lib -Wl,-rpath=/home/batman/one/external/tensorflow/lib main.cc -ltensorflow

View File

@@ -0,0 +1,69 @@
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include "tensorflow/c/c_api.h"
void* read_file(const char* path, size_t* out_len) {
FILE* f = fopen(path, "r");
if (!f) {
return NULL;
}
fseek(f, 0, SEEK_END);
long f_len = ftell(f);
rewind(f);
char* buf = (char*)calloc(f_len, 1);
assert(buf);
size_t num_read = fread(buf, f_len, 1, f);
fclose(f);
if (num_read != 1) {
free(buf);
return NULL;
}
if (out_len) {
*out_len = f_len;
}
return buf;
}
static void DeallocateBuffer(void* data, size_t) {
free(data);
}
int main(int argc, char* argv[]) {
TF_Buffer* buf;
TF_Graph* graph;
TF_Status* status;
char *path = argv[1];
// load model
{
size_t model_size;
char tmp[1024];
snprintf(tmp, sizeof(tmp), "%s.pb", path);
printf("loading model %s\n", tmp);
uint8_t *model_data = (uint8_t *)read_file(tmp, &model_size);
buf = TF_NewBuffer();
buf->data = model_data;
buf->length = model_size;
buf->data_deallocator = DeallocateBuffer;
printf("loaded model of size %d\n", model_size);
}
// import graph
status = TF_NewStatus();
graph = TF_NewGraph();
TF_ImportGraphDefOptions *opts = TF_NewImportGraphDefOptions();
TF_GraphImportGraphDef(graph, buf, opts, status);
TF_DeleteImportGraphDefOptions(opts);
TF_DeleteBuffer(buf);
if (TF_GetCode(status) != TF_OK) {
printf("FAIL: %s\n", TF_Message(status));
} else {
printf("SUCCESS\n");
}
}

View File

@@ -0,0 +1,8 @@
#!/usr/bin/env python3
import sys
import tensorflow as tf
with open(sys.argv[1], "rb") as f:
graph_def = tf.compat.v1.GraphDef()
graph_def.ParseFromString(f.read())
#tf.io.write_graph(graph_def, '', sys.argv[1]+".try")

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# type: ignore
import os
import time
import numpy as np
import cereal.messaging as messaging
from openpilot.selfdrive.manager.process_config import managed_processes
N = int(os.getenv("N", "5"))
TIME = int(os.getenv("TIME", "30"))
if __name__ == "__main__":
  sock = messaging.sub_sock('modelV2', conflate=False, timeout=1000)

  execution_times = []

  for _ in range(N):
    os.environ['LOGPRINT'] = 'debug'
    managed_processes['modeld'].start()
    time.sleep(5)

    t = []
    start = time.monotonic()
    while time.monotonic() - start < TIME:
      msgs = messaging.drain_sock(sock, wait_for_one=True)
      for m in msgs:
        t.append(m.modelV2.modelExecutionTime)

    execution_times.append(np.array(t[10:]) * 1000)
    managed_processes['modeld'].stop()

  print("\n\n")
  print(f"ran modeld {N} times for {TIME}s each")
  for t in execution_times:
    print(f"\tavg: {sum(t)/len(t):0.2f}ms, min: {min(t):0.2f}ms, max: {max(t):0.2f}ms")
  print("\n\n")

View File

@@ -0,0 +1,8 @@
thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster.
It runs on the local device and caches a single model run. Then it replays it, but fast.
thneed slices through abstraction layers like a fish.
You need a thneed.
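A rough sketch of that flow (not part of the original note; it follows the Thneed interface declared in thneed.h later in this diff, and the first-run/replay split mirrors runners/thneedmodel.cc, which is not shown here):

#include "selfdrive/modeld/thneed/thneed.h"

// first run: execute the queued CL kernels once with recording on, so the
// driver-level GPU commands get cached; every later run just replays that cache
void run_model(Thneed *t, float **inputs, float *output, bool first_run) {
  if (first_run) {
    t->record = true;
    t->copy_inputs(inputs);
    t->clexec();                 // real CL execution, commands are captured along the way
    t->copy_output(output);
    t->stop();                   // recording done
  } else {
    t->execute(inputs, output);  // replay the cached commands, but fast
  }
}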

View File

@@ -0,0 +1,154 @@
#include <cassert>
#include <set>
#include "third_party/json11/json11.hpp"
#include "common/util.h"
#include "common/clutil.h"
#include "common/swaglog.h"
#include "selfdrive/modeld/thneed/thneed.h"
using namespace json11;
extern map<cl_program, string> g_program_source;
void Thneed::load(const char *filename) {
LOGD("Thneed::load: loading from %s\n", filename);
string buf = util::read_file(filename);
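// file layout: a 4-byte JSON length, the JSON metadata (objects, programs, inputs,
// outputs, binaries, kernels), then the raw buffer contents and program binaries it references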
int jsz = *(int *)buf.data();
string jsonerr;
string jj(buf.data() + sizeof(int), jsz);
Json jdat = Json::parse(jj, jsonerr);
map<cl_mem, cl_mem> real_mem;
real_mem[NULL] = NULL;
int ptr = sizeof(int)+jsz;
for (auto &obj : jdat["objects"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem clbuf = NULL;
if (mobj["buffer_id"].string_value().size() > 0) {
// image buffer must already be allocated
clbuf = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(mobj["needs_load"].bool_value() == false);
} else {
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, &buf[ptr], NULL);
if (debug >= 1) printf("loading %p %d @ 0x%X\n", clbuf, sz, ptr);
ptr += sz;
} else {
// TODO: is there a faster way to init zeroed out buffers?
void *host_zeros = calloc(sz, 1);
clbuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sz, host_zeros, NULL);
free(host_zeros);
}
}
assert(clbuf != NULL);
if (mobj["arg_type"] == "image2d_t" || mobj["arg_type"] == "image1d_t") {
cl_image_desc desc = {0};
desc.image_type = (mobj["arg_type"] == "image2d_t") ? CL_MEM_OBJECT_IMAGE2D : CL_MEM_OBJECT_IMAGE1D_BUFFER;
desc.image_width = mobj["width"].int_value();
desc.image_height = mobj["height"].int_value();
desc.image_row_pitch = mobj["row_pitch"].int_value();
assert(sz == desc.image_height*desc.image_row_pitch);
#ifdef QCOM2
desc.buffer = clbuf;
#else
// TODO: we are creating unused buffers on PC
clReleaseMemObject(clbuf);
#endif
cl_image_format format = {0};
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = mobj["float32"].bool_value() ? CL_FLOAT : CL_HALF_FLOAT;
cl_int errcode;
#ifndef QCOM2
if (mobj["needs_load"].bool_value()) {
clbuf = clCreateImage(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, &format, &desc, &buf[ptr-sz], &errcode);
} else {
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
}
#else
clbuf = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &errcode);
#endif
if (clbuf == NULL) {
LOGE("clError: %s create image %zux%zu rp %zu with buffer %p\n", cl_get_error_string(errcode),
desc.image_width, desc.image_height, desc.image_row_pitch, desc.buffer);
}
assert(clbuf != NULL);
}
real_mem[*(cl_mem*)(mobj["id"].string_value().data())] = clbuf;
}
map<string, cl_program> g_programs;
for (const auto &[name, source] : jdat["programs"].object_items()) {
if (debug >= 1) printf("building %s with size %zu\n", name.c_str(), source.string_value().size());
g_programs[name] = cl_program_from_source(context, device_id, source.string_value());
}
for (auto &obj : jdat["inputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
cl_mem aa = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
input_clmem.push_back(aa);
input_sizes.push_back(sz);
LOGD("Thneed::load: adding input %s with size %d\n", mobj["name"].string_value().data(), sz);
cl_int cl_err;
void *ret = clEnqueueMapBuffer(command_queue, aa, CL_TRUE, CL_MAP_WRITE, 0, sz, 0, NULL, NULL, &cl_err);
if (cl_err != CL_SUCCESS) LOGE("clError: %s map %p %d\n", cl_get_error_string(cl_err), aa, sz);
assert(cl_err == CL_SUCCESS);
inputs.push_back(ret);
}
for (auto &obj : jdat["outputs"].array_items()) {
auto mobj = obj.object_items();
int sz = mobj["size"].int_value();
LOGD("Thneed::save: adding output with size %d\n", sz);
// TODO: support multiple outputs
output = real_mem[*(cl_mem*)(mobj["buffer_id"].string_value().data())];
assert(output != NULL);
}
for (auto &obj : jdat["binaries"].array_items()) {
string name = obj["name"].string_value();
size_t length = obj["length"].int_value();
if (debug >= 1) printf("binary %s with size %zu\n", name.c_str(), length);
g_programs[name] = cl_program_from_binary(context, device_id, (const uint8_t*)&buf[ptr], length);
ptr += length;
}
for (auto &obj : jdat["kernels"].array_items()) {
auto gws = obj["global_work_size"];
auto lws = obj["local_work_size"];
auto kk = shared_ptr<CLQueuedKernel>(new CLQueuedKernel(this));
kk->name = obj["name"].string_value();
kk->program = g_programs[kk->name];
kk->work_dim = obj["work_dim"].int_value();
for (int i = 0; i < kk->work_dim; i++) {
kk->global_work_size[i] = gws[i].int_value();
kk->local_work_size[i] = lws[i].int_value();
}
kk->num_args = obj["num_args"].int_value();
for (int i = 0; i < kk->num_args; i++) {
string arg = obj["args"].array_items()[i].string_value();
int arg_size = obj["args_size"].array_items()[i].int_value();
kk->args_size.push_back(arg_size);
if (arg_size == 8) {
cl_mem val = *(cl_mem*)(arg.data());
val = real_mem[val];
kk->args.push_back(string((char*)&val, sizeof(val)));
} else {
kk->args.push_back(arg);
}
}
kq.push_back(kk);
}
clFinish(command_queue);
}

View File

@@ -0,0 +1,133 @@
#pragma once
#ifndef __user
#define __user __attribute__(())
#endif
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>
#include <CL/cl.h>
#include "third_party/linux/include/msm_kgsl.h"
using namespace std;
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
namespace json11 {
class Json;
}
class Thneed;
class GPUMalloc {
public:
GPUMalloc(int size, int fd);
~GPUMalloc();
void *alloc(int size);
private:
uint64_t base;
int remaining;
};
class CLQueuedKernel {
public:
CLQueuedKernel(Thneed *lthneed) { thneed = lthneed; }
CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size);
cl_int exec();
void debug_print(bool verbose);
int get_arg_num(const char *search_arg_name);
cl_program program;
string name;
cl_uint num_args;
vector<string> arg_names;
vector<string> arg_types;
vector<string> args;
vector<int> args_size;
cl_kernel kernel = NULL;
json11::Json to_json() const;
cl_uint work_dim;
size_t global_work_size[3] = {0};
size_t local_work_size[3] = {0};
private:
Thneed *thneed;
};
class CachedIoctl {
public:
virtual void exec() {}
};
class CachedSync: public CachedIoctl {
public:
CachedSync(Thneed *lthneed, string ldata) { thneed = lthneed; data = ldata; }
void exec();
private:
Thneed *thneed;
string data;
};
class CachedCommand: public CachedIoctl {
public:
CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd);
void exec();
private:
void disassemble(int cmd_index);
struct kgsl_gpu_command cache;
unique_ptr<kgsl_command_object[]> cmds;
unique_ptr<kgsl_command_object[]> objs;
Thneed *thneed;
vector<shared_ptr<CLQueuedKernel> > kq;
};
class Thneed {
public:
Thneed(bool do_clinit=false, cl_context _context = NULL);
void stop();
void execute(float **finputs, float *foutput, bool slow=false);
void wait();
vector<cl_mem> input_clmem;
vector<void *> inputs;
vector<size_t> input_sizes;
cl_mem output = NULL;
cl_context context = NULL;
cl_command_queue command_queue;
cl_device_id device_id;
int context_id;
// protected?
bool record = false;
int debug;
int timestamp;
#ifdef QCOM2
unique_ptr<GPUMalloc> ram;
vector<unique_ptr<CachedIoctl> > cmds;
int fd;
#endif
// all CL kernels
void copy_inputs(float **finputs, bool internal=false);
void copy_output(float *foutput);
cl_int clexec();
vector<shared_ptr<CLQueuedKernel> > kq;
// pending CL kernels
vector<shared_ptr<CLQueuedKernel> > ckq;
// loading
void load(const char *filename);
private:
void clinit();
};

View File

@@ -0,0 +1,216 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include <cstring>
#include <map>
#include "common/clutil.h"
#include "common/timing.h"
map<pair<cl_kernel, int>, string> g_args;
map<pair<cl_kernel, int>, int> g_args_size;
map<cl_program, string> g_program_source;
void Thneed::stop() {
//printf("Thneed::stop: recorded %lu commands\n", cmds.size());
record = false;
}
void Thneed::clinit() {
device_id = cl_get_device_id(CL_DEVICE_TYPE_DEFAULT);
if (context == NULL) context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, NULL, NULL, &err));
//cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
cl_command_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
command_queue = CL_CHECK_ERR(clCreateCommandQueueWithProperties(context, device_id, props, &err));
printf("Thneed::clinit done\n");
}
cl_int Thneed::clexec() {
if (debug >= 1) printf("Thneed::clexec: running %lu queued kernels\n", kq.size());
for (auto &k : kq) {
if (record) ckq.push_back(k);
cl_int ret = k->exec();
assert(ret == CL_SUCCESS);
}
return clFinish(command_queue);
}
void Thneed::copy_inputs(float **finputs, bool internal) {
for (int idx = 0; idx < inputs.size(); ++idx) {
if (debug >= 1) printf("copying %lu -- %p -> %p (cl %p)\n", input_sizes[idx], finputs[idx], inputs[idx], input_clmem[idx]);
if (internal) {
// if it's internal, using memcpy is fine since the buffer sync is cached in the ioctl layer
if (finputs[idx] != NULL) memcpy(inputs[idx], finputs[idx], input_sizes[idx]);
} else {
if (finputs[idx] != NULL) CL_CHECK(clEnqueueWriteBuffer(command_queue, input_clmem[idx], CL_TRUE, 0, input_sizes[idx], finputs[idx], 0, NULL, NULL));
}
}
}
void Thneed::copy_output(float *foutput) {
if (output != NULL) {
size_t sz;
clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
if (debug >= 1) printf("copying %lu for output %p -> %p\n", sz, output, foutput);
CL_CHECK(clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL));
} else {
printf("CAUTION: model output is NULL, does it have no outputs?\n");
}
}
// *********** CLQueuedKernel ***********
CLQueuedKernel::CLQueuedKernel(Thneed *lthneed,
cl_kernel _kernel,
cl_uint _work_dim,
const size_t *_global_work_size,
const size_t *_local_work_size) {
thneed = lthneed;
kernel = _kernel;
work_dim = _work_dim;
assert(work_dim <= 3);
for (int i = 0; i < work_dim; i++) {
global_work_size[i] = _global_work_size[i];
local_work_size[i] = _local_work_size[i];
}
char _name[0x100];
clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(_name), _name, NULL);
name = string(_name);
clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL);
// get args
for (int i = 0; i < num_args; i++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
args.push_back(g_args[make_pair(kernel, i)]);
args_size.push_back(g_args_size[make_pair(kernel, i)]);
}
// get program
clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program), &program, NULL);
}
int CLQueuedKernel::get_arg_num(const char *search_arg_name) {
for (int i = 0; i < num_args; i++) {
if (arg_names[i] == search_arg_name) return i;
}
printf("failed to find %s in %s\n", search_arg_name, name.c_str());
assert(false);
}
cl_int CLQueuedKernel::exec() {
if (kernel == NULL) {
kernel = clCreateKernel(program, name.c_str(), NULL);
arg_names.clear();
arg_types.clear();
for (int j = 0; j < num_args; j++) {
char arg_name[0x100] = {0};
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL);
arg_names.push_back(string(arg_name));
clGetKernelArgInfo(kernel, j, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_name), arg_name, NULL);
arg_types.push_back(string(arg_name));
cl_int ret;
if (args[j].size() != 0) {
assert(args[j].size() == args_size[j]);
ret = thneed_clSetKernelArg(kernel, j, args[j].size(), args[j].data());
} else {
ret = thneed_clSetKernelArg(kernel, j, args_size[j], NULL);
}
assert(ret == CL_SUCCESS);
}
}
if (thneed->debug >= 1) {
debug_print(thneed->debug >= 2);
}
return clEnqueueNDRangeKernel(thneed->command_queue,
kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL);
}
void CLQueuedKernel::debug_print(bool verbose) {
printf("%p %56s -- ", kernel, name.c_str());
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", global_work_size[i]);
}
printf(" -- ");
for (int i = 0; i < work_dim; i++) {
printf("%4zu ", local_work_size[i]);
}
printf("\n");
if (verbose) {
for (int i = 0; i < num_args; i++) {
string arg = args[i];
printf(" %s %s", arg_types[i].c_str(), arg_names[i].c_str());
void *arg_value = (void*)arg.data();
int arg_size = arg.size();
if (arg_size == 0) {
printf(" (size) %d", args_size[i]);
} else if (arg_size == 1) {
printf(" = %d", *((char*)arg_value));
} else if (arg_size == 2) {
printf(" = %d", *((short*)arg_value));
} else if (arg_size == 4) {
if (arg_types[i] == "float") {
printf(" = %f", *((float*)arg_value));
} else {
printf(" = %d", *((int*)arg_value));
}
} else if (arg_size == 8) {
cl_mem val = (cl_mem)(*((uintptr_t*)arg_value));
printf(" = %p", val);
if (val != NULL) {
cl_mem_object_type obj_type;
clGetMemObjectInfo(val, CL_MEM_TYPE, sizeof(obj_type), &obj_type, NULL);
if (arg_types[i] == "image2d_t" || arg_types[i] == "image1d_t" || obj_type == CL_MEM_OBJECT_IMAGE2D) {
cl_image_format format;
size_t width, height, depth, array_size, row_pitch, slice_pitch;
cl_mem buf;
clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
assert(format.image_channel_order == CL_RGBA);
assert(format.image_channel_data_type == CL_HALF_FLOAT || format.image_channel_data_type == CL_FLOAT);
clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
clGetImageInfo(val, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
clGetImageInfo(val, CL_IMAGE_ARRAY_SIZE, sizeof(array_size), &array_size, NULL);
clGetImageInfo(val, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
assert(depth == 0);
assert(array_size == 0);
assert(slice_pitch == 0);
clGetImageInfo(val, CL_IMAGE_BUFFER, sizeof(buf), &buf, NULL);
size_t sz = 0;
if (buf != NULL) clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" image %zu x %zu rp %zu @ %p buffer %zu", width, height, row_pitch, buf, sz);
} else {
size_t sz;
clGetMemObjectInfo(val, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
printf(" buffer %zu", sz);
}
}
}
printf("\n");
}
}
}
cl_int thneed_clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) {
g_args_size[make_pair(kernel, arg_index)] = arg_size;
if (arg_value != NULL) {
g_args[make_pair(kernel, arg_index)] = string((char*)arg_value, arg_size);
} else {
g_args[make_pair(kernel, arg_index)] = string("");
}
cl_int ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
return ret;
}

View File

@@ -0,0 +1,32 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <cassert>
#include "common/clutil.h"
#include "common/timing.h"
Thneed::Thneed(bool do_clinit, cl_context _context) {
context = _context;
if (do_clinit) clinit();
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs);
// ****** run commands
clexec();
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

View File

@@ -0,0 +1,258 @@
#include "selfdrive/modeld/thneed/thneed.h"
#include <dlfcn.h>
#include <sys/mman.h>
#include <cassert>
#include <cerrno>
#include <cstring>
#include <map>
#include <string>
#include "common/clutil.h"
#include "common/timing.h"
Thneed *g_thneed = NULL;
int g_fd = -1;
void hexdump(uint8_t *d, int len) {
assert((len%4) == 0);
printf(" dumping %p len 0x%x\n", d, len);
for (int i = 0; i < len/4; i++) {
if (i != 0 && (i%0x10) == 0) printf("\n");
printf("%8x ", d[i]);
}
printf("\n");
}
// *********** ioctl interceptor ***********
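// this file defines its own ioctl(), so KGSL calls made by the OpenCL/SNPE stack land here;
// the real libc ioctl is resolved lazily via dlsym(RTLD_NEXT, "ioctl"), and GPU commands are
// captured for later replay while a Thneed object is recording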
extern "C" {
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
#undef ioctl
int ioctl(int filedes, unsigned long request, void *argp) {
request &= 0xFFFFFFFF; // needed on QCOM2
if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
Thneed *thneed = g_thneed;
// save the fd
if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes;
// note that this runs always, even without a thneed object
if (request == IOCTL_KGSL_DRAWCTXT_CREATE) {
struct kgsl_drawctxt_create *create = (struct kgsl_drawctxt_create *)argp;
create->flags &= ~KGSL_CONTEXT_PRIORITY_MASK;
create->flags |= 6 << KGSL_CONTEXT_PRIORITY_SHIFT; // priority from 1-15, 1 is max priority
printf("IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x\n", create->flags);
}
if (thneed != NULL) {
if (request == IOCTL_KGSL_GPU_COMMAND) {
struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp;
if (thneed->record) {
thneed->timestamp = cmd->timestamp;
thneed->context_id = cmd->context_id;
thneed->cmds.push_back(unique_ptr<CachedCommand>(new CachedCommand(thneed, cmd)));
}
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d\n",
thneed->cmds.size(),
cmd->flags,
cmd->context_id, cmd->timestamp, cmd->numcmds, cmd->numobjs);
}
} else if (request == IOCTL_KGSL_GPUOBJ_SYNC) {
struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp;
struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs);
if (thneed->debug >= 2) {
printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count);
for (int i = 0; i < cmd->count; i++) {
printf(" -- offset:0x%lx len:0x%lx id:%d op:%d ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op);
}
printf("\n");
}
if (thneed->record) {
thneed->cmds.push_back(unique_ptr<CachedSync>(new
CachedSync(thneed, string((char *)objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count))));
}
} else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) {
struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp;
if (thneed->debug >= 1) {
printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d\n",
cmd->context_id, cmd->timestamp, cmd->timeout);
}
} else if (request == IOCTL_KGSL_SETPROPERTY) {
if (thneed->debug >= 1) {
struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp;
printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes);
if (thneed->debug >= 2) {
hexdump((uint8_t *)prop->value, prop->sizebytes);
if (prop->type == KGSL_PROP_PWR_CONSTRAINT) {
struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value;
hexdump((uint8_t *)constraint->data, constraint->size);
}
}
}
} else if (request == IOCTL_KGSL_DRAWCTXT_CREATE || request == IOCTL_KGSL_DRAWCTXT_DESTROY) {
// this happens
} else if (request == IOCTL_KGSL_GPUOBJ_ALLOC || request == IOCTL_KGSL_GPUOBJ_FREE) {
// this happens
} else {
if (thneed->debug >= 1) {
printf("other ioctl %lx\n", request);
}
}
}
int ret = my_ioctl(filedes, request, argp);
// NOTE: This error message goes into stdout and messes up pyenv
// if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno);
return ret;
}
}
// *********** GPUMalloc ***********
GPUMalloc::GPUMalloc(int size, int fd) {
struct kgsl_gpuobj_alloc alloc;
memset(&alloc, 0, sizeof(alloc));
alloc.size = size;
alloc.flags = 0x10000a00;
ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);
void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000);
assert(addr != MAP_FAILED);
base = (uint64_t)addr;
remaining = size;
}
GPUMalloc::~GPUMalloc() {
// TODO: free the GPU malloced area
}
void *GPUMalloc::alloc(int size) {
void *ret = (void*)base;
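// round the allocation up to a 0x100-byte boundary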
size = (size+0xff) & (~0xFF);
assert(size <= remaining);
remaining -= size;
base += size;
return ret;
}
// *********** CachedSync, at the ioctl layer ***********
void CachedSync::exec() {
struct kgsl_gpuobj_sync cmd;
cmd.objs = (uint64_t)data.data();
cmd.obj_len = data.length();
cmd.count = data.length() / sizeof(struct kgsl_gpuobj_sync_obj);
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd);
assert(ret == 0);
}
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) {
thneed = lthneed;
assert(cmd->numsyncs == 0);
memcpy(&cache, cmd, sizeof(cache));
if (cmd->numcmds > 0) {
cmds = make_unique<struct kgsl_command_object[]>(cmd->numcmds);
memcpy(cmds.get(), (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*cmd->numcmds);
cache.cmdlist = (uint64_t)cmds.get();
for (int i = 0; i < cmd->numcmds; i++) {
void *nn = thneed->ram->alloc(cmds[i].size);
memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size);
cmds[i].gpuaddr = (uint64_t)nn;
}
}
if (cmd->numobjs > 0) {
objs = make_unique<struct kgsl_command_object[]>(cmd->numobjs);
memcpy(objs.get(), (void *)cmd->objlist, sizeof(struct kgsl_command_object)*cmd->numobjs);
cache.objlist = (uint64_t)objs.get();
for (int i = 0; i < cmd->numobjs; i++) {
void *nn = thneed->ram->alloc(objs[i].size);
memset(nn, 0, objs[i].size);
objs[i].gpuaddr = (uint64_t)nn;
}
}
kq = thneed->ckq;
thneed->ckq.clear();
}
void CachedCommand::exec() {
cache.timestamp = ++thneed->timestamp;
int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache);
if (thneed->debug >= 1) printf("CachedCommand::exec got %d\n", ret);
if (thneed->debug >= 2) {
for (auto &it : kq) {
it->debug_print(false);
}
}
assert(ret == 0);
}
// *********** Thneed ***********
Thneed::Thneed(bool do_clinit, cl_context _context) {
// TODO: QCOM2 actually requires a different context
//context = _context;
if (do_clinit) clinit();
assert(g_fd != -1);
fd = g_fd;
ram = make_unique<GPUMalloc>(0x80000, fd);
timestamp = -1;
g_thneed = this;
char *thneed_debug_env = getenv("THNEED_DEBUG");
debug = (thneed_debug_env != NULL) ? atoi(thneed_debug_env) : 0;
}
void Thneed::wait() {
struct kgsl_device_waittimestamp_ctxtid wait;
wait.context_id = context_id;
wait.timestamp = timestamp;
wait.timeout = -1;
uint64_t tb = nanos_since_boot();
int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait);
uint64_t te = nanos_since_boot();
if (debug >= 1) printf("wait %d after %lu us\n", wret, (te-tb)/1000);
}
void Thneed::execute(float **finputs, float *foutput, bool slow) {
uint64_t tb, te;
if (debug >= 1) tb = nanos_since_boot();
// ****** copy inputs
copy_inputs(finputs, true);
// ****** run commands
int i = 0;
for (auto &it : cmds) {
++i;
if (debug >= 1) printf("run %2d @ %7lu us: ", i, (nanos_since_boot()-tb)/1000);
it->exec();
if ((i == cmds.size()) || slow) wait();
}
// ****** copy outputs
copy_output(foutput);
if (debug >= 1) {
te = nanos_since_boot();
printf("model exec in %lu us\n", (te-tb)/1000);
}
}

View File

@@ -0,0 +1,74 @@
#include "selfdrive/modeld/transforms/loadyuv.h"
#include <cassert>
#include <cstdio>
#include <cstring>
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height) {
memset(s, 0, sizeof(*s));
s->width = width;
s->height = height;
char args[1024];
snprintf(args, sizeof(args),
"-cl-fast-relaxed-math -cl-denorms-are-zero "
"-DTRANSFORMED_WIDTH=%d -DTRANSFORMED_HEIGHT=%d",
width, height);
cl_program prg = cl_program_from_file(ctx, device_id, LOADYUV_PATH, args);
s->loadys_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loadys", &err));
s->loaduv_krnl = CL_CHECK_ERR(clCreateKernel(prg, "loaduv", &err));
s->copy_krnl = CL_CHECK_ERR(clCreateKernel(prg, "copy", &err));
// done with this
CL_CHECK(clReleaseProgram(prg));
}
void loadyuv_destroy(LoadYUVState* s) {
CL_CHECK(clReleaseKernel(s->loadys_krnl));
CL_CHECK(clReleaseKernel(s->loaduv_krnl));
CL_CHECK(clReleaseKernel(s->copy_krnl));
}
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl, bool do_shift) {
cl_int global_out_off = 0;
if (do_shift) {
// shift the image in slot 1 to slot 0, then place the new image in slot 1
global_out_off += (s->width*s->height) + (s->width/2)*(s->height/2)*2;
CL_CHECK(clSetKernelArg(s->copy_krnl, 0, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->copy_krnl, 1, sizeof(cl_int), &global_out_off));
const size_t copy_work_size = global_out_off/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->copy_krnl, 1, NULL,
&copy_work_size, NULL, 0, 0, NULL));
}
CL_CHECK(clSetKernelArg(s->loadys_krnl, 0, sizeof(cl_mem), &y_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loadys_krnl, 2, sizeof(cl_int), &global_out_off));
const size_t loadys_work_size = (s->width*s->height)/8;
CL_CHECK(clEnqueueNDRangeKernel(q, s->loadys_krnl, 1, NULL,
&loadys_work_size, NULL, 0, 0, NULL));
const size_t loaduv_work_size = ((s->width/2)*(s->height/2))/8;
global_out_off += (s->width*s->height);
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &u_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));
global_out_off += (s->width/2)*(s->height/2);
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 0, sizeof(cl_mem), &v_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 1, sizeof(cl_mem), &out_cl));
CL_CHECK(clSetKernelArg(s->loaduv_krnl, 2, sizeof(cl_int), &global_out_off));
CL_CHECK(clEnqueueNDRangeKernel(q, s->loaduv_krnl, 1, NULL,
&loaduv_work_size, NULL, 0, 0, NULL));
}

View File

@@ -0,0 +1,16 @@
#pragma once
#include "common/clutil.h"
typedef struct {
int width, height;
cl_kernel loadys_krnl, loaduv_krnl, copy_krnl;
} LoadYUVState;
void loadyuv_init(LoadYUVState* s, cl_context ctx, cl_device_id device_id, int width, int height);
void loadyuv_destroy(LoadYUVState* s);
void loadyuv_queue(LoadYUVState* s, cl_command_queue q,
cl_mem y_cl, cl_mem u_cl, cl_mem v_cl,
cl_mem out_cl, bool do_shift = false);

View File

@@ -0,0 +1,97 @@
#include "selfdrive/modeld/transforms/transform.h"
#include <cassert>
#include <cstring>
#include "common/clutil.h"
void transform_init(Transform* s, cl_context ctx, cl_device_id device_id) {
memset(s, 0, sizeof(*s));
cl_program prg = cl_program_from_file(ctx, device_id, TRANSFORM_PATH, "");
s->krnl = CL_CHECK_ERR(clCreateKernel(prg, "warpPerspective", &err));
// done with this
CL_CHECK(clReleaseProgram(prg));
s->m_y_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
s->m_uv_cl = CL_CHECK_ERR(clCreateBuffer(ctx, CL_MEM_READ_WRITE, 3*3*sizeof(float), NULL, &err));
}
void transform_destroy(Transform* s) {
CL_CHECK(clReleaseMemObject(s->m_y_cl));
CL_CHECK(clReleaseMemObject(s->m_uv_cl));
CL_CHECK(clReleaseKernel(s->krnl));
}
void transform_queue(Transform* s,
cl_command_queue q,
cl_mem in_yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
cl_mem out_y, cl_mem out_u, cl_mem out_v,
int out_width, int out_height,
const mat3& projection) {
const int zero = 0;
// sampled using pixel center origin
// (because that's how fastcv and opencv do it)
mat3 projection_y = projection;
// in and out uv is half the size of y.
mat3 projection_uv = transform_scale_buffer(projection, 0.5);
CL_CHECK(clEnqueueWriteBuffer(q, s->m_y_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_y.v, 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(q, s->m_uv_cl, CL_TRUE, 0, 3*3*sizeof(float), (void*)projection_uv.v, 0, NULL, NULL));
const int in_y_width = in_width;
const int in_y_height = in_height;
const int in_y_px_stride = 1;
const int in_uv_width = in_width/2;
const int in_uv_height = in_height/2;
const int in_uv_px_stride = 2;
const int in_u_offset = in_uv_offset;
const int in_v_offset = in_uv_offset + 1;
const int out_y_width = out_width;
const int out_y_height = out_height;
const int out_uv_width = out_width/2;
const int out_uv_height = out_height/2;
CL_CHECK(clSetKernelArg(s->krnl, 0, sizeof(cl_mem), &in_yuv)); // src
CL_CHECK(clSetKernelArg(s->krnl, 1, sizeof(cl_int), &in_stride)); // src_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_y_px_stride)); // src_px_stride
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &zero)); // src_offset
CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_y_height)); // src_rows
CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_y_width)); // src_cols
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_y)); // dst
CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_y_width)); // dst_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset
CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_y_height)); // dst_rows
CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_y_width)); // dst_cols
CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_y_cl)); // M
const size_t work_size_y[2] = {(size_t)out_y_width, (size_t)out_y_height};
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_y, NULL, 0, 0, NULL));
const size_t work_size_uv[2] = {(size_t)out_uv_width, (size_t)out_uv_height};
CL_CHECK(clSetKernelArg(s->krnl, 2, sizeof(cl_int), &in_uv_px_stride)); // src_px_stride
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_u_offset)); // src_offset
CL_CHECK(clSetKernelArg(s->krnl, 4, sizeof(cl_int), &in_uv_height)); // src_rows
CL_CHECK(clSetKernelArg(s->krnl, 5, sizeof(cl_int), &in_uv_width)); // src_cols
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_u)); // dst
CL_CHECK(clSetKernelArg(s->krnl, 7, sizeof(cl_int), &out_uv_width)); // dst_row_stride
CL_CHECK(clSetKernelArg(s->krnl, 8, sizeof(cl_int), &zero)); // dst_offset
CL_CHECK(clSetKernelArg(s->krnl, 9, sizeof(cl_int), &out_uv_height)); // dst_rows
CL_CHECK(clSetKernelArg(s->krnl, 10, sizeof(cl_int), &out_uv_width)); // dst_cols
CL_CHECK(clSetKernelArg(s->krnl, 11, sizeof(cl_mem), &s->m_uv_cl)); // M
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_uv, NULL, 0, 0, NULL));
CL_CHECK(clSetKernelArg(s->krnl, 3, sizeof(cl_int), &in_v_offset)); // src_offset
CL_CHECK(clSetKernelArg(s->krnl, 6, sizeof(cl_mem), &out_v)); // dst
CL_CHECK(clEnqueueNDRangeKernel(q, s->krnl, 2, NULL,
(const size_t*)&work_size_uv, NULL, 0, 0, NULL));
}

View File

@@ -0,0 +1,25 @@
#pragma once
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include "common/mat.h"
typedef struct {
cl_kernel krnl;
cl_mem m_y_cl, m_uv_cl;
} Transform;
void transform_init(Transform* s, cl_context ctx, cl_device_id device_id);
void transform_destroy(Transform* transform);
void transform_queue(Transform* s, cl_command_queue q,
cl_mem yuv, int in_width, int in_height, int in_stride, int in_uv_offset,
cl_mem out_y, cl_mem out_u, cl_mem out_v,
int out_width, int out_height,
const mat3& projection);
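
For context, a minimal sketch of how the two stages declared above are typically chained: warp the full-resolution NV12 frame to model resolution, then pack it into the network input buffer. In openpilot this wiring lives in models/commonmodel.cc (not part of this hunk); the buffer handles, frame geometry, and model dimensions below are placeholders, and in real use the init/destroy calls happen once rather than per frame.

#include "selfdrive/modeld/transforms/transform.h"
#include "selfdrive/modeld/transforms/loadyuv.h"

void prepare_frame(cl_context ctx, cl_device_id device_id, cl_command_queue q,
                   cl_mem frame_yuv, int frame_w, int frame_h, int frame_stride, int frame_uv_offset,
                   cl_mem y_cl, cl_mem u_cl, cl_mem v_cl, cl_mem net_input_cl,
                   const mat3 &projection, int model_w, int model_h) {
  Transform transform;
  LoadYUVState loadyuv;
  transform_init(&transform, ctx, device_id);
  loadyuv_init(&loadyuv, ctx, device_id, model_w, model_h);

  // warp the camera frame into model-sized Y/U/V planes
  transform_queue(&transform, q, frame_yuv, frame_w, frame_h, frame_stride, frame_uv_offset,
                  y_cl, u_cl, v_cl, model_w, model_h, projection);
  // pack the planes into the input tensor; do_shift=true moves the previous frame into slot 0
  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl, true);

  transform_destroy(&transform);
  loadyuv_destroy(&loadyuv);
}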