openpilot v0.9.6 release

date: 2024-01-12T10:13:37
master commit: ba792d576a49a0899b88a753fa1c52956bedf9e6
commit 08e9fb1edc by FrogAi, 2024-01-12 22:39:28 -07:00
1881 changed files with 653708 additions and 0 deletions

selfdrive/modeld/runners/__init__.py

@@ -0,0 +1,27 @@
import os

from openpilot.system.hardware import TICI
from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel, Runtime
assert Runtime

USE_THNEED = int(os.getenv('USE_THNEED', str(int(TICI))))
USE_SNPE = int(os.getenv('USE_SNPE', str(int(TICI))))

class ModelRunner(RunModel):
  THNEED = 'THNEED'
  SNPE = 'SNPE'
  ONNX = 'ONNX'

  def __new__(cls, paths, *args, **kwargs):
    if ModelRunner.THNEED in paths and USE_THNEED:
      from openpilot.selfdrive.modeld.runners.thneedmodel_pyx import ThneedModel as Runner
      runner_type = ModelRunner.THNEED
    elif ModelRunner.SNPE in paths and USE_SNPE:
      from openpilot.selfdrive.modeld.runners.snpemodel_pyx import SNPEModel as Runner
      runner_type = ModelRunner.SNPE
    elif ModelRunner.ONNX in paths:
      from openpilot.selfdrive.modeld.runners.onnxmodel import ONNXModel as Runner
      runner_type = ModelRunner.ONNX
    else:
      raise Exception("Couldn't select a model runner, make sure to pass at least one valid model path")
    return Runner(str(paths[runner_type]), *args, **kwargs)
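For reference, a caller hands ModelRunner a dict of candidate model paths keyed by runner type and gets back whichever backend is enabled; on PC (non-TICI) both USE_THNEED and USE_SNPE default to 0, so the ONNX path is taken. A minimal sketch assuming the ONNX fallback — the paths and output size below are hypothetical, and None is fine for the CL context since the ONNX runner ignores it:

import numpy as np
from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime

MODEL_PATHS = {ModelRunner.THNEED: '/path/to/supercombo.thneed',  # hypothetical paths
               ModelRunner.ONNX: '/path/to/supercombo.onnx'}
OUTPUT_SIZE = 6504  # hypothetical; the real size comes from the model definition

output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
model = ModelRunner(MODEL_PATHS, output, Runtime.CPU, False, None)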

selfdrive/modeld/runners/onnxmodel.py

@@ -0,0 +1,93 @@
import onnx
import itertools
import os
import sys
import numpy as np
from typing import Tuple, Dict, Union, Any

from openpilot.selfdrive.modeld.runners.runmodel_pyx import RunModel

ORT_TYPES_TO_NP_TYPES = {'tensor(float16)': np.float16, 'tensor(float)': np.float32, 'tensor(uint8)': np.uint8}

def attributeproto_fp16_to_fp32(attr):
  float32_list = np.frombuffer(attr.raw_data, dtype=np.float16)
  attr.data_type = 1
  attr.raw_data = float32_list.astype(np.float32).tobytes()

def convert_fp16_to_fp32(path):
  model = onnx.load(path)
  for i in model.graph.initializer:
    if i.data_type == 10:
      attributeproto_fp16_to_fp32(i)
  for i in itertools.chain(model.graph.input, model.graph.output):
    if i.type.tensor_type.elem_type == 10:
      i.type.tensor_type.elem_type = 1
  for i in model.graph.node:
    for a in i.attribute:
      if hasattr(a, 't'):
        if a.t.data_type == 10:
          attributeproto_fp16_to_fp32(a.t)
  return model.SerializeToString()

def create_ort_session(path, fp16_to_fp32):
  os.environ["OMP_NUM_THREADS"] = "4"
  os.environ["OMP_WAIT_POLICY"] = "PASSIVE"

  import onnxruntime as ort
  print("Onnx available providers: ", ort.get_available_providers(), file=sys.stderr)
  options = ort.SessionOptions()
  options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL

  provider: Union[str, Tuple[str, Dict[Any, Any]]]
  if 'OpenVINOExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
    provider = 'OpenVINOExecutionProvider'
  elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
    options.intra_op_num_threads = 2
    provider = ('CUDAExecutionProvider', {'cudnn_conv_algo_search': 'DEFAULT'})
  else:
    options.intra_op_num_threads = 2
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    provider = 'CPUExecutionProvider'

  model_data = convert_fp16_to_fp32(path) if fp16_to_fp32 else path
  print("Onnx selected provider: ", [provider], file=sys.stderr)
  ort_session = ort.InferenceSession(model_data, options, providers=[provider])
  print("Onnx using ", ort_session.get_providers(), file=sys.stderr)
  return ort_session

class ONNXModel(RunModel):
  def __init__(self, path, output, runtime, use_tf8, cl_context):
    self.inputs = {}
    self.output = output
    self.use_tf8 = use_tf8

    self.session = create_ort_session(path, fp16_to_fp32=True)
    self.input_names = [x.name for x in self.session.get_inputs()]
    self.input_shapes = {x.name: [1, *x.shape[1:]] for x in self.session.get_inputs()}
    self.input_dtypes = {x.name: ORT_TYPES_TO_NP_TYPES[x.type] for x in self.session.get_inputs()}

    # run once to initialize CUDA provider
    if "CUDAExecutionProvider" in self.session.get_providers():
      self.session.run(None, {k: np.zeros(self.input_shapes[k], dtype=self.input_dtypes[k]) for k in self.input_names})
    print("ready to run onnx model", self.input_shapes, file=sys.stderr)

  def addInput(self, name, buffer):
    assert name in self.input_names
    self.inputs[name] = buffer

  def setInputBuffer(self, name, buffer):
    assert name in self.inputs
    self.inputs[name] = buffer

  def getCLBuffer(self, name):
    return None

  def execute(self):
    inputs = {k: (v.view(np.uint8) / 255. if self.use_tf8 and k == 'input_img' else v) for k, v in self.inputs.items()}
    inputs = {k: v.reshape(self.input_shapes[k]).astype(self.input_dtypes[k]) for k, v in inputs.items()}
    outputs = self.session.run(None, inputs)
    assert len(outputs) == 1, "Only single model outputs are supported"
    self.output[:] = outputs[0]
    return self.output
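One detail worth calling out: when use_tf8 is set, execute() reinterprets the raw bytes of the 'input_img' float buffer as uint8 and rescales them to [0, 1] before inference. A standalone illustration of that view-and-scale trick:

import numpy as np

buf = np.zeros(2, dtype=np.float32)                     # 8 bytes of storage
buf.view(np.uint8)[:] = [0, 64, 128, 255, 1, 2, 3, 4]   # the bytes hold uint8 pixels
scaled = buf.view(np.uint8) / 255.                      # what execute() feeds 'input_img'
print(scaled)  # approx [0. 0.251 0.502 1. 0.0039 0.0078 0.0118 0.0157]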

selfdrive/modeld/runners/run.h

@@ -0,0 +1,4 @@
#pragma once

#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/runners/snpemodel.h"

selfdrive/modeld/runners/runmodel.h

@@ -0,0 +1,49 @@
#pragma once

#include <string>
#include <vector>
#include <memory>
#include <cassert>

#include "common/clutil.h"
#include "common/swaglog.h"

#define USE_CPU_RUNTIME 0
#define USE_GPU_RUNTIME 1
#define USE_DSP_RUNTIME 2

struct ModelInput {
  const std::string name;
  float *buffer;
  int size;

  ModelInput(const std::string _name, float *_buffer, int _size) : name(_name), buffer(_buffer), size(_size) {}
  virtual void setBuffer(float *_buffer, int _size) {
    assert(size == _size || size == 0);
    buffer = _buffer;
    size = _size;
  }
};

class RunModel {
public:
  std::vector<std::unique_ptr<ModelInput>> inputs;

  virtual ~RunModel() {}
  virtual void execute() {}
  virtual void* getCLBuffer(const std::string name) { return nullptr; }
  virtual void addInput(const std::string name, float *buffer, int size) {
    inputs.push_back(std::unique_ptr<ModelInput>(new ModelInput(name, buffer, size)));
  }
  virtual void setInputBuffer(const std::string name, float *buffer, int size) {
    for (auto &input : inputs) {
      if (name == input->name) {
        input->setBuffer(buffer, size);
        return;
      }
    }
    LOGE("Tried to update input `%s` but no input with this name exists", name.c_str());
    assert(false);
  }
};

selfdrive/modeld/runners/runmodel.pxd

@@ -0,0 +1,14 @@
# distutils: language = c++

from libcpp.string cimport string

cdef extern from "selfdrive/modeld/runners/runmodel.h":
  cdef int USE_CPU_RUNTIME
  cdef int USE_GPU_RUNTIME
  cdef int USE_DSP_RUNTIME

  cdef cppclass RunModel:
    void addInput(string, float*, int)
    void setInputBuffer(string, float*, int)
    void * getCLBuffer(string)
    void execute()

selfdrive/modeld/runners/runmodel_pyx.pxd

@@ -0,0 +1,6 @@
# distutils: language = c++

from .runmodel cimport RunModel as cppRunModel

cdef class RunModel:
  cdef cppRunModel * model

selfdrive/modeld/runners/runmodel_pyx.pyx

@@ -0,0 +1,38 @@
# distutils: language = c++
# cython: c_string_encoding=ascii

from libcpp.string cimport string
from libc.string cimport memcpy

from .runmodel cimport USE_CPU_RUNTIME, USE_GPU_RUNTIME, USE_DSP_RUNTIME
from selfdrive.modeld.models.commonmodel_pyx cimport CLMem

class Runtime:
  CPU = USE_CPU_RUNTIME
  GPU = USE_GPU_RUNTIME
  DSP = USE_DSP_RUNTIME

cdef class RunModel:
  def __dealloc__(self):
    del self.model

  def addInput(self, string name, float[:] buffer):
    if buffer is not None:
      self.model.addInput(name, &buffer[0], len(buffer))
    else:
      self.model.addInput(name, NULL, 0)

  def setInputBuffer(self, string name, float[:] buffer):
    if buffer is not None:
      self.model.setInputBuffer(name, &buffer[0], len(buffer))
    else:
      self.model.setInputBuffer(name, NULL, 0)

  def getCLBuffer(self, string name):
    cdef void * cl_buf = self.model.getCLBuffer(name)
    if not cl_buf:
      return None
    return CLMem.create(cl_buf)

  def execute(self):
    self.model.execute()
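Taken together, every backend is driven through this same surface: register each input once at startup, update or repoint buffers per frame, then execute. A hedged sketch continuing the ModelRunner example above — the input name and size are made up for illustration:

import numpy as np

desire = np.zeros(8, dtype=np.float32)    # hypothetical input name and size
model.addInput("desire", desire)          # register once at startup

for _ in range(20):                       # per-frame loop
  desire[:] = 0.0                         # mutate in place, or...
  model.setInputBuffer("desire", desire)  # ...repoint to a fresh buffer
  model.execute()                         # results land in `output` from the earlier sketch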

selfdrive/modeld/runners/snpemodel.cc

@@ -0,0 +1,116 @@
#pragma clang diagnostic ignored "-Wexceptions"

#include "selfdrive/modeld/runners/snpemodel.h"

#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "common/util.h"
#include "common/timing.h"

void PrintErrorStringAndExit() {
  std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
  std::exit(EXIT_FAILURE);
}

SNPEModel::SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool _use_tf8, cl_context context) {
  output = _output;
  output_size = _output_size;
  use_tf8 = _use_tf8;

#ifdef QCOM2
  if (runtime == USE_GPU_RUNTIME) {
    snpe_runtime = zdl::DlSystem::Runtime_t::GPU;
  } else if (runtime == USE_DSP_RUNTIME) {
    snpe_runtime = zdl::DlSystem::Runtime_t::DSP;
  } else {
    snpe_runtime = zdl::DlSystem::Runtime_t::CPU;
  }
  assert(zdl::SNPE::SNPEFactory::isRuntimeAvailable(snpe_runtime));
#endif

  model_data = util::read_file(path);
  assert(model_data.size() > 0);

  // load model
  std::unique_ptr<zdl::DlContainer::IDlContainer> container = zdl::DlContainer::IDlContainer::open((uint8_t*)model_data.data(), model_data.size());
  if (!container) { PrintErrorStringAndExit(); }
  LOGW("loaded model with size: %lu", model_data.size());

  // create model runner
  zdl::SNPE::SNPEBuilder snpe_builder(container.get());
  while (!snpe) {
#ifdef QCOM2
    snpe = snpe_builder.setOutputLayers({})
                       .setRuntimeProcessor(snpe_runtime)
                       .setUseUserSuppliedBuffers(true)
                       .setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
                       .build();
#else
    snpe = snpe_builder.setOutputLayers({})
                       .setUseUserSuppliedBuffers(true)
                       .setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::HIGH_PERFORMANCE)
                       .build();
#endif
    if (!snpe) std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
  }

  // create output buffer
  zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
  zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();

  const auto &output_tensor_names_opt = snpe->getOutputTensorNames();
  if (!output_tensor_names_opt) throw std::runtime_error("Error obtaining output tensor names");
  const auto &output_tensor_names = *output_tensor_names_opt;
  assert(output_tensor_names.size() == 1);
  const char *output_tensor_name = output_tensor_names.at(0);

  const zdl::DlSystem::TensorShape &buffer_shape = snpe->getInputOutputBufferAttributes(output_tensor_name)->getDims();
  if (output_size != 0) {
    assert(output_size == buffer_shape[1]);
  } else {
    output_size = buffer_shape[1];
  }
  std::vector<size_t> output_strides = {output_size * sizeof(float), sizeof(float)};
  output_buffer = ub_factory.createUserBuffer(output, output_size * sizeof(float), output_strides, &ub_encoding_float);
  output_map.add(output_tensor_name, output_buffer.get());
}

void SNPEModel::addInput(const std::string name, float *buffer, int size) {
  const int idx = inputs.size();

  const auto &input_tensor_names_opt = snpe->getInputTensorNames();
  if (!input_tensor_names_opt) throw std::runtime_error("Error obtaining input tensor names");
  const auto &input_tensor_names = *input_tensor_names_opt;
  const char *input_tensor_name = input_tensor_names.at(idx);
  const bool input_tf8 = use_tf8 && strcmp(input_tensor_name, "input_img") == 0;  // TODO: This is a terrible hack, get rid of this name check both here and in onnx_runner.py
  LOGW("adding index %d: %s", idx, input_tensor_name);

  zdl::DlSystem::UserBufferEncodingFloat ub_encoding_float;
  zdl::DlSystem::UserBufferEncodingTf8 ub_encoding_tf8(0, 1./255);  // network takes 0-1
  zdl::DlSystem::IUserBufferFactory &ub_factory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
  zdl::DlSystem::UserBufferEncoding *input_encoding = input_tf8 ? (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_tf8 : (zdl::DlSystem::UserBufferEncoding*)&ub_encoding_float;

  const auto &buffer_shape_opt = snpe->getInputDimensions(input_tensor_name);
  const zdl::DlSystem::TensorShape &buffer_shape = *buffer_shape_opt;
  size_t size_of_input = input_tf8 ? sizeof(uint8_t) : sizeof(float);
  std::vector<size_t> strides(buffer_shape.rank());
  strides[strides.size() - 1] = size_of_input;
  size_t product = 1;
  for (size_t i = 0; i < buffer_shape.rank(); i++) product *= buffer_shape[i];
  size_t stride = strides[strides.size() - 1];
  for (size_t i = buffer_shape.rank() - 1; i > 0; i--) {
    stride *= buffer_shape[i];
    strides[i-1] = stride;
  }

  auto input_buffer = ub_factory.createUserBuffer(buffer, product*size_of_input, strides, input_encoding);
  input_map.add(input_tensor_name, input_buffer.get());
  inputs.push_back(std::unique_ptr<SNPEModelInput>(new SNPEModelInput(name, buffer, size, std::move(input_buffer))));
}

void SNPEModel::execute() {
  if (!snpe->execute(input_map, output_map)) {
    PrintErrorStringAndExit();
  }
}
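The stride setup in addInput() is the standard row-major computation: the innermost stride is the element size, each outer stride is the product of all inner dimensions, and the total buffer size is the product of every dimension times the element size. The same computation in a few lines of Python (the shape below is hypothetical):

def row_major_strides(shape, itemsize):
  # walk back-to-front, exactly like the loop in SNPEModel::addInput
  strides = [itemsize] * len(shape)
  for i in range(len(shape) - 1, 0, -1):
    strides[i - 1] = strides[i] * shape[i]
  return strides

print(row_major_strides([1, 128, 256, 3], 4))  # [393216, 3072, 12, 4]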

selfdrive/modeld/runners/snpemodel.h

@@ -0,0 +1,52 @@
#pragma once

#pragma clang diagnostic ignored "-Wdeprecated-declarations"

#include <memory>
#include <string>
#include <utility>

#include <DlContainer/IDlContainer.hpp>
#include <DlSystem/DlError.hpp>
#include <DlSystem/ITensor.hpp>
#include <DlSystem/ITensorFactory.hpp>
#include <DlSystem/IUserBuffer.hpp>
#include <DlSystem/IUserBufferFactory.hpp>
#include <SNPE/SNPE.hpp>
#include <SNPE/SNPEBuilder.hpp>
#include <SNPE/SNPEFactory.hpp>

#include "selfdrive/modeld/runners/runmodel.h"

struct SNPEModelInput : public ModelInput {
  std::unique_ptr<zdl::DlSystem::IUserBuffer> snpe_buffer;

  SNPEModelInput(const std::string _name, float *_buffer, int _size, std::unique_ptr<zdl::DlSystem::IUserBuffer> _snpe_buffer) : ModelInput(_name, _buffer, _size), snpe_buffer(std::move(_snpe_buffer)) {}
  void setBuffer(float *_buffer, int _size) {
    ModelInput::setBuffer(_buffer, _size);
    assert(snpe_buffer->setBufferAddress(_buffer) == true);
  }
};

class SNPEModel : public RunModel {
public:
  SNPEModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
  void addInput(const std::string name, float *buffer, int size);
  void execute();

private:
  std::string model_data;

#ifdef QCOM2
  zdl::DlSystem::Runtime_t snpe_runtime;
#endif

  // snpe model stuff
  std::unique_ptr<zdl::SNPE::SNPE> snpe;
  zdl::DlSystem::UserBufferMap input_map;
  zdl::DlSystem::UserBufferMap output_map;
  std::unique_ptr<zdl::DlSystem::IUserBuffer> output_buffer;

  bool use_tf8;
  float *output;
  size_t output_size;
};

selfdrive/modeld/runners/snpemodel.pxd

@@ -0,0 +1,9 @@
# distutils: language = c++

from libcpp.string cimport string
from cereal.visionipc.visionipc cimport cl_context

cdef extern from "selfdrive/modeld/runners/snpemodel.h":
  cdef cppclass SNPEModel:
    SNPEModel(string, float*, size_t, int, bool, cl_context)

selfdrive/modeld/runners/snpemodel_pyx.pyx

@@ -0,0 +1,17 @@
# distutils: language = c++
# cython: c_string_encoding=ascii

import os
from libcpp cimport bool
from libcpp.string cimport string

from .snpemodel cimport SNPEModel as cppSNPEModel
from selfdrive.modeld.models.commonmodel_pyx cimport CLContext
from selfdrive.modeld.runners.runmodel_pyx cimport RunModel
from selfdrive.modeld.runners.runmodel cimport RunModel as cppRunModel

os.environ['ADSP_LIBRARY_PATH'] = "/data/pythonpath/third_party/snpe/dsp/"

cdef class SNPEModel(RunModel):
  def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
    self.model = <cppRunModel *> new cppSNPEModel(path, &output[0], len(output), runtime, use_tf8, context.context)

selfdrive/modeld/runners/thneedmodel.cc

@@ -0,0 +1,58 @@
#include "selfdrive/modeld/runners/thneedmodel.h"

#include <string>

#include "common/swaglog.h"

ThneedModel::ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool luse_tf8, cl_context context) {
  thneed = new Thneed(true, context);
  thneed->load(path.c_str());
  thneed->clexec();

  recorded = false;
  output = _output;
}

void* ThneedModel::getCLBuffer(const std::string name) {
  int index = -1;
  for (int i = 0; i < inputs.size(); i++) {
    if (name == inputs[i]->name) {
      index = i;
      break;
    }
  }

  if (index == -1) {
    LOGE("Tried to get CL buffer for input `%s` but no input with this name exists", name.c_str());
    assert(false);
  }

  if (thneed->input_clmem.size() >= inputs.size()) {
    return &thneed->input_clmem[inputs.size() - index - 1];
  } else {
    return nullptr;
  }
}

void ThneedModel::execute() {
  if (!recorded) {
    thneed->record = true;
    float *input_buffers[inputs.size()];
    for (int i = 0; i < inputs.size(); i++) {
      input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
    }

    thneed->copy_inputs(input_buffers);
    thneed->clexec();
    thneed->copy_output(output);
    thneed->stop();

    recorded = true;
  } else {
    float *input_buffers[inputs.size()];
    for (int i = 0; i < inputs.size(); i++) {
      input_buffers[inputs.size() - i - 1] = inputs[i]->buffer;
    }

    thneed->execute(input_buffers, output);
  }
}
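Two behaviors here are easy to miss: the first execute() records the OpenCL command queue and every later call replays it, and thneed consumes input buffers in reverse registration order (input_buffers[inputs.size() - i - 1] = inputs[i]->buffer). A small sketch of that reversal, with hypothetical input names:

registered = ["input_imgs", "big_input_imgs", "desire", "traffic_convention"]  # hypothetical order
# input_buffers[inputs.size() - i - 1] = inputs[i]->buffer  ==  reversed registration order
thneed_order = list(reversed(registered))
print(thneed_order)  # ['traffic_convention', 'desire', 'big_input_imgs', 'input_imgs']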

selfdrive/modeld/runners/thneedmodel.h

@@ -0,0 +1,17 @@
#pragma once

#include <string>

#include "selfdrive/modeld/runners/runmodel.h"
#include "selfdrive/modeld/thneed/thneed.h"

class ThneedModel : public RunModel {
public:
  ThneedModel(const std::string path, float *_output, size_t _output_size, int runtime, bool use_tf8 = false, cl_context context = NULL);
  void *getCLBuffer(const std::string name);
  void execute();

private:
  Thneed *thneed = NULL;
  bool recorded;
  float *output;
};

selfdrive/modeld/runners/thneedmodel.pxd

@@ -0,0 +1,9 @@
# distutils: language = c++

from libcpp.string cimport string
from cereal.visionipc.visionipc cimport cl_context

cdef extern from "selfdrive/modeld/runners/thneedmodel.h":
  cdef cppclass ThneedModel:
    ThneedModel(string, float*, size_t, int, bool, cl_context)

selfdrive/modeld/runners/thneedmodel_pyx.pyx

@@ -0,0 +1,14 @@
# distutils: language = c++
# cython: c_string_encoding=ascii

from libcpp cimport bool
from libcpp.string cimport string

from .thneedmodel cimport ThneedModel as cppThneedModel
from selfdrive.modeld.models.commonmodel_pyx cimport CLContext
from selfdrive.modeld.runners.runmodel_pyx cimport RunModel
from selfdrive.modeld.runners.runmodel cimport RunModel as cppRunModel

cdef class ThneedModel(RunModel):
  def __cinit__(self, string path, float[:] output, int runtime, bool use_tf8, CLContext context):
    self.model = <cppRunModel *> new cppThneedModel(path, &output[0], len(output), runtime, use_tf8, context.context)