[Backend & Serving] Serving and Runtime support Clone (PaddlePaddle#464)
* Add Clone support to Serving and Runtime

* Support the TRT, OpenVINO and Paddle backends

Co-authored-by: Jason <[email protected]>
heliqi and jiangjiajun authored Nov 4, 2022
1 parent 61634ca commit 277bec3
Showing 13 changed files with 341 additions and 148 deletions.
6 changes: 6 additions & 0 deletions fastdeploy/backends/backend.h
@@ -21,6 +21,7 @@

#include "fastdeploy/backends/common/multiclass_nms.h"
#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/core/fd_type.h"

namespace fastdeploy {

@@ -63,6 +64,11 @@ class BaseBackend {
virtual std::vector<TensorInfo> GetOutputInfos() = 0;
virtual bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) = 0;
virtual std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) {
FDERROR << "Clone no support" << std::endl;
return nullptr;
}
};

} // namespace fastdeploy
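
The new Clone() hook above lets a caller spin up additional inference contexts without re-loading the model. The following is a minimal usage sketch, not part of this commit; the helper name CloneForWorkers is a hypothetical illustration of how a serving layer might fan a loaded backend out to several worker threads.

#include <memory>
#include <utility>
#include <vector>

#include "fastdeploy/backends/backend.h"

namespace fastdeploy {

// Clone an already-initialized backend once per worker. Passing
// stream = nullptr and device_id = -1 keeps each clone on the same device,
// so backends that support it can share weights/engines with the original.
std::vector<std::unique_ptr<BaseBackend>> CloneForWorkers(BaseBackend* backend,
                                                          int num_workers) {
  std::vector<std::unique_ptr<BaseBackend>> workers;
  for (int i = 0; i < num_workers; ++i) {
    auto cloned = backend->Clone(/*stream=*/nullptr, /*device_id=*/-1);
    if (cloned) {  // backends without Clone support return nullptr
      workers.push_back(std::move(cloned));
    }
  }
  return workers;
}

}  // namespace fastdeploy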
45 changes: 37 additions & 8 deletions fastdeploy/backends/openvino/ov_backend.cc
@@ -74,6 +74,8 @@ ov::element::Type FDDataTypeToOV(const FDDataType& type) {
return ov::element::f32;
}

ov::Core OpenVINOBackend::core_;

void OpenVINOBackend::InitTensorInfo(
const std::vector<ov::Output<ov::Node>>& ov_outputs,
std::map<std::string, TensorInfo>* tensor_infos) {
@@ -96,10 +98,6 @@ bool OpenVINOBackend::InitFromPaddle(const std::string& model_file,
return false;
}
option_ = option;
ov::AnyMap properties;
if (option_.cpu_thread_num > 0) {
properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
}

std::shared_ptr<ov::Model> model = core_.read_model(model_file, params_file);

@@ -149,7 +147,19 @@ bool OpenVINOBackend::InitFromPaddle(const std::string& model_file,
output_infos_.push_back(iter->second);
}

ov::AnyMap properties;
if (option_.cpu_thread_num > 0) {
properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
}
if (option_.ov_num_streams == -1) {
properties["NUM_STREAMS"] = ov::streams::AUTO;
} else if (option_.ov_num_streams == -2) {
properties["NUM_STREAMS"] = ov::streams::NUMA;
} else if (option_.ov_num_streams > 0) {
properties["NUM_STREAMS"] = option_.ov_num_streams;
}
compiled_model_ = core_.compile_model(model, "CPU", properties);

request_ = compiled_model_.create_infer_request();
initialized_ = true;
return true;
@@ -185,10 +195,6 @@ bool OpenVINOBackend::InitFromOnnx(const std::string& model_file,
return false;
}
option_ = option;
ov::AnyMap properties;
if (option_.cpu_thread_num > 0) {
properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
}

std::shared_ptr<ov::Model> model = core_.read_model(model_file);

@@ -238,8 +244,21 @@ bool OpenVINOBackend::InitFromOnnx(const std::string& model_file,
output_infos_.push_back(iter->second);
}

ov::AnyMap properties;
if (option_.cpu_thread_num > 0) {
properties["INFERENCE_NUM_THREADS"] = option_.cpu_thread_num;
}
if (option_.ov_num_streams == -1) {
properties["NUM_STREAMS"] = ov::streams::AUTO;
} else if (option_.ov_num_streams == -2) {
properties["NUM_STREAMS"] = ov::streams::NUMA;
} else if (option_.ov_num_streams > 0) {
properties["NUM_STREAMS"] = option_.ov_num_streams;
}
compiled_model_ = core_.compile_model(model, "CPU", properties);

request_ = compiled_model_.create_infer_request();

initialized_ = true;
return true;
}
@@ -281,4 +300,14 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
return true;
}

std::unique_ptr<BaseBackend> OpenVINOBackend::Clone(void *stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend = utils::make_unique<OpenVINOBackend>();
auto casted_backend = dynamic_cast<OpenVINOBackend*>(new_backend.get());
casted_backend->option_ = option_;
casted_backend->request_ = compiled_model_.create_infer_request();
casted_backend->input_infos_.assign(input_infos_.begin(), input_infos_.end());
casted_backend->output_infos_.assign(output_infos_.begin(), output_infos_.end());
return new_backend;
}

} // namespace fastdeploy
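
Both InitFromPaddle and InitFromOnnx now translate the ov_num_streams option into OpenVINO's NUM_STREAMS property the same way. The helper below only illustrates that mapping (ApplyStreamOption is not a function in this commit), under the assumption that -1 means AUTO, -2 means NUMA, and positive values request an explicit stream count, as the code above does.

#include "openvino/openvino.hpp"

// Mirror of the NUM_STREAMS handling in OpenVINOBackend::InitFrom*().
static void ApplyStreamOption(int ov_num_streams, ov::AnyMap* properties) {
  if (ov_num_streams == -1) {
    (*properties)["NUM_STREAMS"] = ov::streams::AUTO;  // let OpenVINO pick
  } else if (ov_num_streams == -2) {
    (*properties)["NUM_STREAMS"] = ov::streams::NUMA;  // one stream per NUMA node
  } else if (ov_num_streams > 0) {
    (*properties)["NUM_STREAMS"] = ov_num_streams;     // explicit stream count
  }
  // Any other value leaves the OpenVINO default unchanged.
}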
10 changes: 8 additions & 2 deletions fastdeploy/backends/openvino/ov_backend.h
@@ -20,17 +20,20 @@
#include <vector>

#include "fastdeploy/backends/backend.h"
#include "fastdeploy/utils/unique_ptr.h"
#include "openvino/openvino.hpp"

namespace fastdeploy {

struct OpenVINOBackendOption {
int cpu_thread_num = 8;
int cpu_thread_num = -1;  // -1 keeps OpenVINO's default number of CPU threads
int ov_num_streams = 1;   // -1 = ov::streams::AUTO, -2 = ov::streams::NUMA, >0 = explicit count
std::map<std::string, std::vector<int64_t>> shape_infos;
};

class OpenVINOBackend : public BaseBackend {
public:
static ov::Core core_;
OpenVINOBackend() {}
virtual ~OpenVINOBackend() = default;

@@ -54,10 +57,13 @@ class OpenVINOBackend : public BaseBackend {
std::vector<TensorInfo> GetInputInfos() override;
std::vector<TensorInfo> GetOutputInfos() override;

std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) override;

private:
void InitTensorInfo(const std::vector<ov::Output<ov::Node>>& ov_outputs,
std::map<std::string, TensorInfo>* tensor_infos);
ov::Core core_;

ov::CompiledModel compiled_model_;
ov::InferRequest request_;
OpenVINOBackendOption option_;
24 changes: 24 additions & 0 deletions fastdeploy/backends/paddle/paddle_backend.cc
@@ -216,6 +216,30 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
return true;
}

std::unique_ptr<BaseBackend> PaddleBackend::Clone(void *stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend = utils::make_unique<PaddleBackend>();
auto casted_backend = dynamic_cast<PaddleBackend*>(new_backend.get());
if (device_id > 0 && option_.use_gpu && device_id != option_.gpu_id) {
auto clone_option = option_;
clone_option.gpu_id = device_id;
clone_option.external_stream_ = stream;
casted_backend->InitFromPaddle(clone_option.model_file,
clone_option.params_file,
clone_option);
FDWARNING << "The target device id:"
<< device_id
<< " is different from current device id:"
<< option_.gpu_id
<< ", cannot share memory with current engine."
<< std::endl;
return new_backend;
}
casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
casted_backend->outputs_desc_.assign(outputs_desc_.begin(), outputs_desc_.end());
casted_backend->predictor_ = std::move(predictor_->Clone(stream));
return new_backend;
}

#ifdef ENABLE_TRT_BACKEND
void PaddleBackend::SetTRTDynamicShapeToConfig(const PaddleBackendOption& option) {
std::map<std::string, std::vector<int>> max_shape;
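
As a usage sketch (not part of the diff; CloneAcrossGpus and the device-id list are hypothetical), the branch above means a clone requested on the backend's own GPU shares weights through predictor_->Clone(), while a clone on a different GPU is rebuilt from option_.model_file / option_.params_file:

#include <memory>
#include <vector>

#include "fastdeploy/backends/paddle/paddle_backend.h"

// Clone one Paddle backend per requested GPU. Clones whose device id matches
// the original share the predictor's weights; others are re-initialized, and
// PaddleBackend::Clone() logs a warning that memory cannot be shared.
std::vector<std::unique_ptr<fastdeploy::BaseBackend>> CloneAcrossGpus(
    fastdeploy::PaddleBackend* origin, const std::vector<int>& device_ids) {
  std::vector<std::unique_ptr<fastdeploy::BaseBackend>> clones;
  for (int id : device_ids) {
    clones.push_back(origin->Clone(/*stream=*/nullptr, /*device_id=*/id));
  }
  return clones;
}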
7 changes: 7 additions & 0 deletions fastdeploy/backends/paddle/paddle_backend.h
@@ -24,6 +24,7 @@
#include "paddle2onnx/converter.h"
#endif
#include "paddle_inference_api.h" // NOLINT
#include "fastdeploy/utils/unique_ptr.h"

#ifdef ENABLE_TRT_BACKEND
#include "fastdeploy/backends/tensorrt/trt_backend.h"
@@ -43,6 +44,9 @@ struct IpuOption {
};

struct PaddleBackendOption {
std::string model_file = ""; // Path of model file
std::string params_file = ""; // Path of parameters file, can be empty

#ifdef WITH_GPU
bool use_gpu = true;
#else
@@ -110,6 +114,9 @@ class PaddleBackend : public BaseBackend {

int NumOutputs() const override { return outputs_desc_.size(); }

std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) override;

TensorInfo GetInputInfo(int index) override;
TensorInfo GetOutputInfo(int index) override;
std::vector<TensorInfo> GetInputInfos() override;
55 changes: 53 additions & 2 deletions fastdeploy/backends/tensorrt/trt_backend.cc
@@ -285,6 +285,7 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,
BuildTrtEngine();
}

cudaSetDevice(option_.gpu_id);
SetInputs(inputs);
AllocateOutputsBuffer(outputs);

@@ -356,13 +357,17 @@ void TrtBackend::GetInputOutputInfo() {
outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
casted_output_tensors_[name] = FDTensor();
}
io_name_index_[name] = i;
}
bindings_.resize(num_binds);
}

void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
for (const auto& item : inputs) {
auto idx = engine_->getBindingIndex(item.name.c_str());
// auto idx = engine_->getBindingIndex(item.name.c_str());
auto iter = io_name_index_.find(item.name);
FDASSERT(iter != io_name_index_.end(), "TrtBackend::SetInputs() cannot find input name: %s.", item.name.c_str());
auto idx = iter->second;
std::vector<int> shape(item.shape.begin(), item.shape.end());
auto dims = ToDims(shape);
context_->setBindingDimensions(idx, dims);
@@ -410,7 +415,10 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
outputs->resize(outputs_desc_.size());
}
for (size_t i = 0; i < outputs_desc_.size(); ++i) {
auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
// auto idx = engine_->getBindingIndex(outputs_desc_[i].name.c_str());
auto idx_iter = io_name_index_.find(outputs_desc_[i].name);
FDASSERT(idx_iter != io_name_index_.end(), "TrtBackend::AllocateOutputsBuffer() cannot find output name: %s.", outputs_desc_[i].name.c_str());
auto idx = idx_iter->second;
auto output_dims = context_->getBindingDimensions(idx);

// find the original index of output
@@ -673,4 +681,47 @@ std::vector<TensorInfo> TrtBackend::GetOutputInfos() {
return infos;
}

std::unique_ptr<BaseBackend> TrtBackend::Clone(void *stream, int device_id) {
std::unique_ptr<BaseBackend> new_backend = utils::make_unique<TrtBackend>();
auto casted_backend = dynamic_cast<TrtBackend*>(new_backend.get());
if (device_id > 0 && device_id != option_.gpu_id) {
auto clone_option = option_;
clone_option.gpu_id = device_id;
clone_option.external_stream_ = stream;
if (option_.model_format == ModelFormat::ONNX) {
FDASSERT(casted_backend->InitFromOnnx(option_.model_file, clone_option),
"Clone model from ONNX failed while initialize TrtBackend.");
} else {
FDASSERT(casted_backend->InitFromPaddle(option_.model_file,
option_.params_file, clone_option),
"Clone model from Paddle failed while initialize TrtBackend.");
}
FDWARNING << "The target device id:"
<< device_id
<< " is different from current device id:"
<< option_.gpu_id
<< ", cannot share memory with current engine."
<< std::endl;
return new_backend;
}
cudaSetDevice(option_.gpu_id);
casted_backend->option_.gpu_id = option_.gpu_id;
if (stream) {
casted_backend->stream_ = reinterpret_cast<cudaStream_t>(stream);
} else {
FDASSERT(cudaStreamCreate(&casted_backend->stream_) == 0,
"Error occurred while calling cudaStreamCreate() during TrtBackend::Clone().");
}
casted_backend->inputs_desc_.assign(inputs_desc_.begin(), inputs_desc_.end());
casted_backend->outputs_desc_.assign(outputs_desc_.begin(), outputs_desc_.end());
casted_backend->outputs_order_.insert(outputs_order_.begin(), outputs_order_.end());
casted_backend->shape_range_info_.insert(shape_range_info_.begin(), shape_range_info_.end());
casted_backend->engine_ = engine_;
casted_backend->context_ = std::shared_ptr<nvinfer1::IExecutionContext>(
casted_backend->engine_->createExecutionContext());
casted_backend->GetInputOutputInfo();
FDINFO << "TRTBackend clone finish." << std::endl;
return new_backend;
}

} // namespace fastdeploy
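
A small caller-side sketch (an assumption, not in this commit; CloneWithOwnStream is hypothetical): when the clone stays on the same GPU, TrtBackend::Clone() reuses engine_ and only creates a fresh IExecutionContext, so giving each clone its own CUDA stream is enough to run inference concurrently.

#include <cuda_runtime_api.h>

#include <memory>

#include "fastdeploy/backends/tensorrt/trt_backend.h"

// Create a dedicated CUDA stream and clone the backend onto it.
// device_id = -1 keeps the clone on the original GPU, so the TensorRT engine
// is shared. The caller owns `stream` and must destroy it after the clone.
std::unique_ptr<fastdeploy::BaseBackend> CloneWithOwnStream(
    fastdeploy::TrtBackend* origin, cudaStream_t* stream) {
  if (cudaStreamCreate(stream) != cudaSuccess) {
    return nullptr;
  }
  return origin->Clone(*stream, /*device_id=*/-1);
}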
11 changes: 10 additions & 1 deletion fastdeploy/backends/tensorrt/trt_backend.h
@@ -25,6 +25,7 @@
#include "NvOnnxParser.h"
#include "fastdeploy/backends/backend.h"
#include "fastdeploy/backends/tensorrt/utils.h"
#include "fastdeploy/utils/unique_ptr.h"

class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
public:
@@ -45,7 +46,7 @@ class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {

void writeCalibrationCache(const void* cache,
size_t length) noexcept override {
std::cout << "NOT IMPLEMENT." << std::endl;
fastdeploy::FDERROR << "NOT IMPLEMENTED." << std::endl;
}

private:
@@ -62,6 +63,11 @@ struct TrtValueInfo {
};

struct TrtBackendOption {
std::string model_file = ""; // Path of model file
std::string params_file = ""; // Path of parameters file, can be empty
// format of input model
ModelFormat model_format = ModelFormat::AUTOREC;

int gpu_id = 0;
bool enable_fp16 = false;
bool enable_int8 = false;
@@ -99,6 +105,8 @@ class TrtBackend : public BaseBackend {
TensorInfo GetOutputInfo(int index);
std::vector<TensorInfo> GetInputInfos() override;
std::vector<TensorInfo> GetOutputInfos() override;
std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) override;

~TrtBackend() {
if (parser_) {
@@ -119,6 +127,7 @@ class TrtBackend : public BaseBackend {
std::vector<TrtValueInfo> outputs_desc_;
std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;
std::map<std::string, int> io_name_index_;

std::string calibration_str_;

27 changes: 27 additions & 0 deletions fastdeploy/core/fd_type.cc
@@ -182,4 +182,31 @@ const FDDataType TypeToDataType<uint8_t>::dtype = UINT8;
template <>
const FDDataType TypeToDataType<int8_t>::dtype = INT8;

std::string Str(const ModelFormat& f) {
if (f == ModelFormat::PADDLE) {
return "ModelFormat::PADDLE";
} else if (f == ModelFormat::ONNX) {
return "ModelFormat::ONNX";
} else if (f == ModelFormat::RKNN) {
return "ModelFormat::RKNN";
} else if (f == ModelFormat::TORCHSCRIPT) {
return "ModelFormat::TORCHSCRIPT";
}
return "UNKNOWN-ModelFormat";
}

std::ostream& operator<<(std::ostream& out, const ModelFormat& format) {
if (format == ModelFormat::PADDLE) {
out << "ModelFormat::PADDLE";
} else if (format == ModelFormat::ONNX) {
out << "ModelFormat::ONNX";
} else if (format == ModelFormat::RKNN) {
out << "ModelFormat::RKNN";
} else if (format == ModelFormat::TORCHSCRIPT) {
out << "ModelFormat::TORCHSCRIPT";
} else {
out << "UNKNOWN-ModelFormat";
}
return out;
}

} // namespace fastdeploy
12 changes: 12 additions & 0 deletions fastdeploy/core/fd_type.h
@@ -65,4 +65,16 @@ struct FASTDEPLOY_DECL TypeToDataType {
static const FDDataType dtype;
};

/*! Deep learning model format */
enum ModelFormat {
AUTOREC, ///< Automatically recognize the model format from the model file name
PADDLE, ///< Model in PaddlePaddle format
ONNX, ///< Model in ONNX format
RKNN, ///< Model in RKNN format
TORCHSCRIPT, ///< Model in TorchScript format
};

FASTDEPLOY_DECL std::ostream& operator<<(std::ostream& out,
const ModelFormat& format);

} // namespace fastdeploy
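
A tiny usage sketch of the new ModelFormat enum and its stream operator (only operator<< is visible as declared in this header; whether Str() is also exported here cannot be seen in the diff):

#include <iostream>

#include "fastdeploy/core/fd_type.h"

int main() {
  fastdeploy::ModelFormat format = fastdeploy::ModelFormat::ONNX;
  // Prints "ModelFormat::ONNX" through the operator<< declared above.
  std::cout << format << std::endl;
  return 0;
}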