From b30f62af360e90c48863aa912658c07210500784 Mon Sep 17 00:00:00 2001
From: Zheng-Bicheng <58363586+Zheng-Bicheng@users.noreply.github.com>
Date: Thu, 13 Apr 2023 16:37:36 +0800
Subject: [PATCH] [Backend] Refactoring RKNPU2 Backend code (#1772)

* update rknpu2 runtime
* update rknpu2 runtime
* update rknpu2 runtime
* update for rknpu2 backend
* update for rknpu2 backend

---------

Co-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com>
---
 docs/cn/build_and_install/rknpu2.md           |   2 +-
 fastdeploy/runtime/backends/rknpu2/option.h   |  34 +-
 .../runtime/backends/rknpu2/rknpu2_backend.cc | 478 +++++++++++-------
 .../runtime/backends/rknpu2/rknpu2_backend.h  | 139 ++++-
 4 files changed, 422 insertions(+), 231 deletions(-)

diff --git a/docs/cn/build_and_install/rknpu2.md b/docs/cn/build_and_install/rknpu2.md
index 83c06b7635..cee9a7396c 100644
--- a/docs/cn/build_and_install/rknpu2.md
+++ b/docs/cn/build_and_install/rknpu2.md
@@ -88,7 +88,7 @@ cd FastDeploy
 git checkout develop
 
 mkdir build && cd build
-cmake .. -DENABLE_ORT_BACKEND=ON \
+cmake .. -DENABLE_ORT_BACKEND=OFF \
         -DENABLE_RKNPU2_BACKEND=ON \
         -DENABLE_VISION=ON \
         -DRKNN2_TARGET_SOC=RK3588 \
diff --git a/fastdeploy/runtime/backends/rknpu2/option.h b/fastdeploy/runtime/backends/rknpu2/option.h
index 7b641a21f5..665ad820de 100644
--- a/fastdeploy/runtime/backends/rknpu2/option.h
+++ b/fastdeploy/runtime/backends/rknpu2/option.h
@@ -21,32 +21,28 @@ typedef enum _rknpu2_cpu_name {
   UNDEFINED,
 } CpuName;
 
-/*! RKNPU2 core mask for mobile device. */
+/* The specification of the NPU core setting. It has the following choices:
+ * RKNN_NPU_CORE_AUTO: Automatic mode, meaning that the idle core inside the
+ * NPU will be selected.
+ * RKNN_NPU_CORE_0: Running on the NPU0 core.
+ * RKNN_NPU_CORE_1: Running on the NPU1 core.
+ * RKNN_NPU_CORE_2: Running on the NPU2 core.
+ * RKNN_NPU_CORE_0_1: Running on both the NPU0 and NPU1 cores simultaneously.
+ * RKNN_NPU_CORE_0_1_2: Running on NPU0, NPU1 and NPU2 simultaneously.
+ */
 typedef enum _rknpu2_core_mask {
-  RKNN_NPU_CORE_AUTO = 0,  //< default, run on NPU core randomly.
-  RKNN_NPU_CORE_0 = 1,     //< run on NPU core 0.
-  RKNN_NPU_CORE_1 = 2,     //< run on NPU core 1.
-  RKNN_NPU_CORE_2 = 4,     //< run on NPU core 2.
-  RKNN_NPU_CORE_0_1 =
-      RKNN_NPU_CORE_0 | RKNN_NPU_CORE_1,  //< run on NPU core 1 and core 2.
-  RKNN_NPU_CORE_0_1_2 =
-      RKNN_NPU_CORE_0_1 | RKNN_NPU_CORE_2,  //< run on NPU core 1 and core 2.
+  RKNN_NPU_CORE_AUTO = 0,
+  RKNN_NPU_CORE_0 = 1,
+  RKNN_NPU_CORE_1 = 2,
+  RKNN_NPU_CORE_2 = 4,
+  RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 | RKNN_NPU_CORE_1,
+  RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 | RKNN_NPU_CORE_2,
   RKNN_NPU_CORE_UNDEFINED,
 } CoreMask;
 } // namespace rknpu2
 
 struct RKNPU2BackendOption {
   rknpu2::CpuName cpu_name = rknpu2::CpuName::RK3588;
-
-  // The specification of NPU core setting.It has the following choices :
-  // RKNN_NPU_CORE_AUTO : Referring to automatic mode, meaning that it will
-  // select the idle core inside the NPU.
-  // RKNN_NPU_CORE_0 : Running on the NPU0 core
-  // RKNN_NPU_CORE_1: Runing on the NPU1 core
-  // RKNN_NPU_CORE_2: Runing on the NPU2 core
-  // RKNN_NPU_CORE_0_1: Running on both NPU0 and NPU1 core simultaneously.
-  // RKNN_NPU_CORE_0_1_2: Running on both NPU0, NPU1 and NPU2 simultaneously.
rknpu2::CoreMask core_mask = rknpu2::CoreMask::RKNN_NPU_CORE_AUTO; }; - } // namespace fastdeploy diff --git a/fastdeploy/runtime/backends/rknpu2/rknpu2_backend.cc b/fastdeploy/runtime/backends/rknpu2/rknpu2_backend.cc index 5bc9171a83..5e744ff4e4 100644 --- a/fastdeploy/runtime/backends/rknpu2/rknpu2_backend.cc +++ b/fastdeploy/runtime/backends/rknpu2/rknpu2_backend.cc @@ -12,201 +12,312 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "fastdeploy/runtime/backends/rknpu2/rknpu2_backend.h" - -#include "fastdeploy/utils/perf.h" namespace fastdeploy { RKNPU2Backend::~RKNPU2Backend() { - // Release memory uniformly here - if (input_attrs_ != nullptr) { - free(input_attrs_); - } + if (tensor_attrs_init_) { + if (input_attrs_ != nullptr) { + free(input_attrs_); + } - if (output_attrs_ != nullptr) { - free(output_attrs_); + if (output_attrs_ != nullptr) { + free(output_attrs_); + } } - for (uint32_t i = 0; i < io_num.n_input; i++) { - rknn_destroy_mem(ctx, input_mems_[i]); + if (tensor_memory_init_) { + for (uint32_t i = 0; i < io_num_.n_input; i++) { + rknn_destroy_mem(ctx_, input_mems_[i]); + } + + for (uint32_t i = 0; i < io_num_.n_output; i++) { + rknn_destroy_mem(ctx_, output_mems_[i]); + } } - if (input_mems_ != nullptr) { - free(input_mems_); +} + +/* + * @name RuntimeOptionIsApplicable + * @brief This function is used to determine whether the RuntimeOption + * meets the operating conditions of RKNPU2. + * @param None + * @return bool + * @note None + */ +bool RKNPU2Backend::RuntimeOptionIsApplicable( + const RuntimeOption& runtime_option) { + if (!Supported(runtime_option.model_format, Backend::RKNPU2)) { + FDERROR << "The model format is not supported for RKNPU2." << std::endl; + return false; } - for (uint32_t i = 0; i < io_num.n_output; i++) { - rknn_destroy_mem(ctx, output_mems_[i]); + if (!Supported(runtime_option.device, Backend::RKNPU2)) { + FDERROR << "The device is not supported for RKNPU2." << std::endl; + return false; } - if (output_mems_ != nullptr) { - free(output_mems_); + + if (runtime_option.model_from_memory_) { + FDERROR << "RKNPU2 backend doesn't support load model from memory, please " + "load model from disk." + << std::endl; + return false; } + return true; } -/*************************************************************** + +/* * @name GetSDKAndDeviceVersion - * @brief get RKNN sdk and device version + * @brief Get RKNPU2 sdk and device version. * @param None * @return bool - * @note None - ***************************************************************/ + * @note The private variable ctx_ must be initialized. + */ bool RKNPU2Backend::GetSDKAndDeviceVersion() { int ret; - // get sdk and device version - ret = rknn_query(ctx, RKNN_QUERY_SDK_VERSION, &sdk_ver, sizeof(sdk_ver)); + ret = rknn_query(ctx_, RKNN_QUERY_SDK_VERSION, &sdk_ver_, sizeof(sdk_ver_)); if (ret != RKNN_SUCC) { - printf("rknn_query fail! ret=%d\n", ret); + FDERROR << "The function(rknn_query) failed! ret=" << ret << std::endl; return false; } - FDINFO << "rknn_api/rknnrt version: " << sdk_ver.api_version - << ", driver version: " << sdk_ver.drv_version << std::endl; + FDINFO << "rknpu2 runtime version: " << sdk_ver_.api_version << std::endl; + FDINFO << "rknpu2 driver version: " << sdk_ver_.drv_version << std::endl; return true; } -/*************************************************************** +/* * @name BuildOption - * @brief save option + * @brief Save option and set core mask. 
 * @param RKNPU2BackendOption
 * @note None
- ***************************************************************/
+ */
 void RKNPU2Backend::BuildOption(const RKNPU2BackendOption& option) {
-  this->option_ = option;
+  option_ = option;
   // save cpu_name
-  this->option_.cpu_name = option.cpu_name;
+  option_.cpu_name = option.cpu_name;
   // save context
-  this->option_.core_mask = option.core_mask;
+  option_.core_mask = option.core_mask;
+
+  // set core mask
+  if (option_.cpu_name == rknpu2::CpuName::RK3588) {
+    if (!SetCoreMask(option_.core_mask)) {
+      FDERROR << "Set core mask failed" << std::endl;
+    }
+  }
 }
 
 /***************************************************************
 * @name Init
 * @brief Initialize RKNN model
 * @param model_file: Binary data for the RKNN model or the path of RKNN
- *model. params_file: None option: config
 * @return bool
 * @note None
 ***************************************************************/
 bool RKNPU2Backend::Init(const RuntimeOption& runtime_option) {
-  if (!(Supported(runtime_option.model_format, Backend::RKNPU2) &&
-        Supported(runtime_option.device, Backend::RKNPU2))) {
+  if (!RuntimeOptionIsApplicable(runtime_option)) {
+    FDERROR << "Runtime option is not applicable." << std::endl;
     return false;
   }
-  if (runtime_option.model_from_memory_) {
-    FDERROR << "RKNPU2 backend doesn't support load model from memory, please "
-               "load model from disk."
-            << std::endl;
+
+  if (!LoadModel((char*)runtime_option.model_file.data())) {
+    FDERROR << "Load model failed" << std::endl;
     return false;
   }
-  // LoadModel
-  if (!this->LoadModel((char*)runtime_option.model_file.data())) {
-    FDERROR << "load model failed" << std::endl;
+  if (!InitInputAndOutputNumber()) {
+    FDERROR << "Init input and output number failed" << std::endl;
     return false;
   }
-  // GetSDKAndDeviceVersion
-  if (!this->GetSDKAndDeviceVersion()) {
-    FDERROR << "get SDK and device version failed" << std::endl;
+  if (!GetSDKAndDeviceVersion()) {
+    FDERROR << "Get SDK and device version failed" << std::endl;
     return false;
   }
-  // BuildOption
-  this->BuildOption(runtime_option.rknpu2_option);
+  BuildOption(runtime_option.rknpu2_option);
 
-  // SetCoreMask if RK3588
-  if (this->option_.cpu_name == rknpu2::CpuName::RK3588) {
-    if (!this->SetCoreMask(option_.core_mask)) {
-      FDERROR << "set core mask failed" << std::endl;
-      return false;
-    }
-  }
-
-  // GetModelInputOutputInfos
-  if (!this->GetModelInputOutputInfos()) {
-    FDERROR << "get model input output infos failed" << std::endl;
+  if (!InitInputAndOutputInformation()) {
+    FDERROR << "Get model input output information failed" << std::endl;
     return false;
   }
   return true;
 }
 
-/***************************************************************
+/*
 * @name SetCoreMask
- * @brief set NPU core for model
+ * @brief Set NPU core for model
 * @param core_mask: The specification of NPU core setting.
 * @return bool
 * @note Only support RK3588
- ***************************************************************/
+ */
 bool RKNPU2Backend::SetCoreMask(const rknpu2::CoreMask& core_mask) const {
-  int ret = rknn_set_core_mask(ctx, static_cast<rknn_core_mask>(core_mask));
+  if (option_.cpu_name != rknpu2::CpuName::RK3588) {
+    FDINFO << "SetCoreMask only supports the RK3588 SoC." << std::endl;
+    return false;
+  }
+
+  int ret = rknn_set_core_mask(ctx_, static_cast<rknn_core_mask>(core_mask));
  if (ret != RKNN_SUCC) {
-    FDERROR << "rknn_set_core_mask fail! ret=" << ret << std::endl;
+    FDERROR << "The function(rknn_set_core_mask) failed!
ret=" << ret + << std::endl; return false; } return true; } -/*************************************************************** +/* * @name LoadModel - * @brief read rknn model + * @brief Read the model and initialize rknn context. * @param model: Binary data for the RKNN model or the path of RKNN model. * @return bool * @note None - ***************************************************************/ + */ bool RKNPU2Backend::LoadModel(void* model) { int ret = RKNN_SUCC; - ret = rknn_init(&ctx, model, 0, 0, nullptr); + ret = rknn_init(&ctx_, model, 0, 0, nullptr); if (ret != RKNN_SUCC) { - FDERROR << "rknn_init fail! ret=" << ret << std::endl; + FDERROR << "The function(rknn_init) failed! ret=" << ret << std::endl; return false; } return true; } -/*************************************************************** - * @name GetModelInputOutputInfos - * @brief Get the detailed input and output infos of Model - * @param None +/* + * @name InitInputAndOutputNumber + * @brief Initialize io_num_. + * @param * @return bool - * @note None - ***************************************************************/ -bool RKNPU2Backend::GetModelInputOutputInfos() { + * @note The private variable ctx must be initialized to use this + * function. + */ +bool RKNPU2Backend::InitInputAndOutputNumber() { + if (io_num_init_) { + FDERROR << "The private variable io_num_ has been initialized." + << std::endl; + return false; + } int ret = RKNN_SUCC; - - // Get the number of model inputs and outputs - ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + ret = rknn_query(ctx_, RKNN_QUERY_IN_OUT_NUM, &io_num_, sizeof(io_num_)); if (ret != RKNN_SUCC) { + FDERROR << "The function(rknn_query) failed! ret=" << ret << std::endl; return false; } + io_num_init_ = true; + return true; +} - // Get detailed input parameters - input_attrs_ = - (rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num.n_input); - memset(input_attrs_, 0, io_num.n_input * sizeof(rknn_tensor_attr)); - inputs_desc_.resize(io_num.n_input); +/* + * @name InitRKNNTensorAddress + * @brief Allocate memory for input_attrs_ and output_attrs_. + * @param None + * @return bool + * @note None + */ +bool RKNPU2Backend::InitRKNNTensorAddress() { + if (tensor_attrs_init_) { + FDERROR << "Private variable input_attrs_ and output_attrs_ memory has " + "been allocated. Please do not allocate memory repeatedly or " + "memory leak may occur." + << std::endl; + return false; + } - // create input tensor memory - // rknn_tensor_mem* input_mems[io_num.n_input]; - input_mems_ = - (rknn_tensor_mem**)malloc(sizeof(rknn_tensor_mem*) * io_num.n_input); + if (!io_num_init_) { + InitInputAndOutputNumber(); + } - // get input info and copy to input tensor info - for (uint32_t i = 0; i < io_num.n_input; i++) { - input_attrs_[i].index = i; + if (io_num_.n_input == 0) { + FDERROR << "The number of input tensors is 0." << std::endl; + return false; + } - // query info - ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs_[i]), + if (io_num_.n_output == 0) { + FDERROR << "The number of output tensors is 0." << std::endl; + return false; + } + + // Allocate memory for private variable input_attrs_. 
+ input_attrs_ = + (rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num_.n_input); + memset(input_attrs_, 0, io_num_.n_input * sizeof(rknn_tensor_attr)); + for (uint32_t i = 0; i < io_num_.n_input; i++) { + int ret = RKNN_SUCC; + input_attrs_[i].index = i; + ret = rknn_query(ctx_, RKNN_QUERY_INPUT_ATTR, &(input_attrs_[i]), sizeof(rknn_tensor_attr)); - DumpTensorAttr(input_attrs_[i]); if (ret != RKNN_SUCC) { - printf("rknn_init error! ret=%d\n", ret); + FDERROR << "The function(rknn_query) failed! ret=" << ret << std::endl; return false; } + if ((input_attrs_[i].fmt != RKNN_TENSOR_NHWC) && (input_attrs_[i].fmt != RKNN_TENSOR_UNDEFINED)) { FDERROR << "rknpu2_backend only support input format is NHWC or UNDEFINED" << std::endl; + return false; } - // copy input_attrs_ to input tensor info + DumpTensorAttr(input_attrs_[i]); + } + + // Allocate memory for private variable output_attrs_. + output_attrs_ = + (rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num_.n_output); + memset(output_attrs_, 0, io_num_.n_output * sizeof(rknn_tensor_attr)); + for (uint32_t i = 0; i < io_num_.n_output; i++) { + int ret = RKNN_SUCC; + output_attrs_[i].index = i; + ret = rknn_query(ctx_, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs_[i]), + sizeof(rknn_tensor_attr)); + + if (ret != RKNN_SUCC) { + FDERROR << "The function(rknn_query) failed! ret=" << ret << std::endl; + return false; + } + + // FastDeploy Only support postprocess when output type is fp32, + // so output_attrs_.type needs to be fixed as RKNN_TENSOR_FLOAT32. + output_attrs_[i].type = RKNN_TENSOR_FLOAT32; + DumpTensorAttr(output_attrs_[i]); + } + tensor_attrs_init_ = true; + return true; +} + +/* + * @name InitInputAndOutputInformation + * @brief Get the detailed input and output information of Model + * @param None + * @return bool + * @note None + */ +bool RKNPU2Backend::InitInputAndOutputInformation() { + if (!io_num_init_) { + InitInputAndOutputNumber(); + } + + if (!tensor_attrs_init_) { + InitRKNNTensorAddress(); + } + + if (io_num_.n_input == 0) { + FDERROR << "The number of input tensors is 0." << std::endl; + return false; + } + + if (io_num_.n_output == 0) { + FDERROR << "The number of output tensors is 0." << std::endl; + return false; + } + + inputs_desc_.resize(io_num_.n_input); + outputs_desc_.resize(io_num_.n_output); + + // Get input info and copy to input tensor info + for (uint32_t i = 0; i < io_num_.n_input; i++) { + // Copy input_attrs_ to input tensor info std::string temp_name = input_attrs_[i].name; std::vector temp_shape{}; temp_shape.resize(input_attrs_[i].n_dims); @@ -220,37 +331,15 @@ bool RKNPU2Backend::GetModelInputOutputInfos() { inputs_desc_[i] = temp_input_info; } - // Get detailed output parameters - output_attrs_ = - (rknn_tensor_attr*)malloc(sizeof(rknn_tensor_attr) * io_num.n_output); - memset(output_attrs_, 0, io_num.n_output * sizeof(rknn_tensor_attr)); - outputs_desc_.resize(io_num.n_output); - - // Create output tensor memory - output_mems_ = - (rknn_tensor_mem**)malloc(sizeof(rknn_tensor_mem*) * io_num.n_output); - ; - - for (uint32_t i = 0; i < io_num.n_output; i++) { - output_attrs_[i].index = i; - // query info - ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs_[i]), - sizeof(rknn_tensor_attr)); - DumpTensorAttr(output_attrs_[i]); - - if (ret != RKNN_SUCC) { - FDERROR << "rknn_query fail! ret = " << ret << std::endl; - return false; - } - + for (uint32_t i = 0; i < io_num_.n_output; i++) { // If the output dimension is 3, the runtime will automatically change it // to 4. 
Obviously, this is wrong, and manual correction is required here. - int n_dims = output_attrs_[i].n_dims; + int n_dims = static_cast(output_attrs_[i].n_dims); if ((n_dims == 4) && (output_attrs_[i].dims[3] == 1)) { n_dims--; } - // copy output_attrs_ to output tensor + // Copy output_attrs_ to output tensor std::string temp_name = output_attrs_[i].name; std::vector temp_shape{}; temp_shape.resize(n_dims); @@ -266,13 +355,13 @@ bool RKNPU2Backend::GetModelInputOutputInfos() { return true; } -/*************************************************************** +/* * @name DumpTensorAttr * @brief Get the model's detailed inputs and outputs * @param rknn_tensor_attr * @return None * @note None - ***************************************************************/ + */ void RKNPU2Backend::DumpTensorAttr(rknn_tensor_attr& attr) { printf( "index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], " @@ -305,81 +394,98 @@ std::vector RKNPU2Backend::GetOutputInfos() { return outputs_desc_; } -bool RKNPU2Backend::Infer(std::vector& inputs, - std::vector* outputs, bool copy_to_fd) { - int ret = RKNN_SUCC; - // Judge whether the input and output size are the same - if (inputs.size() != inputs_desc_.size()) { - FDERROR << "[RKNPU2Backend] Size of the inputs(" << inputs.size() - << ") should keep same with the inputs of this model(" - << inputs_desc_.size() << ")." << std::endl; +/* + * @name InitRKNNTensorMemory + * @brief Allocate memory for input and output tensors. + * @param std::vector& inputs + * @return None + * @note None + */ +bool RKNPU2Backend::InitRKNNTensorMemory(std::vector& inputs) { + if (tensor_memory_init_) { + FDERROR << "Private variable input_mems_ and output_mems_ memory has " + "been allocated. Please do not allocate memory repeatedly or " + "memory leak may occur." + << std::endl; return false; } + int ret = RKNN_SUCC; + input_mems_.resize(io_num_.n_input); + output_mems_.resize(io_num_.n_output); + for (uint32_t i = 0; i < io_num_.n_input; i++) { + // Judge whether the input and output types are the same + rknn_tensor_type input_type = + fastdeploy::RKNPU2Backend::FDDataTypeToRknnTensorType(inputs[i].dtype); + if (input_type != input_attrs_[i].type) { + FDWARNING << "The input tensor type != model's inputs type." + << "The input_type need " + << get_type_string(input_attrs_[i].type) << ",but inputs[" << i + << "].type is " << get_type_string(input_type) << std::endl; + } - if (!this->infer_init) { - for (uint32_t i = 0; i < io_num.n_input; i++) { - // Judge whether the input and output types are the same - rknn_tensor_type input_type = - fastdeploy::RKNPU2Backend::FDDataTypeToRknnTensorType( - inputs[i].dtype); - if (input_type != input_attrs_[i].type) { - FDWARNING << "The input tensor type != model's inputs type." - << "The input_type need " - << get_type_string(input_attrs_[i].type) << ",but inputs[" - << i << "].type is " << get_type_string(input_type) - << std::endl; - } - - // Create input tensor memory - input_attrs_[i].type = input_type; - input_attrs_[i].size = inputs[i].Nbytes(); - input_attrs_[i].size_with_stride = inputs[i].Nbytes(); + // Create input tensor memory + input_attrs_[i].type = input_type; + input_attrs_[i].size = inputs[i].Nbytes(); + input_attrs_[i].size_with_stride = inputs[i].Nbytes(); - input_mems_[i] = rknn_create_mem(ctx, inputs[i].Nbytes()); - if (input_mems_[i] == nullptr) { - FDERROR << "rknn_create_mem input_mems_ error." 
<< std::endl; - return false; - } + input_mems_[i] = rknn_create_mem(ctx_, inputs[i].Nbytes()); + if (input_mems_[i] == nullptr) { + FDERROR << "The function(rknn_create_mem) failed! ret=" << ret + << std::endl; + return false; + } - // Set input tensor memory - ret = rknn_set_io_mem(ctx, input_mems_[i], &input_attrs_[i]); - if (ret != RKNN_SUCC) { - FDERROR << "input tensor memory rknn_set_io_mem fail! ret=" << ret - << std::endl; - return false; - } + // Set input tensor memory + ret = rknn_set_io_mem(ctx_, input_mems_[i], &input_attrs_[i]); + if (ret != RKNN_SUCC) { + FDERROR << "The function(rknn_set_io_mem) failed! ret=" << ret + << std::endl; + return false; } + } - for (uint32_t i = 0; i < io_num.n_output; ++i) { - // Most post-processing does not support the fp16 format. - // The unified output here is float32 - uint32_t output_size = output_attrs_[i].n_elems * sizeof(float); - output_mems_[i] = rknn_create_mem(ctx, output_size); - if (output_mems_[i] == nullptr) { - FDERROR << "rknn_create_mem output_mems_ error." << std::endl; - return false; - } + for (uint32_t i = 0; i < io_num_.n_output; ++i) { + // Most post-processing does not support the fp16 format. + uint32_t output_size = output_attrs_[i].n_elems * sizeof(float); + output_mems_[i] = rknn_create_mem(ctx_, output_size); + if (output_mems_[i] == nullptr) { + FDERROR << "The function(rknn_create_mem) failed! ret=" << ret + << std::endl; + return false; + } - // The data type of output data is changed to FP32 - output_attrs_[i].type = RKNN_TENSOR_FLOAT32; + // Set output tensor memory + ret = rknn_set_io_mem(ctx_, output_mems_[i], &output_attrs_[i]); + if (ret != RKNN_SUCC) { + FDERROR << "The function(rknn_set_io_mem) failed! ret=" << ret + << std::endl; + return false; + } + } - // default output type is depend on model, this requires float32 to - // compute top5 - ret = rknn_set_io_mem(ctx, output_mems_[i], &output_attrs_[i]); + tensor_memory_init_ = true; + return true; +} - // set output memory and attribute - if (ret != RKNN_SUCC) { - FDERROR << "output tensor memory rknn_set_io_mem fail! ret=" << ret - << std::endl; - return false; - } +bool RKNPU2Backend::Infer(std::vector& inputs, + std::vector* outputs, bool copy_to_fd) { + if (!tensor_memory_init_) { + if (!InitRKNNTensorMemory(inputs)) { + FDERROR << "Init tensor memory failed." << std::endl; } + } - this->infer_init = true; + int ret = RKNN_SUCC; + // Judge whether the input and output size are the same + if (inputs.size() != inputs_desc_.size()) { + FDERROR << "[RKNPU2Backend] Size of the inputs(" << inputs.size() + << ") should keep same with the inputs of this model(" + << inputs_desc_.size() << ")." << std::endl; + return false; } // Copy input data to input tensor memory - for (uint32_t i = 0; i < io_num.n_input; i++) { + for (uint32_t i = 0; i < io_num_.n_input; i++) { uint32_t width = input_attrs_[i].dims[2]; uint32_t stride = input_attrs_[i].w_stride; if (width == stride) { @@ -395,7 +501,7 @@ bool RKNPU2Backend::Infer(std::vector& inputs, } // run rknn - ret = rknn_run(ctx, nullptr); + ret = rknn_run(ctx_, nullptr); if (ret != RKNN_SUCC) { FDERROR << "rknn run error! ret=" << ret << std::endl; return false; @@ -418,14 +524,14 @@ bool RKNPU2Backend::Infer(std::vector& inputs, return true; } -/*************************************************************** +/* * @name RknnTensorTypeToFDDataType * @brief Change RknnTensorType To FDDataType * @param rknn_tensor_type * @return None * @note Most post-processing does not support the fp16 format. 
* Therefore, if the input is FP16, the output will be FP32. - ***************************************************************/ + */ FDDataType RKNPU2Backend::RknnTensorTypeToFDDataType(rknn_tensor_type type) { if (type == rknn_tensor_type::RKNN_TENSOR_FLOAT16) { return FDDataType::FP32; @@ -452,13 +558,13 @@ FDDataType RKNPU2Backend::RknnTensorTypeToFDDataType(rknn_tensor_type type) { return FDDataType::UNKNOWN1; } -/*************************************************************** +/* * @name FDDataTypeToRknnTensorType * @brief Change FDDataType To RknnTensorType * @param FDDataType * @return None * @note None - ***************************************************************/ + */ rknn_tensor_type RKNPU2Backend::FDDataTypeToRknnTensorType( fastdeploy::FDDataType type) { if (type == FDDataType::FP16) { diff --git a/fastdeploy/runtime/backends/rknpu2/rknpu2_backend.h b/fastdeploy/runtime/backends/rknpu2/rknpu2_backend.h index ccafc9c63d..91fce69028 100644 --- a/fastdeploy/runtime/backends/rknpu2/rknpu2_backend.h +++ b/fastdeploy/runtime/backends/rknpu2/rknpu2_backend.h @@ -13,9 +13,9 @@ // limitations under the License. #pragma once +#include "fastdeploy/core/fd_tensor.h" #include "fastdeploy/runtime/backends/backend.h" #include "fastdeploy/runtime/backends/rknpu2/option.h" -#include "fastdeploy/core/fd_tensor.h" #include "rknn_api.h" // NOLINT #include #include @@ -26,63 +26,152 @@ namespace fastdeploy { class RKNPU2Backend : public BaseBackend { public: + /***************************** BaseBackend API *****************************/ RKNPU2Backend() = default; - virtual ~RKNPU2Backend(); - bool Init(const RuntimeOption& runtime_option); - int NumInputs() const override { return static_cast(inputs_desc_.size()); } - int NumOutputs() const override { return static_cast(outputs_desc_.size()); } - TensorInfo GetInputInfo(int index) override; TensorInfo GetOutputInfo(int index) override; std::vector GetInputInfos() override; std::vector GetOutputInfos() override; bool Infer(std::vector& inputs, std::vector* outputs, bool copy_to_fd = true) override; + /***************************** BaseBackend API *****************************/ private: - // BaseBackend API - void BuildOption(const RKNPU2BackendOption& option); - - // RKNN API + /* + * @name RuntimeOptionIsApplicable + * @brief This function is used to determine whether the RuntimeOption + * meets the operating conditions of RKNPU2. + * @param None + * @return bool + * @note None + */ + bool RuntimeOptionIsApplicable(const RuntimeOption& runtime_option); + + /* + * @name LoadModel + * @brief Read the model and initialize rknn context. + * @param model: Binary data for the RKNN model or the path of RKNN model. + * @return bool + * @note None + */ bool LoadModel(void* model); + /* + * @name GetSDKAndDeviceVersion + * @brief Get RKNPU2 sdk and device version. + * @param None + * @return bool + * @note The private variable ctx must be initialized to use this function. + */ bool GetSDKAndDeviceVersion(); + /* + * @name BuildOption + * @brief Save option and set core mask. + * @param RKNPU2BackendOption + * @note None + */ + void BuildOption(const RKNPU2BackendOption& option); + + /* + * @name SetCoreMask + * @brief Set NPU core for model + * @param core_mask: The specification of NPU core setting. + * @return bool + * @note Only support RK3588 + */ bool SetCoreMask(const rknpu2::CoreMask& core_mask) const; - bool GetModelInputOutputInfos(); + /* + * @name InitInputAndOutputNumber + * @brief Initialize io_num_. 
+ * @param + * @return bool + * @note The private variable ctx must be initialized to use this function. + */ + bool InitInputAndOutputNumber(); + + /* + * @name InitRKNNTensorAddress + * @brief Allocate memory for input_attrs_ and output_attrs_. + * @param None + * @return bool + * @note None + */ + bool InitRKNNTensorAddress(); + + /* + * @name InitInputAndOutputInformation + * @brief Initialize inputs_desc_ and outputs_desc_. + * @param None + * @return bool + * @note None + */ + bool InitInputAndOutputInformation(); + + /* + * @name InitRKNNTensorMemory + * @brief Allocate memory for input and output tensors. + * @param std::vector& inputs + * @return None + * @note None + */ + bool InitRKNNTensorMemory(std::vector& inputs); + + rknn_context ctx_{}; + rknn_sdk_version sdk_ver_{}; + + rknn_input_output_num io_num_{0, 0}; - // The object of rknn context. - rknn_context ctx{}; - // The structure rknn_sdk_version is used to indicate the version - // information of the RKNN SDK. - rknn_sdk_version sdk_ver{}; - // The structure rknn_input_output_num represents the number of - // input and output Tensor - rknn_input_output_num io_num{}; std::vector inputs_desc_; std::vector outputs_desc_; rknn_tensor_attr* input_attrs_ = nullptr; rknn_tensor_attr* output_attrs_ = nullptr; - rknn_tensor_mem** input_mems_; - rknn_tensor_mem** output_mems_; + std::vector input_mems_; + std::vector output_mems_; - bool infer_init = false; + bool io_num_init_ = false; + bool tensor_attrs_init_ = false; + bool tensor_memory_init_ = false; RKNPU2BackendOption option_; - static void DumpTensorAttr(rknn_tensor_attr& attr); - static FDDataType RknnTensorTypeToFDDataType(rknn_tensor_type type); - static rknn_tensor_type FDDataTypeToRknnTensorType(FDDataType type); + /* + * @name DumpTensorAttr + * @brief Get the model's detailed inputs and outputs + * @param rknn_tensor_attr + * @return None + * @note None + */ + void DumpTensorAttr(rknn_tensor_attr& attr); + + /* + * @name RknnTensorTypeToFDDataType + * @brief Change RknnTensorType To FDDataType + * @param rknn_tensor_type + * @return None + * @note Most post-processing does not support the fp16 format. + * Therefore, if the input is FP16, the output will be FP32. + */ + FDDataType RknnTensorTypeToFDDataType(rknn_tensor_type type); + + /* + * @name FDDataTypeToRknnTensorType + * @brief Change FDDataType To RknnTensorType + * @param FDDataType + * @return None + * @note None + */ + rknn_tensor_type FDDataTypeToRknnTensorType(FDDataType type); }; } // namespace fastdeploy
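Reviewer note: the sketch below shows how the refactored backend is reached from user code. It is a minimal, untested sketch that assumes the existing public FastDeploy C++ API (RuntimeOption::SetModelPath, RuntimeOption::UseRKNPU2, ModelFormat::RKNN, Runtime::Infer); the model path and the chosen core mask are placeholders, so treat it as illustrative rather than as part of this patch.

#include <vector>

#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // RKNPU2 only loads .rknn models from disk; loading from memory is rejected
  // by RuntimeOptionIsApplicable(). The path below is a placeholder.
  option.SetModelPath("./model.rknn", "", fastdeploy::ModelFormat::RKNN);
  // Select the SoC and, on RK3588, the NPU core(s). BuildOption() forwards the
  // core mask to rknn_set_core_mask() through SetCoreMask().
  option.UseRKNPU2(fastdeploy::rknpu2::CpuName::RK3588,
                   fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_0);

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    return -1;
  }

  // One NHWC input tensor; the backend converts types via
  // FDDataTypeToRknnTensorType() and always returns FP32 outputs.
  fastdeploy::TensorInfo info = runtime.GetInputInfo(0);
  std::vector<int64_t> shape(info.shape.begin(), info.shape.end());
  std::vector<fastdeploy::FDTensor> inputs(1);
  std::vector<fastdeploy::FDTensor> outputs;
  inputs[0].Resize(shape, fastdeploy::FDDataType::FP32, info.name);
  // ... fill inputs[0].MutableData() with preprocessed image data here ...
  runtime.Infer(inputs, &outputs);
  return 0;
}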
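For reference, the initialization and inference flow that the new InitInputAndOutputNumber / InitRKNNTensorAddress / InitRKNNTensorMemory helpers wrap is the standard rknn_api zero-copy pattern, condensed below for a single input and output. This is a sketch only: the attribute bookkeeping, NHWC checks and error handling from the patch are omitted, and the model path argument is a placeholder.

#include "rknn_api.h"  // NOLINT

// Condensed zero-copy flow: query I/O once, create and bind buffers once,
// then reuse them for every rknn_run() call.
bool RunOnce(char* model_path) {
  rknn_context ctx = 0;
  if (rknn_init(&ctx, model_path, 0, 0, nullptr) != RKNN_SUCC) return false;

  rknn_input_output_num io_num{};
  rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));

  rknn_tensor_attr in_attr{};
  in_attr.index = 0;
  rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &in_attr, sizeof(in_attr));

  rknn_tensor_attr out_attr{};
  out_attr.index = 0;
  rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &out_attr, sizeof(out_attr));
  // The backend fixes outputs to FP32 because most postprocessing code does
  // not handle FP16.
  out_attr.type = RKNN_TENSOR_FLOAT32;

  // Allocate device-visible buffers once and bind them to the context.
  rknn_tensor_mem* in_mem = rknn_create_mem(ctx, in_attr.size_with_stride);
  rknn_tensor_mem* out_mem =
      rknn_create_mem(ctx, out_attr.n_elems * sizeof(float));
  rknn_set_io_mem(ctx, in_mem, &in_attr);
  rknn_set_io_mem(ctx, out_mem, &out_attr);

  // ... copy preprocessed NHWC data into in_mem->virt_addr here ...
  int ret = rknn_run(ctx, nullptr);
  // ... read FP32 results from out_mem->virt_addr here ...

  rknn_destroy_mem(ctx, in_mem);
  rknn_destroy_mem(ctx, out_mem);
  rknn_destroy(ctx);
  return ret == RKNN_SUCC;
}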