Commit 62e051e

[CVCUDA] CMake integration, vision processor CV-CUDA integration, PaddleClas support CV-CUDA (PaddlePaddle#1074)

* cvcuda resize

* cvcuda center crop

* cvcuda resize

* add a fdtensor in fdmat

* get cv mat and get tensor support gpu

* paddleclas cvcuda preprocessor

* fix compile err

* fix windows compile error

* rename reused to cached

* address comment

* remove debug code

* add comment

* add manager run

* use cuda and cuda used

* use cv cuda doc

* address comment

---------

Co-authored-by: Jason <[email protected]>
wang-xinyu and jiangjiajun authored Jan 30, 2023
1 parent 0c735e9 commit 62e051e
Showing 26 changed files with 814 additions and 216 deletions.
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -66,6 +66,7 @@ option(ENABLE_LITE_BACKEND "Whether to enable paddle lite backend." OFF)
option(ENABLE_VISION "Whether to enable vision models usage." OFF)
option(ENABLE_TEXT "Whether to enable text models usage." OFF)
option(ENABLE_FLYCV "Whether to enable flycv to boost image preprocess." OFF)
option(ENABLE_CVCUDA "Whether to enable NVIDIA CV-CUDA to boost image preprocess." OFF)
option(ENABLE_ENCRYPTION "Whether to enable ENCRYPTION." OFF)
option(WITH_ASCEND "Whether to compile for Huawei Ascend deploy." OFF)
option(WITH_TIMVX "Whether to compile for TIMVX deploy." OFF)
@@ -373,6 +374,12 @@ if(ENABLE_VISION)
include(${PROJECT_SOURCE_DIR}/cmake/flycv.cmake)
list(APPEND DEPEND_LIBS external_flycv)
endif()

if(ENABLE_CVCUDA)
include(${PROJECT_SOURCE_DIR}/cmake/cvcuda.cmake)
add_definitions(-DENABLE_CVCUDA)
list(APPEND DEPEND_LIBS nvcv_types cvcuda)
endif()
endif()

if(ENABLE_TEXT)
9 changes: 9 additions & 0 deletions FastDeploy.cmake.in
@@ -13,6 +13,7 @@ set(ENABLE_TRT_BACKEND @ENABLE_TRT_BACKEND@)
set(ENABLE_PADDLE2ONNX @ENABLE_PADDLE2ONNX@)
set(ENABLE_VISION @ENABLE_VISION@)
set(ENABLE_FLYCV @ENABLE_FLYCV@)
set(ENABLE_CVCUDA @ENABLE_CVCUDA@)
set(ENABLE_TEXT @ENABLE_TEXT@)
set(ENABLE_ENCRYPTION @ENABLE_ENCRYPTION@)
set(BUILD_ON_JETSON @BUILD_ON_JETSON@)
@@ -140,6 +141,7 @@ if(WITH_GPU)
message(FATAL_ERROR "[FastDeploy] Cannot find library cudart in ${CUDA_DIRECTORY}, Please define CUDA_DIRECTORY, e.g -DCUDA_DIRECTORY=/path/to/cuda")
endif()
list(APPEND FASTDEPLOY_LIBS ${CUDA_LIB})
list(APPEND FASTDEPLOY_INCS ${CUDA_DIRECTORY}/include)

if (ENABLE_TRT_BACKEND)
if(BUILD_ON_JETSON)
@@ -218,6 +220,12 @@ if(ENABLE_VISION)
endif()
endif()

if(ENABLE_CVCUDA)
find_library(CVCUDA_LIB cvcuda ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/cvcuda/lib NO_DEFAULT_PATH)
find_library(NVCV_TYPES_LIB nvcv_types ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/cvcuda/lib NO_DEFAULT_PATH)
list(APPEND FASTDEPLOY_LIBS ${CVCUDA_LIB} ${NVCV_TYPES_LIB})
endif()

endif()

if (ENABLE_TEXT)
@@ -288,6 +296,7 @@ if(ENABLE_OPENVINO_BACKEND)
endif()
message(STATUS " ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}")
message(STATUS " ENABLE_VISION : ${ENABLE_VISION}")
message(STATUS " ENABLE_CVCUDA : ${ENABLE_CVCUDA}")
message(STATUS " ENABLE_TEXT : ${ENABLE_TEXT}")
message(STATUS " ENABLE_ENCRYPTION : ${ENABLE_ENCRYPTION}")
if(WITH_GPU)
43 changes: 43 additions & 0 deletions cmake/cvcuda.cmake
@@ -0,0 +1,43 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if(NOT WITH_GPU)
message(FATAL_ERROR "ENABLE_CVCUDA is available on Linux and WITH_GPU=ON, but now WITH_GPU=OFF.")
endif()

if(APPLE OR ANDROID OR IOS OR WIN32)
message(FATAL_ERROR "Cannot enable CV-CUDA in mac/ios/android/windows os, please set -DENABLE_CVCUDA=OFF.")
endif()

if(NOT (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64"))
message(FATAL_ERROR "CV-CUDA only support x86_64.")
endif()

set(CVCUDA_LIB_URL https://github.com/CVCUDA/CV-CUDA/releases/download/v0.2.0-alpha/nvcv-lib-0.2.0_alpha-cuda11-x86_64-linux.tar.xz)
set(CVCUDA_LIB_FILENAME nvcv-lib-0.2.0_alpha-cuda11-x86_64-linux.tar.xz)
set(CVCUDA_DEV_URL https://github.com/CVCUDA/CV-CUDA/releases/download/v0.2.0-alpha/nvcv-dev-0.2.0_alpha-cuda11-x86_64-linux.tar.xz)
set(CVCUDA_DEV_FILENAME nvcv-dev-0.2.0_alpha-cuda11-x86_64-linux.tar.xz)

download_and_decompress(${CVCUDA_LIB_URL} ${CMAKE_CURRENT_BINARY_DIR}/${CVCUDA_LIB_FILENAME} ${THIRD_PARTY_PATH}/cvcuda)
download_and_decompress(${CVCUDA_DEV_URL} ${CMAKE_CURRENT_BINARY_DIR}/${CVCUDA_DEV_FILENAME} ${THIRD_PARTY_PATH}/cvcuda)

execute_process(COMMAND rm -rf ${THIRD_PARTY_PATH}/install/cvcuda)
execute_process(COMMAND mkdir -p ${THIRD_PARTY_PATH}/install/cvcuda)
execute_process(COMMAND cp -r ${THIRD_PARTY_PATH}/cvcuda/opt/nvidia/cvcuda0/lib/x86_64-linux-gnu/ ${THIRD_PARTY_PATH}/install/cvcuda/lib)
execute_process(COMMAND cp -r ${THIRD_PARTY_PATH}/cvcuda/opt/nvidia/cvcuda0/include/ ${THIRD_PARTY_PATH}/install/cvcuda/include)

link_directories(${THIRD_PARTY_PATH}/install/cvcuda/lib)
include_directories(${THIRD_PARTY_PATH}/install/cvcuda/include)

set(CMAKE_CXX_STANDARD 17)
39 changes: 39 additions & 0 deletions docs/cn/faq/use_cv_cuda.md
@@ -0,0 +1,39 @@
# 使用CV-CUDA/CUDA加速GPU端到端推理性能

FastDeploy集成了CV-CUDA来加速预/后处理,个别CV-CUDA不支持的算子使用了CUDA kernel的方式实现。

FastDeploy的Vision Processor模块对CV-CUDA的算子做了进一步的封装,用户不需要自己去调用CV-CUDA,
使用FastDeploy的模型推理接口即可利用CV-CUDA的加速能力。

FastDeploy的Vision Processor模块在集成CV-CUDA时,做了以下工作来方便用户的使用:
- GPU内存管理,缓存算子的输入、输出tensor,避免重复分配GPU内存
- CV-CUDA不支持的个别算子利用CUDA kernel实现
- CV-CUDA/CUDA不支持的算子可以fallback到OpenCV/FlyCV
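
The caching idea can be pictured with a minimal, hypothetical C++ sketch. `CachedGpuBuffer` and its members are illustrative names, not FastDeploy's actual internals; the point is simply that GPU memory is reallocated only when a request exceeds the current capacity, so repeated runs on same-sized images reuse one allocation.

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical sketch of per-operator GPU buffer caching (illustrative only).
struct CachedGpuBuffer {
  void* data = nullptr;
  size_t capacity = 0;

  // Reuse the current allocation when it is large enough;
  // otherwise free it and allocate a larger one.
  void Reserve(size_t bytes) {
    if (bytes <= capacity) return;
    if (data != nullptr) cudaFree(data);
    cudaMalloc(&data, bytes);
    capacity = bytes;
  }

  ~CachedGpuBuffer() {
    if (data != nullptr) cudaFree(data);
  }
};
```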

## Usage
Enable the CV-CUDA option when building FastDeploy:
```bash
# When building the C++ library, enable the CV-CUDA build option.
-DENABLE_CVCUDA=ON \

# When building the Python package, enable the CV-CUDA build option.
export ENABLE_CVCUDA=ON
```

Only model preprocessors that inherit from the ProcessorManager class can use CV-CUDA. Taking PaddleClasPreprocessor as an example:
```bash
# C++
# After creating the model, call the preprocessor's UseCuda interface to enable CV-CUDA/CUDA preprocessing.
# The first argument, enable_cv_cuda: true means use CV-CUDA, false means use CUDA only (fewer operators supported).
# The second argument is the GPU id; -1 means no GPU is specified and the current GPU is used.
model.GetPreprocessor().UseCuda(true, 0);

# Python
model.preprocessor.use_cuda(True, 0)
```
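
Putting the pieces together, a minimal end-to-end C++ sketch might look as follows. The model/config file names and the `RuntimeOption::UseGpu` call are assumptions based on the usual FastDeploy API, so treat this as a sketch rather than a verified sample.

```cpp
#include "fastdeploy/vision.h"

int main() {
  fastdeploy::RuntimeOption option;
  option.UseGpu(0);  // run inference on GPU 0 (assumed API)

  auto model = fastdeploy::vision::classification::PaddleClasModel(
      "inference.pdmodel", "inference.pdiparams", "inference_cls.yaml",
      option);  // hypothetical file names

  // Enable CV-CUDA/CUDA preprocessing on GPU 0 (see the parameter notes above).
  model.GetPreprocessor().UseCuda(true, 0);

  cv::Mat im = cv::imread("test.jpg");  // assumed input image path
  fastdeploy::vision::ClassifyResult result;
  model.Predict(im, &result);
  return 0;
}
```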

## Best practices

- If the first operator in the preprocessing pipeline is resize, decide case by case whether it should run on the GPU. When resize runs on the GPU but image decoding runs on the CPU, the full-size image must be copied to GPU memory, which is expensive; resizing on the CPU first and copying afterwards usually moves far less data. For example, a 1920x1080 BGR image is about 6 MB, while a 224x224 resized image is only about 150 KB.
@@ -70,7 +70,7 @@ def initialize(self, args):
yaml_path)
if args['model_instance_kind'] == 'GPU':
device_id = int(args['model_instance_device_id'])
self.preprocess_.use_gpu(device_id)
self.preprocess_.use_cuda(False, device_id)

def execute(self, requests):
"""`execute` must be implemented in every Python model. `execute`
150 changes: 88 additions & 62 deletions fastdeploy/vision/classification/ppcls/ppcls_pybind.cc
@@ -18,76 +18,102 @@ void BindPaddleClas(pybind11::module& m) {
pybind11::class_<vision::classification::PaddleClasPreprocessor>(
m, "PaddleClasPreprocessor")
.def(pybind11::init<std::string>())
.def("run", [](vision::classification::PaddleClasPreprocessor& self, std::vector<pybind11::array>& im_list) {
std::vector<vision::FDMat> images;
for (size_t i = 0; i < im_list.size(); ++i) {
images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
}
std::vector<FDTensor> outputs;
if (!self.Run(&images, &outputs)) {
throw std::runtime_error("Failed to preprocess the input data in PaddleClasPreprocessor.");
}
if (!self.WithGpu()) {
for (size_t i = 0; i < outputs.size(); ++i) {
outputs[i].StopSharing();
}
}
return outputs;
})
.def("use_gpu", [](vision::classification::PaddleClasPreprocessor& self, int gpu_id = -1) {
self.UseGpu(gpu_id);
})
.def("disable_normalize", [](vision::classification::PaddleClasPreprocessor& self) {
self.DisableNormalize();
})
.def("disable_permute", [](vision::classification::PaddleClasPreprocessor& self) {
self.DisablePermute();
});
.def("run",
[](vision::classification::PaddleClasPreprocessor& self,
std::vector<pybind11::array>& im_list) {
std::vector<vision::FDMat> images;
for (size_t i = 0; i < im_list.size(); ++i) {
images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
}
std::vector<FDTensor> outputs;
if (!self.Run(&images, &outputs)) {
throw std::runtime_error(
"Failed to preprocess the input data in "
"PaddleClasPreprocessor.");
}
if (!self.CudaUsed()) {
for (size_t i = 0; i < outputs.size(); ++i) {
outputs[i].StopSharing();
}
}
return outputs;
})
.def("use_cuda",
[](vision::classification::PaddleClasPreprocessor& self,
bool enable_cv_cuda = false,
int gpu_id = -1) { self.UseCuda(enable_cv_cuda, gpu_id); })
.def("disable_normalize",
[](vision::classification::PaddleClasPreprocessor& self) {
self.DisableNormalize();
})
.def("disable_permute",
[](vision::classification::PaddleClasPreprocessor& self) {
self.DisablePermute();
});

pybind11::class_<vision::classification::PaddleClasPostprocessor>(
m, "PaddleClasPostprocessor")
.def(pybind11::init<int>())
.def("run", [](vision::classification::PaddleClasPostprocessor& self, std::vector<FDTensor>& inputs) {
std::vector<vision::ClassifyResult> results;
if (!self.Run(inputs, &results)) {
throw std::runtime_error("Failed to postprocess the runtime result in PaddleClasPostprocessor.");
}
return results;
})
.def("run", [](vision::classification::PaddleClasPostprocessor& self, std::vector<pybind11::array>& input_array) {
std::vector<vision::ClassifyResult> results;
std::vector<FDTensor> inputs;
PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true);
if (!self.Run(inputs, &results)) {
throw std::runtime_error("Failed to postprocess the runtime result in PaddleClasPostprocessor.");
}
return results;
})
.def_property("topk", &vision::classification::PaddleClasPostprocessor::GetTopk, &vision::classification::PaddleClasPostprocessor::SetTopk);
.def("run",
[](vision::classification::PaddleClasPostprocessor& self,
std::vector<FDTensor>& inputs) {
std::vector<vision::ClassifyResult> results;
if (!self.Run(inputs, &results)) {
throw std::runtime_error(
"Failed to postprocess the runtime result in "
"PaddleClasPostprocessor.");
}
return results;
})
.def("run",
[](vision::classification::PaddleClasPostprocessor& self,
std::vector<pybind11::array>& input_array) {
std::vector<vision::ClassifyResult> results;
std::vector<FDTensor> inputs;
PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true);
if (!self.Run(inputs, &results)) {
throw std::runtime_error(
"Failed to postprocess the runtime result in "
"PaddleClasPostprocessor.");
}
return results;
})
.def_property("topk",
&vision::classification::PaddleClasPostprocessor::GetTopk,
&vision::classification::PaddleClasPostprocessor::SetTopk);

pybind11::class_<vision::classification::PaddleClasModel, FastDeployModel>(
m, "PaddleClasModel")
.def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
ModelFormat>())
.def("clone", [](vision::classification::PaddleClasModel& self) {
return self.Clone();
})
.def("predict", [](vision::classification::PaddleClasModel& self, pybind11::array& data) {
cv::Mat im = PyArrayToCvMat(data);
vision::ClassifyResult result;
self.Predict(im, &result);
return result;
})
.def("batch_predict", [](vision::classification::PaddleClasModel& self, std::vector<pybind11::array>& data) {
std::vector<cv::Mat> images;
for (size_t i = 0; i < data.size(); ++i) {
images.push_back(PyArrayToCvMat(data[i]));
}
std::vector<vision::ClassifyResult> results;
self.BatchPredict(images, &results);
return results;
})
.def_property_readonly("preprocessor", &vision::classification::PaddleClasModel::GetPreprocessor)
.def_property_readonly("postprocessor", &vision::classification::PaddleClasModel::GetPostprocessor);
.def("clone",
[](vision::classification::PaddleClasModel& self) {
return self.Clone();
})
.def("predict",
[](vision::classification::PaddleClasModel& self,
pybind11::array& data) {
cv::Mat im = PyArrayToCvMat(data);
vision::ClassifyResult result;
self.Predict(im, &result);
return result;
})
.def("batch_predict",
[](vision::classification::PaddleClasModel& self,
std::vector<pybind11::array>& data) {
std::vector<cv::Mat> images;
for (size_t i = 0; i < data.size(); ++i) {
images.push_back(PyArrayToCvMat(data[i]));
}
std::vector<vision::ClassifyResult> results;
self.BatchPredict(images, &results);
return results;
})
.def_property_readonly(
"preprocessor",
&vision::classification::PaddleClasModel::GetPreprocessor)
.def_property_readonly(
"postprocessor",
&vision::classification::PaddleClasModel::GetPostprocessor);
}
} // namespace fastdeploy
