Commit 62e051e

[CVCUDA] CMake integration, vision processor CV-CUDA integration, PaddleClas support CV-CUDA (PaddlePaddle#1074)

* cvcuda resize

* cvcuda center crop

* cvcuda resize

* add a fdtensor in fdmat

* get cv mat and get tensor support gpu

* paddleclas cvcuda preprocessor

* fix compile err

* fix windows compile error

* rename reused to cached

* address comment

* remove debug code

* add comment

* add manager run

* use cuda and cuda used

* use cv cuda doc

* address comment

---------

Co-authored-by: Jason <[email protected]>
wang-xinyu and jiangjiajun authored Jan 30, 2023
1 parent 0c735e9 commit 62e051e
Showing 26 changed files with 814 additions and 216 deletions.
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -66,6 +66,7 @@ option(ENABLE_LITE_BACKEND "Whether to enable paddle lite backend." OFF)
option(ENABLE_VISION "Whether to enable vision models usage." OFF)
option(ENABLE_TEXT "Whether to enable text models usage." OFF)
option(ENABLE_FLYCV "Whether to enable flycv to boost image preprocess." OFF)
option(ENABLE_CVCUDA "Whether to enable NVIDIA CV-CUDA to boost image preprocess." OFF)
option(ENABLE_ENCRYPTION "Whether to enable ENCRYPTION." OFF)
option(WITH_ASCEND "Whether to compile for Huawei Ascend deploy." OFF)
option(WITH_TIMVX "Whether to compile for TIMVX deploy." OFF)
@@ -373,6 +374,12 @@ if(ENABLE_VISION)
include(${PROJECT_SOURCE_DIR}/cmake/flycv.cmake)
list(APPEND DEPEND_LIBS external_flycv)
endif()

if(ENABLE_CVCUDA)
include(${PROJECT_SOURCE_DIR}/cmake/cvcuda.cmake)
add_definitions(-DENABLE_CVCUDA)
list(APPEND DEPEND_LIBS nvcv_types cvcuda)
endif()
endif()

if(ENABLE_TEXT)
9 changes: 9 additions & 0 deletions FastDeploy.cmake.in
@@ -13,6 +13,7 @@ set(ENABLE_TRT_BACKEND @ENABLE_TRT_BACKEND@)
set(ENABLE_PADDLE2ONNX @ENABLE_PADDLE2ONNX@)
set(ENABLE_VISION @ENABLE_VISION@)
set(ENABLE_FLYCV @ENABLE_FLYCV@)
set(ENABLE_CVCUDA @ENABLE_CVCUDA@)
set(ENABLE_TEXT @ENABLE_TEXT@)
set(ENABLE_ENCRYPTION @ENABLE_ENCRYPTION@)
set(BUILD_ON_JETSON @BUILD_ON_JETSON@)
@@ -140,6 +141,7 @@ if(WITH_GPU)
message(FATAL_ERROR "[FastDeploy] Cannot find library cudart in ${CUDA_DIRECTORY}, Please define CUDA_DIRECTORY, e.g -DCUDA_DIRECTORY=/path/to/cuda")
endif()
list(APPEND FASTDEPLOY_LIBS ${CUDA_LIB})
list(APPEND FASTDEPLOY_INCS ${CUDA_DIRECTORY}/include)

if (ENABLE_TRT_BACKEND)
if(BUILD_ON_JETSON)
@@ -218,6 +220,12 @@ if(ENABLE_VISION)
endif()
endif()

if(ENABLE_CVCUDA)
find_library(CVCUDA_LIB cvcuda ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/cvcuda/lib NO_DEFAULT_PATH)
find_library(NVCV_TYPES_LIB nvcv_types ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/cvcuda/lib NO_DEFAULT_PATH)
list(APPEND FASTDEPLOY_LIBS ${CVCUDA_LIB} ${NVCV_TYPES_LIB})
endif()

endif()

if (ENABLE_TEXT)
@@ -288,6 +296,7 @@ if(ENABLE_OPENVINO_BACKEND)
endif()
message(STATUS " ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}")
message(STATUS " ENABLE_VISION : ${ENABLE_VISION}")
message(STATUS " ENABLE_CVCUDA : ${ENABLE_CVCUDA}")
message(STATUS " ENABLE_TEXT : ${ENABLE_TEXT}")
message(STATUS " ENABLE_ENCRYPTION : ${ENABLE_ENCRYPTION}")
if(WITH_GPU)
43 changes: 43 additions & 0 deletions cmake/cvcuda.cmake
@@ -0,0 +1,43 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if(NOT WITH_GPU)
message(FATAL_ERROR "ENABLE_CVCUDA is available on Linux and WITH_GPU=ON, but now WITH_GPU=OFF.")
endif()

if(APPLE OR ANDROID OR IOS OR WIN32)
message(FATAL_ERROR "Cannot enable CV-CUDA in mac/ios/android/windows os, please set -DENABLE_CVCUDA=OFF.")
endif()

if(NOT (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64"))
message(FATAL_ERROR "CV-CUDA only support x86_64.")
endif()

set(CVCUDA_LIB_URL https://github.com/CVCUDA/CV-CUDA/releases/download/v0.2.0-alpha/nvcv-lib-0.2.0_alpha-cuda11-x86_64-linux.tar.xz)
set(CVCUDA_LIB_FILENAME nvcv-lib-0.2.0_alpha-cuda11-x86_64-linux.tar.xz)
set(CVCUDA_DEV_URL https://github.com/CVCUDA/CV-CUDA/releases/download/v0.2.0-alpha/nvcv-dev-0.2.0_alpha-cuda11-x86_64-linux.tar.xz)
set(CVCUDA_DEV_FILENAME nvcv-dev-0.2.0_alpha-cuda11-x86_64-linux.tar.xz)

download_and_decompress(${CVCUDA_LIB_URL} ${CMAKE_CURRENT_BINARY_DIR}/${CVCUDA_LIB_FILENAME} ${THIRD_PARTY_PATH}/cvcuda)
download_and_decompress(${CVCUDA_DEV_URL} ${CMAKE_CURRENT_BINARY_DIR}/${CVCUDA_DEV_FILENAME} ${THIRD_PARTY_PATH}/cvcuda)

execute_process(COMMAND rm -rf ${THIRD_PARTY_PATH}/install/cvcuda)
execute_process(COMMAND mkdir -p ${THIRD_PARTY_PATH}/install/cvcuda)
execute_process(COMMAND cp -r ${THIRD_PARTY_PATH}/cvcuda/opt/nvidia/cvcuda0/lib/x86_64-linux-gnu/ ${THIRD_PARTY_PATH}/install/cvcuda/lib)
execute_process(COMMAND cp -r ${THIRD_PARTY_PATH}/cvcuda/opt/nvidia/cvcuda0/include/ ${THIRD_PARTY_PATH}/install/cvcuda/include)

link_directories(${THIRD_PARTY_PATH}/install/cvcuda/lib)
include_directories(${THIRD_PARTY_PATH}/install/cvcuda/include)

set(CMAKE_CXX_STANDARD 17)
39 changes: 39 additions & 0 deletions docs/cn/faq/use_cv_cuda.md
@@ -0,0 +1,39 @@
# 使用CV-CUDA/CUDA加速GPU端到端推理性能

FastDeploy集成了CV-CUDA来加速预/后处理,个别CV-CUDA不支持的算子使用了CUDA kernel的方式实现。

FastDeploy的Vision Processor模块对CV-CUDA的算子做了进一步的封装,用户不需要自己去调用CV-CUDA,
使用FastDeploy的模型推理接口即可利用CV-CUDA的加速能力。

FastDeploy的Vision Processor模块在集成CV-CUDA时,做了以下工作来方便用户的使用:
- GPU内存管理,缓存算子的输入、输出tensor,避免重复分配GPU内存
- CV-CUDA不支持的个别算子利用CUDA kernel实现
- CV-CUDA/CUDA不支持的算子可以fallback到OpenCV/FlyCV
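
The caching idea can be pictured with a minimal, hypothetical C++ sketch. `CachedGpuBuffer` and its members are illustrative names, not FastDeploy's actual internals; the point is simply that GPU memory is reallocated only when a request exceeds the current capacity, so repeated runs on same-sized images reuse one allocation.

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical sketch of per-operator GPU buffer caching (illustrative only).
struct CachedGpuBuffer {
  void* data = nullptr;
  size_t capacity = 0;

  // Reuse the current allocation when it is large enough;
  // otherwise free it and allocate a larger one.
  void Reserve(size_t bytes) {
    if (bytes <= capacity) return;
    if (data != nullptr) cudaFree(data);
    cudaMalloc(&data, bytes);
    capacity = bytes;
  }

  ~CachedGpuBuffer() {
    if (data != nullptr) cudaFree(data);
  }
};
```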

## Usage
Enable the CV-CUDA option when building FastDeploy:
```bash
# When building the C++ library, enable the CV-CUDA build option.
-DENABLE_CVCUDA=ON \

# When building the Python package, enable the CV-CUDA build option.
export ENABLE_CVCUDA=ON
```

Only model preprocessors that inherit from the ProcessorManager class can use CV-CUDA. Taking PaddleClasPreprocessor as an example:
```bash
# C++
# After creating the model, call the preprocessor's UseCuda interface to enable CV-CUDA/CUDA preprocessing.
# The first argument, enable_cv_cuda: true means use CV-CUDA, false means use CUDA only (fewer operators supported).
# The second argument is the GPU id; -1 means no GPU is specified and the current GPU is used.
model.GetPreprocessor().UseCuda(true, 0);

# Python
model.preprocessor.use_cuda(True, 0)
```
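
Putting the pieces together, a minimal end-to-end C++ sketch might look as follows. The model/config file names and the `RuntimeOption::UseGpu` call are assumptions based on the usual FastDeploy API, so treat this as a sketch rather than a verified sample.

```cpp
#include "fastdeploy/vision.h"

int main() {
  fastdeploy::RuntimeOption option;
  option.UseGpu(0);  // run inference on GPU 0 (assumed API)

  auto model = fastdeploy::vision::classification::PaddleClasModel(
      "inference.pdmodel", "inference.pdiparams", "inference_cls.yaml",
      option);  // hypothetical file names

  // Enable CV-CUDA/CUDA preprocessing on GPU 0 (see the parameter notes above).
  model.GetPreprocessor().UseCuda(true, 0);

  cv::Mat im = cv::imread("test.jpg");  // assumed input image path
  fastdeploy::vision::ClassifyResult result;
  model.Predict(im, &result);
  return 0;
}
```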

## Best practices

- If the first operator in the preprocessing pipeline is resize, decide case by case whether it should run on the GPU. When resize runs on the GPU but image decoding runs on the CPU, the full-size image must be copied to GPU memory, which is expensive; resizing on the CPU first and copying afterwards usually moves far less data. For example, a 1920x1080 BGR image is about 6 MB, while a 224x224 resized image is only about 150 KB.
@@ -70,7 +70,7 @@ def initialize(self, args):
yaml_path)
if args['model_instance_kind'] == 'GPU':
device_id = int(args['model_instance_device_id'])
self.preprocess_.use_gpu(device_id)
self.preprocess_.use_cuda(False, device_id)

def execute(self, requests):
"""`execute` must be implemented in every Python model. `execute`
150 changes: 88 additions & 62 deletions fastdeploy/vision/classification/ppcls/ppcls_pybind.cc
@@ -18,76 +18,102 @@ void BindPaddleClas(pybind11::module& m) {
pybind11::class_<vision::classification::PaddleClasPreprocessor>(
m, "PaddleClasPreprocessor")
.def(pybind11::init<std::string>())
.def("run", [](vision::classification::PaddleClasPreprocessor& self, std::vector<pybind11::array>& im_list) {
std::vector<vision::FDMat> images;
for (size_t i = 0; i < im_list.size(); ++i) {
images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
}
std::vector<FDTensor> outputs;
if (!self.Run(&images, &outputs)) {
throw std::runtime_error("Failed to preprocess the input data in PaddleClasPreprocessor.");
}
if (!self.WithGpu()) {
for (size_t i = 0; i < outputs.size(); ++i) {
outputs[i].StopSharing();
}
}
return outputs;
})
.def("use_gpu", [](vision::classification::PaddleClasPreprocessor& self, int gpu_id = -1) {
self.UseGpu(gpu_id);
})
.def("disable_normalize", [](vision::classification::PaddleClasPreprocessor& self) {
self.DisableNormalize();
})
.def("disable_permute", [](vision::classification::PaddleClasPreprocessor& self) {
self.DisablePermute();
});
.def("run",
[](vision::classification::PaddleClasPreprocessor& self,
std::vector<pybind11::array>& im_list) {
std::vector<vision::FDMat> images;
for (size_t i = 0; i < im_list.size(); ++i) {
images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i])));
}
std::vector<FDTensor> outputs;
if (!self.Run(&images, &outputs)) {
throw std::runtime_error(
"Failed to preprocess the input data in "
"PaddleClasPreprocessor.");
}
if (!self.CudaUsed()) {
for (size_t i = 0; i < outputs.size(); ++i) {
outputs[i].StopSharing();
}
}
return outputs;
})
.def("use_cuda",
[](vision::classification::PaddleClasPreprocessor& self,
bool enable_cv_cuda = false,
int gpu_id = -1) { self.UseCuda(enable_cv_cuda, gpu_id); })
.def("disable_normalize",
[](vision::classification::PaddleClasPreprocessor& self) {
self.DisableNormalize();
})
.def("disable_permute",
[](vision::classification::PaddleClasPreprocessor& self) {
self.DisablePermute();
});

pybind11::class_<vision::classification::PaddleClasPostprocessor>(
m, "PaddleClasPostprocessor")
.def(pybind11::init<int>())
.def("run", [](vision::classification::PaddleClasPostprocessor& self, std::vector<FDTensor>& inputs) {
std::vector<vision::ClassifyResult> results;
if (!self.Run(inputs, &results)) {
throw std::runtime_error("Failed to postprocess the runtime result in PaddleClasPostprocessor.");
}
return results;
})
.def("run", [](vision::classification::PaddleClasPostprocessor& self, std::vector<pybind11::array>& input_array) {
std::vector<vision::ClassifyResult> results;
std::vector<FDTensor> inputs;
PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true);
if (!self.Run(inputs, &results)) {
throw std::runtime_error("Failed to postprocess the runtime result in PaddleClasPostprocessor.");
}
return results;
})
.def_property("topk", &vision::classification::PaddleClasPostprocessor::GetTopk, &vision::classification::PaddleClasPostprocessor::SetTopk);
.def("run",
[](vision::classification::PaddleClasPostprocessor& self,
std::vector<FDTensor>& inputs) {
std::vector<vision::ClassifyResult> results;
if (!self.Run(inputs, &results)) {
throw std::runtime_error(
"Failed to postprocess the runtime result in "
"PaddleClasPostprocessor.");
}
return results;
})
.def("run",
[](vision::classification::PaddleClasPostprocessor& self,
std::vector<pybind11::array>& input_array) {
std::vector<vision::ClassifyResult> results;
std::vector<FDTensor> inputs;
PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true);
if (!self.Run(inputs, &results)) {
throw std::runtime_error(
"Failed to postprocess the runtime result in "
"PaddleClasPostprocessor.");
}
return results;
})
.def_property("topk",
&vision::classification::PaddleClasPostprocessor::GetTopk,
&vision::classification::PaddleClasPostprocessor::SetTopk);

pybind11::class_<vision::classification::PaddleClasModel, FastDeployModel>(
m, "PaddleClasModel")
.def(pybind11::init<std::string, std::string, std::string, RuntimeOption,
ModelFormat>())
.def("clone", [](vision::classification::PaddleClasModel& self) {
return self.Clone();
})
.def("predict", [](vision::classification::PaddleClasModel& self, pybind11::array& data) {
cv::Mat im = PyArrayToCvMat(data);
vision::ClassifyResult result;
self.Predict(im, &result);
return result;
})
.def("batch_predict", [](vision::classification::PaddleClasModel& self, std::vector<pybind11::array>& data) {
std::vector<cv::Mat> images;
for (size_t i = 0; i < data.size(); ++i) {
images.push_back(PyArrayToCvMat(data[i]));
}
std::vector<vision::ClassifyResult> results;
self.BatchPredict(images, &results);
return results;
})
.def_property_readonly("preprocessor", &vision::classification::PaddleClasModel::GetPreprocessor)
.def_property_readonly("postprocessor", &vision::classification::PaddleClasModel::GetPostprocessor);
.def("clone",
[](vision::classification::PaddleClasModel& self) {
return self.Clone();
})
.def("predict",
[](vision::classification::PaddleClasModel& self,
pybind11::array& data) {
cv::Mat im = PyArrayToCvMat(data);
vision::ClassifyResult result;
self.Predict(im, &result);
return result;
})
.def("batch_predict",
[](vision::classification::PaddleClasModel& self,
std::vector<pybind11::array>& data) {
std::vector<cv::Mat> images;
for (size_t i = 0; i < data.size(); ++i) {
images.push_back(PyArrayToCvMat(data[i]));
}
std::vector<vision::ClassifyResult> results;
self.BatchPredict(images, &results);
return results;
})
.def_property_readonly(
"preprocessor",
&vision::classification::PaddleClasModel::GetPreprocessor)
.def_property_readonly(
"postprocessor",
&vision::classification::PaddleClasModel::GetPostprocessor);
}
} // namespace fastdeploy
