Skip to content

Commit

Permalink
Move GPU post-processing separately to cuda_postprocess.cu (wang-xinyu#1331)
Browse files Browse the repository at this point in the history

* Move GPU post-processing separately to cuda_postprocess.cu

* Update postprocess.cpp

* Update postprocess.cu

---------

Co-authored-by: wang-xinyu <[email protected]>
Co-authored-by: Wang Xinyu <[email protected]>
  • Loading branch information
3 people authored Jul 18, 2023
1 parent e9a972e commit ae3bd5e
Show file tree
Hide file tree
Showing 9 changed files with 620 additions and 634 deletions.
10 changes: 5 additions & 5 deletions yolov8/include/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
#include <string>
#include <assert.h>

nvinfer1::IHostMemory* buildEngineYolov8n(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8n(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);

nvinfer1::IHostMemory* buildEngineYolov8s(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8s(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);

nvinfer1::IHostMemory* buildEngineYolov8m(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8m(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);

nvinfer1::IHostMemory* buildEngineYolov8l(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8l(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);

nvinfer1::IHostMemory* buildEngineYolov8x(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8x(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
12 changes: 10 additions & 2 deletions yolov8/include/postprocess.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "types.h"
#include "NvInfer.h"
#include <opencv2/opencv.hpp>

cv::Rect get_rect(cv::Mat& img, float bbox[4]);
Expand All @@ -9,6 +10,13 @@ void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nm

void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);

void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox(std::vector<cv::Mat> &img_batch, std::vector<std::vector<Detection>> &res_batch);

void batch_process(std::vector<std::vector<Detection>> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector<cv::Mat>& img_batch);

void process_decode_ptr_host(std::vector<Detection> &res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count);

void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream);

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector<cv::Mat>& img_batch);
9 changes: 0 additions & 9 deletions yolov8/include/preprocess.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
#include <map>


struct AffineMatrix {
float value[6];
};

const int bbox_element = sizeof(AffineMatrix) / sizeof(float)+1; // left, top, right, bottom, confidence, class, keepflag

void cuda_preprocess_init(int max_image_size);

void cuda_preprocess_destroy();
Expand All @@ -20,6 +14,3 @@ void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, in

void cuda_batch_preprocess(std::vector<cv::Mat> &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream);

void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream);

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
5 changes: 5 additions & 0 deletions yolov8/include/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ struct alignas(float) Detection {
float class_id;
};

struct AffineMatrix {
float value[6];
};

const int bbox_element = sizeof(AffineMatrix) / sizeof(float)+1; // left, top, right, bottom, confidence, class, keepflag
29 changes: 15 additions & 14 deletions yolov8/main.cpp
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@

#include <iostream>
#include <fstream>
#include <opencv2/opencv.hpp>
#include "model.h"
#include "utils.h"
#include "preprocess.h"
#include "postprocess.h"
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include <fstream>
#include "logging.h"

Logger gLogger;
using namespace nvinfer1;
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(const int &kBatchSize, std::string &wts_name, std::string &engine_name, std::string &sub_type) {
void serialize_engine(std::string &wts_name, std::string &engine_name, std::string &sub_type) {
IBuilder *builder = createInferBuilder(gLogger);
IBuilderConfig *config = builder->createBuilderConfig();
IHostMemory *serialized_engine = nullptr;

if (sub_type == "n") {
serialized_engine = buildEngineYolov8n(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8n(builder, config, DataType::kFLOAT, wts_name);
} else if (sub_type == "s") {
serialized_engine = buildEngineYolov8s(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8s(builder, config, DataType::kFLOAT, wts_name);
} else if (sub_type == "m") {
serialized_engine = buildEngineYolov8m(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8m(builder, config, DataType::kFLOAT, wts_name);
} else if (sub_type == "l") {
serialized_engine = buildEngineYolov8l(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8l(builder, config, DataType::kFLOAT, wts_name);
} else if (sub_type == "x") {
serialized_engine = buildEngineYolov8x(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8x(builder, config, DataType::kFLOAT, wts_name);
}

assert(serialized_engine);
Expand Down Expand Up @@ -88,12 +89,12 @@ void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **ou
}
}

void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, int batchSize, float* decode_ptr_host, float* decode_ptr_device, int batchSize_in, int model_bboxes, std::string cuda_post_process) {
void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
// infer on the batch asynchronously, and DMA output back to host
auto start = std::chrono::system_clock::now();
context.enqueue(batchSize, buffers, stream, nullptr);
context.enqueue(batchsize, buffers, stream, nullptr);
if (cuda_post_process == "c") {
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,stream));
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,stream));
auto end = std::chrono::system_clock::now();
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
} else if (cuda_post_process == "g") {
Expand Down Expand Up @@ -143,7 +144,7 @@ int main(int argc, char **argv) {

// Create a model using the API directly and serialize it to a file
if (!wts_name.empty()) {
serialize_engine(kBatchSize, wts_name, engine_name, sub_type);
serialize_engine(wts_name, engine_name, sub_type);
return 0;
}

Expand Down Expand Up @@ -185,7 +186,7 @@ int main(int argc, char **argv) {
// Preprocess
cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
// Run inference
infer(*context, stream, (void **)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, img_batch.size(), model_bboxes, cuda_post_process);
infer(*context, stream, (void **)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);
std::vector<std::vector<Detection>> res_batch;
if (cuda_post_process == "c") {
// NMS
Expand Down
Loading

0 comments on commit ae3bd5e

Please sign in to comment.