Skip to content

Commit

Permalink
Move GPU post-processing separately to cuda_postprocess.cu (wang-xinyu#1331)
Browse files Browse the repository at this point in the history

* Move GPU post-processing separately to cuda_postprocess.cu

* Update postprocess.cpp

* Update postprocess.cu

---------

Co-authored-by: wang-xinyu <[email protected]>
Co-authored-by: Wang Xinyu <[email protected]>
  • Loading branch information
3 people authored Jul 18, 2023
1 parent e9a972e commit ae3bd5e
Show file tree
Hide file tree
Showing 9 changed files with 620 additions and 634 deletions.
10 changes: 5 additions & 5 deletions yolov8/include/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
#include <string>
#include <assert.h>

nvinfer1::IHostMemory* buildEngineYolov8n(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8n(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);

nvinfer1::IHostMemory* buildEngineYolov8s(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8s(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);

nvinfer1::IHostMemory* buildEngineYolov8m(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8m(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);

nvinfer1::IHostMemory* buildEngineYolov8l(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8l(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);

nvinfer1::IHostMemory* buildEngineYolov8x(const int& batchsize, nvinfer1::IBuilder* builder,
nvinfer1::IHostMemory* buildEngineYolov8x(nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
12 changes: 10 additions & 2 deletions yolov8/include/postprocess.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "types.h"
#include "NvInfer.h"
#include <opencv2/opencv.hpp>

cv::Rect get_rect(cv::Mat& img, float bbox[4]);
Expand All @@ -9,6 +10,13 @@ void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nm

void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);

void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox(std::vector<cv::Mat> &img_batch, std::vector<std::vector<Detection>> &res_batch);

void batch_process(std::vector<std::vector<Detection>> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector<cv::Mat>& img_batch);

void process_decode_ptr_host(std::vector<Detection> &res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count);

void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream);

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector<cv::Mat>& img_batch);
9 changes: 0 additions & 9 deletions yolov8/include/preprocess.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
#include <map>


struct AffineMatrix {
float value[6];
};

const int bbox_element = sizeof(AffineMatrix) / sizeof(float)+1; // left, top, right, bottom, confidence, class, keepflag

void cuda_preprocess_init(int max_image_size);

void cuda_preprocess_destroy();
Expand All @@ -20,6 +14,3 @@ void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, in

void cuda_batch_preprocess(std::vector<cv::Mat> &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream);

void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream);

void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
5 changes: 5 additions & 0 deletions yolov8/include/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ struct alignas(float) Detection {
float class_id;
};

struct AffineMatrix {
float value[6];
};

const int bbox_element = sizeof(AffineMatrix) / sizeof(float)+1; // left, top, right, bottom, confidence, class, keepflag
29 changes: 15 additions & 14 deletions yolov8/main.cpp
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@

#include <iostream>
#include <fstream>
#include <opencv2/opencv.hpp>
#include "model.h"
#include "utils.h"
#include "preprocess.h"
#include "postprocess.h"
#include <iostream>
#include <opencv2/opencv.hpp>
#include "cuda_utils.h"
#include <fstream>
#include "logging.h"

Logger gLogger;
using namespace nvinfer1;
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

void serialize_engine(const int &kBatchSize, std::string &wts_name, std::string &engine_name, std::string &sub_type) {
void serialize_engine(std::string &wts_name, std::string &engine_name, std::string &sub_type) {
IBuilder *builder = createInferBuilder(gLogger);
IBuilderConfig *config = builder->createBuilderConfig();
IHostMemory *serialized_engine = nullptr;

if (sub_type == "n") {
serialized_engine = buildEngineYolov8n(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8n(builder, config, DataType::kFLOAT, wts_name);
} else if (sub_type == "s") {
serialized_engine = buildEngineYolov8s(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8s(builder, config, DataType::kFLOAT, wts_name);
} else if (sub_type == "m") {
serialized_engine = buildEngineYolov8m(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8m(builder, config, DataType::kFLOAT, wts_name);
} else if (sub_type == "l") {
serialized_engine = buildEngineYolov8l(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8l(builder, config, DataType::kFLOAT, wts_name);
} else if (sub_type == "x") {
serialized_engine = buildEngineYolov8x(kBatchSize, builder, config, DataType::kFLOAT, wts_name);
serialized_engine = buildEngineYolov8x(builder, config, DataType::kFLOAT, wts_name);
}

assert(serialized_engine);
Expand Down Expand Up @@ -88,12 +89,12 @@ void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **ou
}
}

void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, int batchSize, float* decode_ptr_host, float* decode_ptr_device, int batchSize_in, int model_bboxes, std::string cuda_post_process) {
void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
// infer on the batch asynchronously, and DMA output back to host
auto start = std::chrono::system_clock::now();
context.enqueue(batchSize, buffers, stream, nullptr);
context.enqueue(batchsize, buffers, stream, nullptr);
if (cuda_post_process == "c") {
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,stream));
CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,stream));
auto end = std::chrono::system_clock::now();
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
} else if (cuda_post_process == "g") {
Expand Down Expand Up @@ -143,7 +144,7 @@ int main(int argc, char **argv) {

// Create a model using the API directly and serialize it to a file
if (!wts_name.empty()) {
serialize_engine(kBatchSize, wts_name, engine_name, sub_type);
serialize_engine(wts_name, engine_name, sub_type);
return 0;
}

Expand Down Expand Up @@ -185,7 +186,7 @@ int main(int argc, char **argv) {
// Preprocess
cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
// Run inference
infer(*context, stream, (void **)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, img_batch.size(), model_bboxes, cuda_post_process);
infer(*context, stream, (void **)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);
std::vector<std::vector<Detection>> res_batch;
if (cuda_post_process == "c") {
// NMS
Expand Down
Loading

0 comments on commit ae3bd5e

Please sign in to comment.