finish Yolov4 code

taxuezcy · Oct 9, 2020 · fd95ad2 · fd95ad2
1 parent 19d727a
commit fd95ad2
Show file tree

Hide file tree

Showing 18 changed files with 2,631 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,4 @@
 *.pt
 *symbol.json
 *.params
+*.weights
diff --git a/Yolov4/CMakeLists.txt b/Yolov4/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 3.5)
+
+project(Yolov4_trt)
+
+set(CMAKE_CXX_STANDARD 14)
+
+# CUDA
+find_package(CUDA REQUIRED)
+message(STATUS "Find CUDA include at ${CUDA_INCLUDE_DIRS}")
+message(STATUS "Find CUDA libraries: ${CUDA_LIBRARIES}")
+
+# TensorRT
+set(TENSORRT_ROOT /usr/src/tensorrt/)
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+        HINTS ${TENSORRT_ROOT} PATH_SUFFIXES include/)
+message(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}")
+find_library(TENSORRT_LIBRARY_INFER nvinfer
+        HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
+        PATH_SUFFIXES lib lib64 lib/x64)
+find_library(TENSORRT_LIBRARY_ONNXPARSER nvonnxparser
+        HINTS  ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
+        PATH_SUFFIXES lib lib64 lib/x64)
+set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_ONNXPARSER})
+message(STATUS "Find TensorRT libs: ${TENSORRT_LIBRARY}")
+
+# OpenCV
+find_package(OpenCV REQUIRED)
+message(STATUS "Find OpenCV include at ${OpenCV_INCLUDE_DIRS}")
+message(STATUS "Find OpenCV libraries: ${OpenCV_LIBRARIES}")
+
+set(COMMON_INCLUDE ../includes/common)
+set(YAML_INCLUDE ../includes/yaml-cpp/include)
+set(YAML_LIB_DIR ../includes/yaml-cpp/libs)
+
+include_directories(${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${OpenCV_INCLUDE_DIRS} ${COMMON_INCLUDE} ${YAML_INCLUDE})
+link_directories(${YAML_LIB_DIR})
+
+add_executable(Yolov4_trt main.cpp Yolov4.cpp)
+target_link_libraries(Yolov4_trt ${OpenCV_LIBRARIES} ${CUDA_LIBRARIES} ${TENSORRT_LIBRARY} yaml-cpp)
diff --git a/Yolov4/README.md b/Yolov4/README.md
@@ -0,0 +1,32 @@
+# YOLOv4 Network darknet=>ONNX=>TensorRT
+
+## 1.Reference
+- **YOLOv4:** [YOLOv4: Optimal Speed and Accuracy of Object Detection](https://arxiv.org/abs/2004.10934)
+- **darknet:** [https://github.com/AlexeyAB/darknet](https://github.com/AlexeyAB/darknet)
+- get YOLOv4 weights from here: [yolov4.weights](https://drive.google.com/open?id=1sWNozS0emz7bmQTUWDLvsubLGnCwUiIS)
+
+## 2.Export ONNX Model
+```
+python3 export_onnx.py
+```
+
+## 3.Build Yolov4_trt Project
+```
+mkdir build && cd build
+cmake ..
+make -j
+```
+
+## 4.Run Yolov4_trt
+```
+./Yolov4_trt ../config.yaml ../samples
+```
+
+## 5.Inference Time Benchmark
+resolution|darknet|TensorRT|
+---|---|---
+416x416||4.2ms
+608x608||6.3ms
+
+## 6.Results
+![](prediction.jpg)
diff --git a/Yolov4/Yolov4.cpp b/Yolov4/Yolov4.cpp
@@ -0,0 +1,284 @@
+#include "Yolov4.h"
+#include "yaml-cpp/yaml.h"
+#include "common.hpp"
+
+YOLOv4::YOLOv4(const std::string &config_file) {
+    YAML::Node root = YAML::LoadFile(config_file);
+    YAML::Node config = root["seresnext"];
+    onnx_file = config["onnx_file"].as<std::string>();
+    engine_file = config["engine_file"].as<std::string>();
+    labels_file = config["labels_file"].as<std::string>();
+    BATCH_SIZE = config["BATCH_SIZE"].as<int>();
+    INPUT_CHANNEL = config["INPUT_CHANNEL"].as<int>();
+    IMAGE_WIDTH = config["IMAGE_WIDTH"].as<int>();
+    IMAGE_HEIGHT = config["IMAGE_HEIGHT"].as<int>();
+    obj_threshold = config["obj_threshold"].as<float>();
+    nms_threshold = config["nms_threshold"].as<float>();
+    stride = config["stride"].as<std::vector<int>>();
+    anchors = config["anchors"].as<std::vector<std::vector<int>>>();
+    coco_labels = readCOCOLabel(labels_file);
+    CATEGORY = coco_labels.size();
+    grids = {
+            {3, int(IMAGE_WIDTH / stride[0]), int(IMAGE_HEIGHT / stride[0])},
+            {3, int(IMAGE_WIDTH / stride[1]), int(IMAGE_HEIGHT / stride[1])},
+            {3, int(IMAGE_WIDTH / stride[2]), int(IMAGE_HEIGHT / stride[2])},
+    };
+    class_colors.resize(CATEGORY);
+    srand((int) time(nullptr));
+    for (cv::Scalar &class_color : class_colors)
+        class_color = cv::Scalar(rand() % 255, rand() % 255, rand() % 255);
+}
+
+YOLOv4::~YOLOv4() = default;
+
+void YOLOv4::LoadEngine() {
+    // create and load engine
+    std::fstream existEngine;
+    existEngine.open(engine_file, std::ios::in);
+    if (existEngine) {
+        readTrtFile(engine_file, engine);
+        assert(engine != nullptr);
+    } else {
+        onnxToTRTModel(onnx_file, engine_file, engine, BATCH_SIZE);
+        assert(engine != nullptr);
+    }
+}
+
+bool YOLOv4::InferenceFolder(const std::string &folder_name) {
+    std::vector<std::string> sample_images = readFolder(folder_name);
+    //get context
+    assert(engine != nullptr);
+    context = engine->createExecutionContext();
+    assert(context != nullptr);
+
+    //get buffers
+    assert(engine->getNbBindings() == 2);
+    void *buffers[2];
+    std::vector<int64_t> bufferSize;
+    int nbBindings = engine->getNbBindings();
+    bufferSize.resize(nbBindings);
+
+    for (int i = 0; i < nbBindings; ++i) {
+        nvinfer1::Dims dims = engine->getBindingDimensions(i);
+        nvinfer1::DataType dtype = engine->getBindingDataType(i);
+        int64_t totalSize = volume(dims) * 1 * getElementSize(dtype);
+        bufferSize[i] = totalSize;
+        std::cout << "binding" << i << ": " << totalSize << std::endl;
+        cudaMalloc(&buffers[i], totalSize);
+    }
+
+    //get stream
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    int outSize = bufferSize[1] / sizeof(float) / BATCH_SIZE;
+
+    EngineInference(sample_images, outSize, buffers, bufferSize, stream);
+
+    // release the stream and the buffers
+    cudaStreamDestroy(stream);
+    cudaFree(buffers[0]);
+    cudaFree(buffers[1]);
+
+    // destroy the engine
+    context->destroy();
+    engine->destroy();
+}
+
+void YOLOv4::EngineInference(const std::vector<std::string> &image_list, const int &outSize, void **buffers,
+                             const std::vector<int64_t> &bufferSize, cudaStream_t stream) {
+    int index = 0;
+    int batch_id = 0;
+    std::vector<cv::Mat> vec_Mat(BATCH_SIZE);
+    std::vector<std::string> vec_name(BATCH_SIZE);
+    float total_time = 0;
+    for (const std::string &image_name : image_list)
+    {
+        index++;
+        std::cout << "Processing: " << image_name << std::endl;
+        cv::Mat src_img = cv::imread(image_name);
+        if (src_img.data)
+        {
+            cv::cvtColor(src_img, src_img, cv::COLOR_BGR2RGB);
+            vec_Mat[batch_id] = src_img.clone();
+            vec_name[batch_id] = image_name;
+            batch_id++;
+        }
+        if (batch_id == BATCH_SIZE or index == image_list.size())
+        {
+            auto t_start_pre = std::chrono::high_resolution_clock::now();
+            std::cout << "prepareImage" << std::endl;
+            std::vector<float>curInput = prepareImage(vec_Mat);
+            auto t_end_pre = std::chrono::high_resolution_clock::now();
+            float total_pre = std::chrono::duration<float, std::milli>(t_end_pre - t_start_pre).count();
+            std::cout << "prepare image take: " << total_pre << " ms." << std::endl;
+            total_time += total_pre;
+            batch_id = 0;
+            if (!curInput.data()) {
+                std::cout << "prepare images ERROR!" << std::endl;
+                continue;
+            }
+            // DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
+            std::cout << "host2device" << std::endl;
+            cudaMemcpyAsync(buffers[0], curInput.data(), bufferSize[0], cudaMemcpyHostToDevice, stream);
+
+            // do inference
+            std::cout << "execute" << std::endl;
+            auto t_start = std::chrono::high_resolution_clock::now();
+            context->execute(BATCH_SIZE, buffers);
+            auto t_end = std::chrono::high_resolution_clock::now();
+            float total_inf = std::chrono::duration<float, std::milli>(t_end - t_start).count();
+            std::cout << "Inference take: " << total_inf << " ms." << std::endl;
+            total_time += total_inf;
+            std::cout << "execute success" << std::endl;
+            std::cout << "device2host" << std::endl;
+            std::cout << "post process" << std::endl;
+            auto r_start = std::chrono::high_resolution_clock::now();
+//            float out[outSize * BATCH_SIZE * 2];
+            float *out = new float[outSize * BATCH_SIZE];
+            cudaMemcpyAsync(out, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream);
+            cudaStreamSynchronize(stream);
+            auto boxes = postProcess(vec_Mat, out, outSize);
+            auto r_end = std::chrono::high_resolution_clock::now();
+            float total_res = std::chrono::duration<float, std::milli>(r_end - r_start).count();
+            std::cout << "Post process take: " << total_res << " ms." << std::endl;
+            total_time += total_res;
+            for (int i = 0; i < (int)vec_Mat.size(); i++)
+            {
+                auto org_img = vec_Mat[i];
+                auto rects = boxes[i];
+                cv::cvtColor(org_img, org_img, cv::COLOR_BGR2RGB);
+                for(const auto &rect : rects)
+                {
+                    char t[256];
+                    sprintf(t, "%.2f", rect.prob);
+                    std::string name = coco_labels[rect.classes] + "-" + t;
+                    cv::putText(org_img, name, cv::Point(rect.x - rect.w / 2, rect.y - rect.h / 2 - 5), cv::FONT_HERSHEY_COMPLEX, 0.7, class_colors[rect.classes], 2);
+                    cv::Rect rst(rect.x - rect.w / 2, rect.y - rect.h / 2, rect.w, rect.h);
+                    cv::rectangle(org_img, rst, class_colors[rect.classes], 2, cv::LINE_8, 0);
+                }
+                int pos = vec_name[i].find_last_of(".");
+                std::string rst_name = vec_name[i].insert(pos, "_");
+                std::cout << rst_name << std::endl;
+                cv::imwrite(rst_name, org_img);
+            }
+            vec_Mat = std::vector<cv::Mat>(BATCH_SIZE);
+            delete[] out;
+        }
+    }
+    std::cout << "Average processing time is " << total_time / image_list.size() << "ms" << std::endl;
+}
+
+std::vector<float> YOLOv4::prepareImage(std::vector<cv::Mat> &vec_img) {
+    std::vector<float> result(BATCH_SIZE * IMAGE_WIDTH * IMAGE_HEIGHT * INPUT_CHANNEL);
+    float *data = result.data();
+    for (const cv::Mat &src_img : vec_img)
+    {
+        if (!src_img.data)
+            continue;
+        cv::Mat flt_img;
+        cv::resize(src_img, flt_img, cv::Size(IMAGE_WIDTH, IMAGE_HEIGHT));
+        flt_img.convertTo(flt_img, CV_32FC3, 1.0 / 255);
+
+        //HWC TO CHW
+        std::vector<cv::Mat> split_img(INPUT_CHANNEL);
+        cv::split(flt_img, split_img);
+
+        int channelLength = IMAGE_WIDTH * IMAGE_HEIGHT;
+        for (int i = 0; i < INPUT_CHANNEL; ++i)
+        {
+            memcpy(data, split_img[i].data, channelLength * sizeof(float));
+            data += channelLength;
+        }
+    }
+    return result;
+}
+
+std::vector<std::vector<YOLOv4::DetectRes>> YOLOv4::postProcess(const std::vector<cv::Mat> &vec_Mat, float *output,
+                                                                const int &outSize) {
+    std::vector<std::vector<DetectRes>> vec_result;
+    int index = 0;
+    for (const cv::Mat &src_img : vec_Mat)
+    {
+        std::vector<DetectRes> result;
+        float *out = output + index * outSize;
+        int position = 0;
+        for (int n = 0; n < (int)grids.size(); n++)
+        {
+            for (int c = 0; c < grids[n][0]; c++)
+            {
+                std::vector<int> anchor = anchors[n * grids[n][0] + c];
+                for (int h = 0; h < grids[n][1]; h++)
+                    for (int w = 0; w < grids[n][2]; w++)
+                    {
+                        float *row = out + position * (CATEGORY + 5);
+                        position++;
+                        DetectRes box;
+                        auto max_pos = std::max_element(row + 5, row + CATEGORY + 5);
+                        box.prob = sigmoid(row[4]) * sigmoid(row[max_pos - row]);
+                        if (box.prob < obj_threshold)
+                            continue;
+                        box.classes = max_pos - row - 5;
+                        box.x = (sigmoid(row[0]) + w) / grids[n][1] * src_img.cols;
+                        box.y = (sigmoid(row[1]) + h) / grids[n][2] * src_img.rows;
+                        box.w = exponential(row[2]) * anchor[0] / IMAGE_WIDTH * src_img.cols;
+                        box.h = exponential(row[3]) * anchor[1] / IMAGE_HEIGHT * src_img.rows;
+                        result.push_back(box);
+                    }
+            }
+        }
+        NmsDetect(result);
+        vec_result.push_back(result);
+        index++;
+    }
+    return vec_result;
+}
+
+void YOLOv4::NmsDetect(std::vector<DetectRes> &detections) {
+    sort(detections.begin(), detections.end(), [=](const DetectRes &left, const DetectRes &right) {
+        return left.prob > right.prob;
+    });
+
+    for (int i = 0; i < (int)detections.size(); i++)
+        for (int j = i + 1; j < (int)detections.size(); j++)
+        {
+            if (detections[i].classes == detections[j].classes)
+            {
+                float iou = IOUCalculate(detections[i], detections[j]);
+                if (iou > nms_threshold)
+                    detections[j].prob = 0;
+            }
+        }
+
+    detections.erase(std::remove_if(detections.begin(), detections.end(), [](const DetectRes &det)
+    { return det.prob == 0; }), detections.end());
+}
+
+float YOLOv4::IOUCalculate(const YOLOv4::DetectRes &det_a, const YOLOv4::DetectRes &det_b) {
+    cv::Point2f center_a(det_a.x + det_a.w / 2, det_a.y + det_a.h / 2);
+    cv::Point2f center_b(det_b.x + det_b.w / 2, det_b.y + det_b.h / 2);
+    cv::Point2f left_up(std::min(det_a.x, det_b.x),std::min(det_a.y, det_b.y));
+    cv::Point2f right_down(std::max(det_a.x + det_a.w, det_b.x + det_b.w),std::max(det_a.y + det_a.h, det_b.y + det_b.h));
+    float distance_d = (center_a - center_b).x * (center_a - center_b).x + (center_a - center_b).y * (center_a - center_b).y;
+    float distance_c = (left_up - right_down).x * (left_up - right_down).x + (left_up - right_down).y * (left_up - right_down).y;
+    float inter_l = det_a.x > det_b.x ? det_a.x : det_b.x;
+    float inter_t = det_a.y > det_b.y ? det_a.y : det_b.y;
+    float inter_r = det_a.x + det_a.w < det_b.x + det_b.w ? det_a.x + det_a.w : det_b.x + det_b.w;
+    float inter_b = det_a.y + det_a.h < det_b.y + det_b.h ? det_a.y + det_a.h : det_b.y + det_b.h;
+    if (inter_b < inter_t || inter_r < inter_l)
+        return 0;
+    float inter_area = (inter_b - inter_t) * (inter_r - inter_l);
+    float union_area = det_a.w * det_a.h + det_b.w * det_b.h - inter_area;
+    if (union_area == 0)
+        return 0;
+    else
+        return inter_area / union_area - distance_d / distance_c;
+}
+
+float YOLOv4::sigmoid(float in){
+    return 1.f / (1.f + exp(-in));
+}
+
+float YOLOv4::exponential(float in){
+    return exp(in);
+}