[Benchmark]Benchmark cpp for YOLOv5 (PaddlePaddle#1224)

* add GPL license * add GPL-3.0 license * add GPL-3.0 license * add GPL-3.0 license * support yolov8 * add pybind for yolov8 * add yolov8 readme * add cpp benchmark * add cpu and gpu mem * public part split * add runtime mode * fixed bugs * add cpu_thread_nums * deal with comments * deal with comments * deal with comments * rm useless code * add FASTDEPLOY_DECL * add FASTDEPLOY_DECL
younghuvee · Feb 7, 2023 · c487359 · c487359
1 parent e90e1ff
commit c487359
Show file tree

Hide file tree

Showing 27 changed files with 422 additions and 44 deletions.
diff --git a/benchmark/cpp/CMakeLists.txt b/benchmark/cpp/CMakeLists.txt
@@ -0,0 +1,17 @@
+# cmake_minimum_required() has to come before project() so that the policy
+# settings are in place when the project and its languages are configured.
+cmake_minimum_required(VERSION 3.10)
+project(infer_demo C CXX)
+
+# Directory into which the FastDeploy SDK was decompressed. Declared as a PATH
+# cache entry (not a boolean option); pass it with
+#   -DFASTDEPLOY_INSTALL_DIR=<path>
+set(FASTDEPLOY_INSTALL_DIR "" CACHE PATH "Path of downloaded fastdeploy sdk.")
+if(NOT FASTDEPLOY_INSTALL_DIR)
+  message(FATAL_ERROR
+          "FASTDEPLOY_INSTALL_DIR is not set. "
+          "Configure with -DFASTDEPLOY_INSTALL_DIR=<path to fastdeploy sdk>.")
+endif()
+
+include(${FASTDEPLOY_INSTALL_DIR}/utils/gflags.cmake)
+include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
+
+include_directories(${FASTDEPLOY_INCS})
+
+add_executable(benchmark_yolov5 ${PROJECT_SOURCE_DIR}/benchmark_yolov5.cc)
+
+# pthread is linked explicitly only on UNIX platforms other than Apple/Android.
+if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
+  target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread)
+else()
+  target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags)
+endif()
diff --git a/benchmark/cpp/benchmark_yolov5.cc b/benchmark/cpp/benchmark_yolov5.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "fastdeploy/benchmark/utils.h"
+#include "fastdeploy/vision.h"
+#include "flags.h"
+
+// Benchmarks YOLOv5 inference on a single image and writes a visualization of
+// the last detection result to ./vis_result.jpg.
+//
+// The runtime option is built from the command-line flags through
+// CreateRuntimeOption(). If --profile_mode is "runtime", profiling is enabled
+// on the option with `repeats` and `warmup`, Predict() is called once and
+// model.GetProfileTime() is reported. For any other value of --profile_mode the
+// function runs `warmup` untimed predictions followed by `repeats` timed ones
+// and reports the mean of the timed durations.
+//
+// Parameters:
+//   model_file         path handed to the YOLOv5 constructor.
+//   image_file         image loaded with cv::imread.
+//   warmup             number of warmup predictions.
+//   repeats            number of measured predictions; must be > 0 in the
+//                      end-to-end mode, otherwise no mean can be computed.
+//   dump_period        with --collect_memory_info, memory usage is dumped on
+//                      every `dump_period`-th timed iteration (end-to-end mode
+//                      only). 0 is treated as 1 to avoid a modulo by zero.
+//   cpu_mem_file_name  file passed to DumpCurrentCpuMemoryUsage.
+//   gpu_mem_file_name  file passed to DumpCurrentGpuMemoryUsage.
+//
+// Returns false when the flags are invalid, the model fails to initialize, the
+// image cannot be read or any prediction fails; true otherwise.
+bool RunModel(std::string model_file, std::string image_file, size_t warmup,
+              size_t repeats, size_t dump_period, std::string cpu_mem_file_name,
+              std::string gpu_mem_file_name) {
+  // Initialization
+  auto option = fastdeploy::RuntimeOption();
+  if (!CreateRuntimeOption(&option)) {
+    PrintUsage();
+    return false;
+  }
+  const bool runtime_mode = (FLAGS_profile_mode == "runtime");
+  if (!runtime_mode && repeats == 0) {
+    // Averaging over zero samples would print NaN; fail before loading the
+    // model instead.
+    std::cerr << "repeat must be greater than 0 for end2end profiling."
+              << std::endl;
+    return false;
+  }
+  if (runtime_mode) {
+    option.EnableProfiling(FLAGS_include_h2d_d2h, repeats, warmup);
+  }
+  auto model = fastdeploy::vision::detection::YOLOv5(model_file, "", option);
+  if (!model.Initialized()) {
+    std::cerr << "Failed to initialize." << std::endl;
+    return false;
+  }
+  auto im = cv::imread(image_file);
+  if (im.empty()) {
+    // cv::imread signals failure with an empty matrix rather than an error.
+    std::cerr << "Failed to read image: " << image_file << std::endl;
+    return false;
+  }
+
+  // Holds the result that is visualized once the measurement is done.
+  fastdeploy::vision::DetectionResult res;
+  if (runtime_mode) {
+    // For Runtime
+    if (!model.Predict(im, &res)) {
+      std::cerr << "Failed to predict." << std::endl;
+      return false;
+    }
+    // NOTE(review): the factor 1000 presumably converts seconds to
+    // milliseconds -- confirm against GetProfileTime().
+    double profile_time = model.GetProfileTime() * 1000;
+    std::cout << "Runtime(ms): " << profile_time << "ms." << std::endl;
+  } else {
+    // For End2End
+    // Step1: warm up for warmup times
+    std::cout << "Warmup " << warmup << " times..." << std::endl;
+    for (size_t i = 0; i < warmup; ++i) {
+      fastdeploy::vision::DetectionResult warmup_res;
+      if (!model.Predict(im, &warmup_res)) {
+        std::cerr << "Failed to predict." << std::endl;
+        return false;
+      }
+    }
+    // Step2: repeat for repeats times
+    const size_t period = (dump_period == 0) ? 1 : dump_period;
+    std::vector<float> end2end_statis;
+    std::cout << "Counting time..." << std::endl;
+    fastdeploy::TimeCounter tc;
+    for (size_t i = 0; i < repeats; ++i) {
+      if (FLAGS_collect_memory_info && i % period == 0) {
+        fastdeploy::benchmark::DumpCurrentCpuMemoryUsage(cpu_mem_file_name);
+        fastdeploy::benchmark::DumpCurrentGpuMemoryUsage(gpu_mem_file_name,
+                                                         FLAGS_device_id);
+      }
+      tc.Start();
+      if (!model.Predict(im, &res)) {
+        std::cerr << "Failed to predict." << std::endl;
+        return false;
+      }
+      tc.End();
+      end2end_statis.push_back(tc.Duration() * 1000);
+    }
+    // The vector holds exactly `repeats` (> 0) samples at this point.
+    const float end2end =
+        std::accumulate(end2end_statis.begin(), end2end_statis.end(), 0.f) /
+        end2end_statis.size();
+    std::cout << "End2End(ms): " << end2end << "ms." << std::endl;
+  }
+
+  // Both modes finish by saving the same visualization.
+  auto vis_im = fastdeploy::vision::VisDetection(im, res);
+  cv::imwrite("vis_result.jpg", vis_im);
+  std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
+
+  return true;
+}
+
+// Entry point: parses the gflags command line, runs the YOLOv5 benchmark and,
+// when --collect_memory_info is set, prints the CPU/GPU memory figures obtained
+// from the dump files written during the run.
+//
+// Returns 0 on success and 1 when a flag is out of range or the benchmark
+// fails.
+int main(int argc, char* argv[]) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  // The counts are handed to RunModel() as size_t; a negative flag value would
+  // silently wrap around to a huge iteration count, so reject it here.
+  if (FLAGS_repeat < 0 || FLAGS_warmup < 0) {
+    std::cerr << "--repeat and --warmup must not be negative." << std::endl;
+    return 1;
+  }
+  if (FLAGS_collect_memory_info && FLAGS_dump_period <= 0) {
+    std::cerr << "--dump_period must be greater than 0." << std::endl;
+    return 1;
+  }
+  const size_t repeats = static_cast<size_t>(FLAGS_repeat);
+  const size_t warmup = static_cast<size_t>(FLAGS_warmup);
+  // dump_period is only read when memory collection is on (validated above).
+  const size_t dump_period =
+      FLAGS_dump_period > 0 ? static_cast<size_t>(FLAGS_dump_period) : 1;
+
+  const std::string cpu_mem_file_name = "result_cpu.txt";
+  const std::string gpu_mem_file_name = "result_gpu.txt";
+  // Run model
+  if (!RunModel(FLAGS_model, FLAGS_image, warmup, repeats, dump_period,
+                cpu_mem_file_name, gpu_mem_file_name)) {
+    return 1;
+  }
+  if (FLAGS_collect_memory_info) {
+    float cpu_mem = fastdeploy::benchmark::GetCpuMemoryUsage(cpu_mem_file_name);
+    float gpu_mem = fastdeploy::benchmark::GetGpuMemoryUsage(gpu_mem_file_name);
+    std::cout << "cpu_rss_mb: " << cpu_mem << "MB." << std::endl;
+    std::cout << "gpu_rss_mb: " << gpu_mem << "MB." << std::endl;
+  }
+  return 0;
+}
diff --git a/benchmark/cpp/flags.h b/benchmark/cpp/flags.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "gflags/gflags.h"
+#include "fastdeploy/utils/perf.h"
+
+// Command-line flags of the benchmark tools (gflags).
+
+// -- Inputs ------------------------------------------------------------------
+DEFINE_string(model, "", "Directory of the inference model.");
+DEFINE_string(image, "", "Path of the image file.");
+
+// -- Device and backend selection --------------------------------------------
+DEFINE_string(
+    device, "cpu", "Type of inference device, support 'cpu' or 'gpu'.");
+DEFINE_int32(device_id, 0, "device(gpu) id.");
+DEFINE_string(
+    backend, "default",
+    "The inference runtime backend, support: ['default', 'ort', "
+    "'paddle', 'ov', 'trt', 'paddle_trt']");
+DEFINE_int32(cpu_thread_nums, 8, "Set numbers of cpu thread.");
+DEFINE_bool(use_fp16, false,
+            "Whether to use FP16 mode, only support 'trt' and 'paddle_trt' "
+            "backend");
+
+// -- Profiling ---------------------------------------------------------------
+DEFINE_string(profile_mode, "runtime", "runtime or end2end.");
+DEFINE_int32(warmup, 200, "Number of warmup for profiling.");
+DEFINE_int32(repeat, 1000, "Number of repeats for profiling.");
+DEFINE_bool(include_h2d_d2h, false,
+            "Whether run profiling with h2d and d2h.");
+
+// -- Memory statistics -------------------------------------------------------
+DEFINE_bool(collect_memory_info, false, "Whether to collect memory info");
+DEFINE_int32(dump_period, 100, "How often to collect memory info.");
+
+// Prints a short usage summary and the default values of the device, backend
+// and use_fp16 flags to stdout.
+//
+// Declared inline because it is defined in a header: without it, including
+// this header from more than one translation unit breaks the link with
+// multiple-definition errors.
+inline void PrintUsage() {
+  // One flush at the end is enough; the emitted text is unchanged.
+  std::cout << "Usage: infer_demo --model model_path --image img_path --device "
+               "[cpu|gpu] --backend "
+               "[default|ort|paddle|ov|trt|paddle_trt] "
+               "--use_fp16 false"
+            << '\n'
+            << "Default value of device: cpu" << '\n'
+            << "Default value of backend: default" << '\n'
+            << "Default value of use_fp16: false" << std::endl;
+}
+
+// Configures `option` from the --device, --device_id, --backend,
+// --cpu_thread_nums and --use_fp16 flags.
+//
+//   device "gpu": selects the GPU given by --device_id, then one of the
+//                 backends default/ort/paddle/trt/paddle_trt.
+//   device "cpu": sets the CPU thread number, then one of the backends
+//                 default/ort/ov/paddle.
+//
+// The "default" backend leaves the backend choice of `option` untouched.
+//
+// Returns true on success. Returns false, after printing a diagnostic to
+// stderr, when the device or the device/backend combination is unsupported.
+//
+// Declared inline because it is defined in a header (avoids multiple-definition
+// link errors when the header is included from several translation units).
+inline bool CreateRuntimeOption(fastdeploy::RuntimeOption* option) {
+  if (FLAGS_device == "gpu") {
+    // Honor --device_id; the memory statistics are collected for that id too,
+    // so inference has to run on the same card.
+    option->UseGpu(FLAGS_device_id);
+    if (FLAGS_backend == "ort") {
+      option->UseOrtBackend();
+    } else if (FLAGS_backend == "paddle") {
+      option->UsePaddleInferBackend();
+    } else if (FLAGS_backend == "trt" || FLAGS_backend == "paddle_trt") {
+      option->UseTrtBackend();
+      // NOTE(review): the tensor name "input" and the 1x3x112x112 shape are
+      // hard-coded -- confirm they match the model being benchmarked.
+      option->SetTrtInputShape("input", {1, 3, 112, 112});
+      if (FLAGS_backend == "paddle_trt") {
+        option->EnablePaddleToTrt();
+      }
+      if (FLAGS_use_fp16) {
+        option->EnableTrtFP16();
+      }
+    } else if (FLAGS_backend != "default") {
+      std::cerr << "While inference with GPU, only support "
+                   "default/ort/paddle/trt/paddle_trt now, "
+                << FLAGS_backend << " is not supported." << std::endl;
+      return false;
+    }
+    return true;
+  }
+
+  if (FLAGS_device == "cpu") {
+    option->SetCpuThreadNum(FLAGS_cpu_thread_nums);
+    if (FLAGS_backend == "ort") {
+      option->UseOrtBackend();
+    } else if (FLAGS_backend == "ov") {
+      option->UseOpenVINOBackend();
+    } else if (FLAGS_backend == "paddle") {
+      option->UsePaddleInferBackend();
+    } else if (FLAGS_backend != "default") {
+      std::cerr << "While inference with CPU, only support "
+                   "default/ort/ov/paddle now, "
+                << FLAGS_backend << " is not supported." << std::endl;
+      return false;
+    }
+    return true;
+  }
+
+  std::cerr << "Only support device CPU/GPU now, " << FLAGS_device
+            << " is not supported." << std::endl;
+  return false;
+}
diff --git a/benchmark/README.md → benchmark/python/README.md b/benchmark/README.md → benchmark/python/README.md
diff --git a/benchmark/benchmark_ernie_seq_cls.py → benchmark/python/benchmark_ernie_seq_cls.py b/benchmark/benchmark_ernie_seq_cls.py → benchmark/python/benchmark_ernie_seq_cls.py
diff --git a/benchmark/benchmark_ppcls.py → benchmark/python/benchmark_ppcls.py b/benchmark/benchmark_ppcls.py → benchmark/python/benchmark_ppcls.py
@@ -17,7 +17,8 @@
 import os
 import numpy as np
 import time
-from tqdm import tqdm 
+from tqdm import tqdm
+
 
 def parse_arguments():
     import argparse
@@ -38,19 +39,19 @@ def parse_arguments():
         "--profile_mode",
         type=str,
         default="runtime",
-        help="runtime or end2end.")      
+        help="runtime or end2end.")
     parser.add_argument(
         "--repeat",
         required=True,
         type=int,
         default=1000,
-        help="number of repeats for profiling.")    
+        help="number of repeats for profiling.")
     parser.add_argument(
         "--warmup",
         required=True,
         type=int,
         default=50,
-        help="number of warmup for profiling.")      
+        help="number of warmup for profiling.")
     parser.add_argument(
         "--device",
         default="cpu",
@@ -74,7 +75,7 @@ def parse_arguments():
         "--include_h2d_d2h",
         type=ast.literal_eval,
         default=False,
-        help="whether run profiling with h2d and d2h")       
+        help="whether run profiling with h2d and d2h")
     args = parser.parse_args()
     return args
 
@@ -85,7 +86,7 @@ def build_option(args):
     backend = args.backend
     enable_trt_fp16 = args.enable_trt_fp16
     if args.profile_mode == "runtime":
-        option.enable_profiling(args.include_h2d_d2h, args.repeat, args.warmup)    
+        option.enable_profiling(args.include_h2d_d2h, args.repeat, args.warmup)
     option.set_cpu_thread_num(args.cpu_num_thread)
     if device == "gpu":
         option.use_gpu()
@@ -274,25 +275,27 @@ def cpu_stat_func(self, q, pid, interval=0.0):
             enable_gpu = args.device == "gpu"
             monitor = Monitor(enable_gpu, gpu_id)
             monitor.start()
-        
+
         im_ori = cv2.imread(args.image)
         if args.profile_mode == "runtime":
             result = model.predict(im_ori)
             profile_time = model.get_profile_time()
             dump_result["runtime"] = profile_time * 1000
-            f.writelines("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
+            f.writelines("Runtime(ms): {} \n".format(
+                str(dump_result["runtime"])))
             print("Runtime(ms): {} \n".format(str(dump_result["runtime"])))
         else:
             # end2end
             for i in range(args.warmup):
                 result = model.predict(im_ori)
-            
+
             start = time.time()
             for i in tqdm(range(args.repeat)):
                 result = model.predict(im_ori)
             end = time.time()
             dump_result["end2end"] = ((end - start) / args.repeat) * 1000.0
-            f.writelines("End2End(ms): {} \n".format(str(dump_result["end2end"])))
+            f.writelines("End2End(ms): {} \n".format(
+                str(dump_result["end2end"])))
             print("End2End(ms): {} \n".format(str(dump_result["end2end"])))
 
         if enable_collect_memory_info:
@@ -304,7 +307,7 @@ def cpu_stat_func(self, q, pid, interval=0.0):
                 'memory.used'] if 'gpu' in mem_info else 0
             dump_result["gpu_util"] = mem_info['gpu'][
                 'utilization.gpu'] if 'gpu' in mem_info else 0
-        
+
         if enable_collect_memory_info:
             f.writelines("cpu_rss_mb: {} \n".format(
                 str(dump_result["cpu_rss_mb"])))