[IE CLDNN] Restore DetectionOutput performance and fix coding conve…

…ntions to be aligned with others (openvinotoolkit#6048)
Chararu · Jun 7, 2021 · a40cf0e · a40cf0e
1 parent bc42238
commit a40cf0e
Showing 1 changed file with 153 additions and 57 deletions.
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/detection_output_cpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/detection_output_cpu.cpp
@@ -33,34 +33,38 @@ namespace {
 
 /************************ Detection Output CPU ************************/
 struct detection_output_cpu : typed_primitive_impl<detection_output> {
+    enum NMSType {CAFFE, MXNET};
     const detection_output_node& outer;
+    NMSType nms_type;
 
-    explicit detection_output_cpu(const detection_output_node& outer) : outer(outer) {}
+    explicit detection_output_cpu(const detection_output_node& outer)
+        : outer(outer)
+        , nms_type(outer.get_primitive()->decrease_label_id ? MXNET : CAFFE) {}
 
-    static void IntersectBBox(const bounding_box& bbox1,
-                              const bounding_box& bbox2,
-                              bounding_box& intersectBbox) {
+    static inline void intersect_bbox(const bounding_box& bbox1,
+                                      const bounding_box& bbox2,
+                                      bounding_box& intersect_bbox) {
         if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin ||
             bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin) {
-            intersectBbox.xmin = 0;
-            intersectBbox.ymin = 0;
-            intersectBbox.xmax = 0;
-            intersectBbox.ymax = 0;
+            intersect_bbox.xmin = 0;
+            intersect_bbox.ymin = 0;
+            intersect_bbox.xmax = 0;
+            intersect_bbox.ymax = 0;
         } else {
-            intersectBbox.xmin = std::max<float>(bbox1.xmin, bbox2.xmin);
-            intersectBbox.ymin = std::max<float>(bbox1.ymin, bbox2.ymin);
-            intersectBbox.xmax = std::min<float>(bbox1.xmax, bbox2.xmax);
-            intersectBbox.ymax = std::min<float>(bbox1.ymax, bbox2.ymax);
+            intersect_bbox.xmin = std::max<float>(bbox1.xmin, bbox2.xmin);
+            intersect_bbox.ymin = std::max<float>(bbox1.ymin, bbox2.ymin);
+            intersect_bbox.xmax = std::min<float>(bbox1.xmax, bbox2.xmax);
+            intersect_bbox.ymax = std::min<float>(bbox1.ymax, bbox2.ymax);
         }
     }
 
-    static float JaccardOverlap(const bounding_box& bbox1, const bounding_box& bbox2) {
-        bounding_box intersectBbox;
-        IntersectBBox(bbox1, bbox2, intersectBbox);
+    static float jaccard_overlap(const bounding_box& bbox1, const bounding_box& bbox2) {
+        bounding_box inter_bbox;
+        intersect_bbox(bbox1, bbox2, inter_bbox);
 
         float intersectWidth, intersectHeight;
-        intersectWidth = intersectBbox.xmax - intersectBbox.xmin;
-        intersectHeight = intersectBbox.ymax - intersectBbox.ymin;
+        intersectWidth = inter_bbox.xmax - inter_bbox.xmin;
+        intersectHeight = inter_bbox.ymax - inter_bbox.ymin;
         if (intersectWidth > 0 && intersectHeight > 0) {
             float intersect_size = intersectWidth * intersectHeight;
             float bbox1_size = bbox1.area();
@@ -174,15 +178,13 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
         }
     }
 
-    void mxNetNms(const std::vector<std::vector<bounding_box>>& bboxes,
-                  const float nms_threshold,
-                  const int top_k,
-                  const bool share_location,
-                  std::map<int, std::vector<int>>& indices,
-                  std::vector<std::pair<float, std::pair<int, int>>>& scoreIndexPairs) {
-        std::sort(scoreIndexPairs.begin(),
-                    scoreIndexPairs.end(),
-                    SortScorePairDescend<std::pair<int, int>>);
+    void mxnet_nms(const std::vector<std::vector<bounding_box>>& bboxes,
+                   const float nms_threshold,
+                   const int top_k,
+                   const bool share_location,
+                   std::map<int, std::vector<int>>& indices,
+                   std::vector<std::pair<float, std::pair<int, int>>>& scoreIndexPairs) {
+        std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(), comp_score_descend<std::pair<int, int>>);
 
         if (top_k != -1)
             if (scoreIndexPairs.size() > static_cast<size_t>(top_k))
@@ -195,7 +197,7 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
             for (size_t i = 0; i < currInd.size(); i++) {
                 const int keptIdx = currInd[i];
                 const auto& currBbox = share_location ? bboxes[0] : bboxes[cls];
-                float overlap = JaccardOverlap(currBbox[prior], currBbox[keptIdx]);
+                float overlap = jaccard_overlap(currBbox[prior], currBbox[keptIdx]);
                 if (overlap > nms_threshold) {
                     keep = false;
                     break;
@@ -208,23 +210,27 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
         }
     }
 
-    static void caffeNMS(const std::vector<bounding_box>& bboxes,
-                         std::vector<std::pair<float, int>>& scores,
-                         const float nms_threshold,
-                         const int top_k,
-                         std::vector<int>& indices) {
-        std::stable_sort(scores.begin(), scores.end(), SortScorePairDescend<int>);
-
+    static void caffe_nms(const std::vector<bounding_box>& bboxes,
+                          std::vector<std::pair<float, int>>& scores,
+                          const float nms_threshold,
+                          const int top_k,
+                          std::vector<int>& indices) {
         if (top_k > -1 && static_cast<size_t>(top_k) < static_cast<size_t>(scores.size())) {
+            std::partial_sort(scores.begin(),
+                              scores.begin() + top_k,
+                              scores.end(),
+                              comp_score_descend<int>);
             scores.resize(top_k);
+        } else {
+            std::stable_sort(scores.begin(), scores.end(), comp_score_descend<int>);
         }
         // NMS
         for (const auto& s : scores) {
             const int idx = s.second;
             bool keep = true;
             for (int k = 0; k < static_cast<int>(indices.size()); ++k) {
                 const int kept_idx = indices[k];
-                float overlap = JaccardOverlap(bboxes[idx], bboxes[kept_idx]);
+                float overlap = jaccard_overlap(bboxes[idx], bboxes[kept_idx]);
                 if (overlap > nms_threshold) {
                     keep = false;
                     break;
@@ -237,8 +243,8 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
     }
 
     template <typename T>
-    static bool SortScorePairDescend(const std::pair<float, T>& pair1,
-                                     const std::pair<float, T>& pair2) {
+    static bool comp_score_descend(const std::pair<float, T>& pair1,
+                                   const std::pair<float, T>& pair2) {
         return pair1.first > pair2.first;
     }
 
@@ -252,12 +258,11 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
         auto out_ptr = lock.begin();
 
         const auto& args = instance.argument;
-        std::vector<std::vector<std::vector<std::pair<float, int>>>>
-            final_detections;  // Per image -> For each label: Pair (score, prior index)
+        // Per image -> For each label: Pair (score, prior index)
+        std::vector<std::vector<std::vector<std::pair<float, int>>>> final_detections;
         for (int image = 0; image < num_of_images; ++image) {
             const std::vector<std::vector<bounding_box>>& bboxes_per_image = all_bboxes[image];
             std::vector<std::vector<std::pair<float, int>>>& conf_per_image = confidences[image];
-            std::vector<std::pair<float, std::pair<int, int>>>& score_image = scoreIndexPairs[image];
             std::map<int, std::vector<int>> indices;
             int num_det = 0;
 #ifdef FIX_OPENMP_RELEASE_ISSUE
@@ -269,19 +274,20 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
 #pragma omp parallel for num_threads(num_threads_to_use) reduction(+ : num_det)
 #endif
 #endif
-            if (!args.decrease_label_id) {
+            if (nms_type == CAFFE) {
                 for (int cls = 0; cls < static_cast<int>(args.num_classes); ++cls) {
                     if (static_cast<int>(cls) == args.background_label_id) {
                         conf_per_image[cls].clear();
                         continue;  // Skip background class.
                     }
                     std::vector<std::pair<float, int>>& scores = conf_per_image[cls];
                     const int label = args.share_location ? 0 : cls;
-                    caffeNMS(bboxes_per_image[label], scores, args.nms_threshold, args.top_k, indices[cls]);
+                    caffe_nms(bboxes_per_image[label], scores, args.nms_threshold, args.top_k, indices[cls]);
                     num_det += static_cast<int>(indices[cls].size());
                 }
             } else {
-                mxNetNms(bboxes_per_image, args.nms_threshold, args.top_k, args.share_location, indices, score_image);
+                std::vector<std::pair<float, std::pair<int, int>>>& score_image = scoreIndexPairs[image];
+                mxnet_nms(bboxes_per_image, args.nms_threshold, args.top_k, args.share_location, indices, score_image);
                 for (auto it = indices.begin(); it != indices.end(); it++) {
                     num_det += static_cast<int>(it->second.size());
                 }
@@ -303,9 +309,7 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
                     }
                 }
 
-                std::sort(score_index_pairs.begin(),
-                            score_index_pairs.end(),
-                            SortScorePairDescend<std::pair<int, int>>);
+                std::sort(score_index_pairs.begin(), score_index_pairs.end(), comp_score_descend<std::pair<int, int>>);
                 score_index_pairs.resize(args.keep_top_k);
 
                 std::vector<std::vector<std::pair<float, int>>> new_indices(args.num_classes);
@@ -499,10 +503,100 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
     }
 
     template <typename dtype>
-    void extract_confidences_per_image(const detection_output_inst& instance,
-                                       std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences,
-                                       const int num_of_priors,
-                                       std::vector<std::vector<std::pair<float, std::pair<int, int>>>>& scoreIndexPairs) {
+    void extract_confidences_per_image_caffe(const detection_output_inst& instance,
+                                             std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences,
+                                             const int num_of_priors) {
+        const int num_classes = instance.argument.num_classes;
+
+        const int num_of_images = static_cast<int>(confidences.size());
+        auto& input_confidence = instance.confidence_memory();
+        const float confidence_threshold = instance.argument.confidence_threshold;
+
+        mem_lock<dtype> lock{(memory_impl::ptr) &input_confidence};
+        auto confidence_data = lock.begin();
+
+        assert(num_of_priors * num_classes == input_confidence.get_layout().size.feature[0]);
+
+        const auto& input_buffer_size = input_confidence.get_layout().get_buffer_size();
+        const int input_buffer_size_x = input_buffer_size.spatial[0];
+        const int input_buffer_size_y = input_buffer_size.spatial[1];
+        const int input_buffer_size_f = input_buffer_size.feature[0];
+        const auto& input_padding = input_confidence.get_layout().data_padding;
+        const int input_padding_lower_x = input_padding.lower_size().spatial[0];
+        const int input_padding_lower_y = input_padding.lower_size().spatial[1];
+        const int stride = input_buffer_size_y * input_buffer_size_x;
+
+        for (int image = 0; image < num_of_images; ++image) {
+            std::vector<std::vector<std::pair<float, int>>>& label_to_scores = confidences[image];
+            std::vector<std::pair<float, std::pair<int, int>>> score_index_per_prior;
+            label_to_scores.resize(num_classes);
+            int idx = get_linear_feature_index(image,
+                                               0,
+                                               input_buffer_size_f,
+                                               input_buffer_size_y,
+                                               input_buffer_size_x,
+                                               input_padding_lower_y,
+                                               input_padding_lower_x);
+            if (stride == 1 && std::is_same<dtype, float>::value) {
+                float const* confidence_ptr_float = (float const*)(&(*confidence_data));
+                confidence_ptr_float += idx;
+                __m128 threshold = _mm_load_ps1(&confidence_threshold);
+                for (int prior = 0; prior < num_of_priors; ++prior) {
+                    int cls = 0;
+                    for (; cls + 3 < num_classes; cls += 4) {
+                        __m128 scores = _mm_loadu_ps(confidence_ptr_float);
+                        confidence_ptr_float += 4;
+                        __m128i mask128 = _mm_castps_si128(_mm_cmpgt_ps(scores, threshold));
+                        if (_mm_testz_si128(mask128, mask128)) {
+                            continue;
+                        }
+                        int mask = _mm_movemask_ps(_mm_castsi128_ps(mask128));
+                        if (mask & 1) {
+                            label_to_scores[cls + 0].emplace_back(_mm_cvtss_f32(scores), prior);
+                        }
+                        if (mask & 2) {
+                            int score = _mm_extract_ps(scores, 1);
+                            float s = reinterpret_cast<float&>(score);
+                            label_to_scores[cls + 1].emplace_back(s, prior);
+                        }
+                        if (mask & 4) {
+                            int score = _mm_extract_ps(scores, 2);
+                            float s = reinterpret_cast<float&>(score);
+                            label_to_scores[cls + 2].emplace_back(s, prior);
+                        }
+                        if (mask & 8) {
+                            int score = _mm_extract_ps(scores, 3);
+                            float s = reinterpret_cast<float&>(score);
+                            label_to_scores[cls + 3].emplace_back(s, prior);
+                        }
+                    }
+                    for (; cls < num_classes; ++cls) {
+                        float score = *confidence_ptr_float;
+                        if (score > confidence_threshold) {
+                            label_to_scores[cls].emplace_back(score, prior);
+                        }
+                        ++confidence_ptr_float;
+                    }
+                }
+            } else {
+                for (int prior = 0; prior < num_of_priors; ++prior) {
+                    for (int cls = 0; cls < num_classes; ++cls) {
+                        float score = static_cast<float>(confidence_data[idx]);
+                        if (score > confidence_threshold) {
+                            label_to_scores[cls].emplace_back(score, prior);
+                        }
+                        idx += stride;
+                    }
+                }
+            }
+        }
+    }
+
+    template <typename dtype>
+    void extract_confidences_per_image_mxnet(const detection_output_inst& instance,
+                                             std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences,
+                                             const int num_of_priors,
+                                             std::vector<std::vector<std::pair<float, std::pair<int, int>>>>& scoreIndexPairs) {
         const int num_classes = instance.argument.num_classes;
 
         const int num_of_images = static_cast<int>(confidences.size());
@@ -675,7 +769,11 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
             }
         }
         // Extract confidences per image.
-        extract_confidences_per_image<dtype>(instance, confidences, num_of_priors, scoreIndexPairs);
+        if (nms_type == CAFFE) {
+            extract_confidences_per_image_caffe<dtype>(instance, confidences, num_of_priors);
+        } else {
+            extract_confidences_per_image_mxnet<dtype>(instance, confidences, num_of_priors, scoreIndexPairs);
+        }
     }
 
     event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, detection_output_inst& instance) override {
@@ -686,18 +784,16 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
         auto ev = instance.get_network().get_engine().create_user_event(instance.get_network().get_id(), false);
 
         const int num_of_images = instance.location_memory().get_layout().size.batch[0];  // batch size
-        std::vector<std::vector<std::vector<bounding_box>>> bboxes(
-            num_of_images);  // Per image : label -> decoded bounding boxes.
-        std::vector<std::vector<std::vector<std::pair<float, int>>>> confidences(
-            num_of_images);  // Per image : class -> confidences per bounding box.
+        // Per image : label -> decoded bounding boxes.
+        std::vector<std::vector<std::vector<bounding_box>>> bboxes(num_of_images);
+        // Per image : class -> confidences per bounding box.
+        std::vector<std::vector<std::vector<std::pair<float, int>>>> confidences(num_of_images);
         std::vector<std::vector<std::pair<float, std::pair<int, int>>>> scoreIndexPairs;
         if (instance.location_memory().get_layout().data_type == data_types::f32) {
             prepare_data<data_type_to_type<data_types::f32>::type>(instance, bboxes, confidences, scoreIndexPairs);
-
             generate_detections<data_type_to_type<data_types::f32>::type>(instance, num_of_images, bboxes, confidences, scoreIndexPairs);
         } else {
             prepare_data<data_type_to_type<data_types::f16>::type>(instance, bboxes, confidences, scoreIndexPairs);
-
             generate_detections<data_type_to_type<data_types::f16>::type>(instance, num_of_images, bboxes, confidences, scoreIndexPairs);
         }