Skip to content

Commit

Permalink
[IE CLDNN] Restore DetectionOutput performance and fixed coding conve…
Browse files Browse the repository at this point in the history
…ntions to be aligned with others (openvinotoolkit#6048)
  • Loading branch information
yeonbok authored Jun 7, 2021
1 parent bc42238 commit a40cf0e
Showing 1 changed file with 153 additions and 57 deletions.
210 changes: 153 additions & 57 deletions inference-engine/thirdparty/clDNN/src/gpu/detection_output_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,34 +33,38 @@ namespace {

/************************ Detection Output CPU ************************/
struct detection_output_cpu : typed_primitive_impl<detection_output> {
enum NMSType {CAFFE, MXNET};
const detection_output_node& outer;
NMSType nms_type;

explicit detection_output_cpu(const detection_output_node& outer) : outer(outer) {}
explicit detection_output_cpu(const detection_output_node& outer)
: outer(outer)
, nms_type(outer.get_primitive()->decrease_label_id ? MXNET : CAFFE) {}

static void IntersectBBox(const bounding_box& bbox1,
const bounding_box& bbox2,
bounding_box& intersectBbox) {
static inline void intersect_bbox(const bounding_box& bbox1,
const bounding_box& bbox2,
bounding_box& intersect_bbox) {
if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin ||
bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin) {
intersectBbox.xmin = 0;
intersectBbox.ymin = 0;
intersectBbox.xmax = 0;
intersectBbox.ymax = 0;
intersect_bbox.xmin = 0;
intersect_bbox.ymin = 0;
intersect_bbox.xmax = 0;
intersect_bbox.ymax = 0;
} else {
intersectBbox.xmin = std::max<float>(bbox1.xmin, bbox2.xmin);
intersectBbox.ymin = std::max<float>(bbox1.ymin, bbox2.ymin);
intersectBbox.xmax = std::min<float>(bbox1.xmax, bbox2.xmax);
intersectBbox.ymax = std::min<float>(bbox1.ymax, bbox2.ymax);
intersect_bbox.xmin = std::max<float>(bbox1.xmin, bbox2.xmin);
intersect_bbox.ymin = std::max<float>(bbox1.ymin, bbox2.ymin);
intersect_bbox.xmax = std::min<float>(bbox1.xmax, bbox2.xmax);
intersect_bbox.ymax = std::min<float>(bbox1.ymax, bbox2.ymax);
}
}

static float JaccardOverlap(const bounding_box& bbox1, const bounding_box& bbox2) {
bounding_box intersectBbox;
IntersectBBox(bbox1, bbox2, intersectBbox);
static float jaccard_overlap(const bounding_box& bbox1, const bounding_box& bbox2) {
bounding_box inter_bbox;
intersect_bbox(bbox1, bbox2, inter_bbox);

float intersectWidth, intersectHeight;
intersectWidth = intersectBbox.xmax - intersectBbox.xmin;
intersectHeight = intersectBbox.ymax - intersectBbox.ymin;
intersectWidth = inter_bbox.xmax - inter_bbox.xmin;
intersectHeight = inter_bbox.ymax - inter_bbox.ymin;
if (intersectWidth > 0 && intersectHeight > 0) {
float intersect_size = intersectWidth * intersectHeight;
float bbox1_size = bbox1.area();
Expand Down Expand Up @@ -174,15 +178,13 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
}
}

void mxNetNms(const std::vector<std::vector<bounding_box>>& bboxes,
const float nms_threshold,
const int top_k,
const bool share_location,
std::map<int, std::vector<int>>& indices,
std::vector<std::pair<float, std::pair<int, int>>>& scoreIndexPairs) {
std::sort(scoreIndexPairs.begin(),
scoreIndexPairs.end(),
SortScorePairDescend<std::pair<int, int>>);
void mxnet_nms(const std::vector<std::vector<bounding_box>>& bboxes,
const float nms_threshold,
const int top_k,
const bool share_location,
std::map<int, std::vector<int>>& indices,
std::vector<std::pair<float, std::pair<int, int>>>& scoreIndexPairs) {
std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(), comp_score_descend<std::pair<int, int>>);

if (top_k != -1)
if (scoreIndexPairs.size() > static_cast<size_t>(top_k))
Expand All @@ -195,7 +197,7 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
for (size_t i = 0; i < currInd.size(); i++) {
const int keptIdx = currInd[i];
const auto& currBbox = share_location ? bboxes[0] : bboxes[cls];
float overlap = JaccardOverlap(currBbox[prior], currBbox[keptIdx]);
float overlap = jaccard_overlap(currBbox[prior], currBbox[keptIdx]);
if (overlap > nms_threshold) {
keep = false;
break;
Expand All @@ -208,23 +210,27 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
}
}

static void caffeNMS(const std::vector<bounding_box>& bboxes,
std::vector<std::pair<float, int>>& scores,
const float nms_threshold,
const int top_k,
std::vector<int>& indices) {
std::stable_sort(scores.begin(), scores.end(), SortScorePairDescend<int>);

static void caffe_nms(const std::vector<bounding_box>& bboxes,
std::vector<std::pair<float, int>>& scores,
const float nms_threshold,
const int top_k,
std::vector<int>& indices) {
if (top_k > -1 && static_cast<size_t>(top_k) < static_cast<size_t>(scores.size())) {
std::partial_sort(scores.begin(),
scores.begin() + top_k,
scores.end(),
comp_score_descend<int>);
scores.resize(top_k);
} else {
std::stable_sort(scores.begin(), scores.end(), comp_score_descend<int>);
}
// NMS
for (const auto& s : scores) {
const int idx = s.second;
bool keep = true;
for (int k = 0; k < static_cast<int>(indices.size()); ++k) {
const int kept_idx = indices[k];
float overlap = JaccardOverlap(bboxes[idx], bboxes[kept_idx]);
float overlap = jaccard_overlap(bboxes[idx], bboxes[kept_idx]);
if (overlap > nms_threshold) {
keep = false;
break;
Expand All @@ -237,8 +243,8 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
}

template <typename T>
static bool SortScorePairDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
static bool comp_score_descend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
return pair1.first > pair2.first;
}

Expand All @@ -252,12 +258,11 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
auto out_ptr = lock.begin();

const auto& args = instance.argument;
std::vector<std::vector<std::vector<std::pair<float, int>>>>
final_detections; // Per image -> For each label: Pair (score, prior index)
// Per image -> For each label: Pair (score, prior index)
std::vector<std::vector<std::vector<std::pair<float, int>>>> final_detections;
for (int image = 0; image < num_of_images; ++image) {
const std::vector<std::vector<bounding_box>>& bboxes_per_image = all_bboxes[image];
std::vector<std::vector<std::pair<float, int>>>& conf_per_image = confidences[image];
std::vector<std::pair<float, std::pair<int, int>>>& score_image = scoreIndexPairs[image];
std::map<int, std::vector<int>> indices;
int num_det = 0;
#ifdef FIX_OPENMP_RELEASE_ISSUE
Expand All @@ -269,19 +274,20 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
#pragma omp parallel for num_threads(num_threads_to_use) reduction(+ : num_det)
#endif
#endif
if (!args.decrease_label_id) {
if (nms_type == CAFFE) {
for (int cls = 0; cls < static_cast<int>(args.num_classes); ++cls) {
if (static_cast<int>(cls) == args.background_label_id) {
conf_per_image[cls].clear();
continue; // Skip background class.
}
std::vector<std::pair<float, int>>& scores = conf_per_image[cls];
const int label = args.share_location ? 0 : cls;
caffeNMS(bboxes_per_image[label], scores, args.nms_threshold, args.top_k, indices[cls]);
caffe_nms(bboxes_per_image[label], scores, args.nms_threshold, args.top_k, indices[cls]);
num_det += static_cast<int>(indices[cls].size());
}
} else {
mxNetNms(bboxes_per_image, args.nms_threshold, args.top_k, args.share_location, indices, score_image);
std::vector<std::pair<float, std::pair<int, int>>>& score_image = scoreIndexPairs[image];
mxnet_nms(bboxes_per_image, args.nms_threshold, args.top_k, args.share_location, indices, score_image);
for (auto it = indices.begin(); it != indices.end(); it++) {
num_det += static_cast<int>(it->second.size());
}
Expand All @@ -303,9 +309,7 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
}
}

std::sort(score_index_pairs.begin(),
score_index_pairs.end(),
SortScorePairDescend<std::pair<int, int>>);
std::sort(score_index_pairs.begin(), score_index_pairs.end(), comp_score_descend<std::pair<int, int>>);
score_index_pairs.resize(args.keep_top_k);

std::vector<std::vector<std::pair<float, int>>> new_indices(args.num_classes);
Expand Down Expand Up @@ -499,10 +503,100 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
}

template <typename dtype>
void extract_confidences_per_image(const detection_output_inst& instance,
std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences,
const int num_of_priors,
std::vector<std::vector<std::pair<float, std::pair<int, int>>>>& scoreIndexPairs) {
// Gathers, per image and per class, the confidences that pass the threshold.
//
// For every image in `confidences` the per-class table is resized to num_classes, and a
// (score, prior index) pair is appended to class `cls` whenever the score read from the
// confidence memory is strictly greater than instance.argument.confidence_threshold.
//
// instance      - primitive instance supplying the confidence memory and the arguments.
// confidences   - in/out; its size defines how many images are processed.
// num_of_priors - number of priors scanned for each image.
void extract_confidences_per_image_caffe(const detection_output_inst& instance,
                                         std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences,
                                         const int num_of_priors) {
    const int num_classes = instance.argument.num_classes;

    const int num_of_images = static_cast<int>(confidences.size());
    auto& input_confidence = instance.confidence_memory();
    const float confidence_threshold = instance.argument.confidence_threshold;

    mem_lock<dtype> lock{(memory_impl::ptr) &input_confidence};
    auto confidence_data = lock.begin();

    assert(num_of_priors * num_classes == input_confidence.get_layout().size.feature[0]);

    const auto& input_buffer_size = input_confidence.get_layout().get_buffer_size();
    const int input_buffer_size_x = input_buffer_size.spatial[0];
    const int input_buffer_size_y = input_buffer_size.spatial[1];
    const int input_buffer_size_f = input_buffer_size.feature[0];
    const auto& input_padding = input_confidence.get_layout().data_padding;
    const int input_padding_lower_x = input_padding.lower_size().spatial[0];
    const int input_padding_lower_y = input_padding.lower_size().spatial[1];
    // Distance (in elements) between two consecutive feature entries.
    const int stride = input_buffer_size_y * input_buffer_size_x;

    for (int image = 0; image < num_of_images; ++image) {
        std::vector<std::vector<std::pair<float, int>>>& label_to_scores = confidences[image];
        label_to_scores.resize(num_classes);
        // Offset of this image's first confidence value
        // (presumably accounts for the lower padding - see get_linear_feature_index).
        int idx = get_linear_feature_index(image,
                                           0,
                                           input_buffer_size_f,
                                           input_buffer_size_y,
                                           input_buffer_size_x,
                                           input_padding_lower_y,
                                           input_padding_lower_x);
        if (stride == 1 && std::is_same<dtype, float>::value) {
            // Dense float data: compare four classes against the threshold at once.
            float const* confidence_ptr_float = (float const*)(&(*confidence_data));
            confidence_ptr_float += idx;
            const __m128 threshold = _mm_load_ps1(&confidence_threshold);
            for (int prior = 0; prior < num_of_priors; ++prior) {
                int cls = 0;
                for (; cls + 3 < num_classes; cls += 4) {
                    const __m128 scores = _mm_loadu_ps(confidence_ptr_float);
                    confidence_ptr_float += 4;
                    // Bit i of the mask is set when lane i is above the threshold.
                    const int mask = _mm_movemask_ps(_mm_cmpgt_ps(scores, threshold));
                    if (mask == 0) {
                        continue;
                    }
                    // Spill the lanes to memory rather than type-punning an int as a float,
                    // which would violate strict aliasing.
                    float lanes[4];
                    _mm_storeu_ps(lanes, scores);
                    for (int lane = 0; lane < 4; ++lane) {
                        if (mask & (1 << lane)) {
                            label_to_scores[cls + lane].emplace_back(lanes[lane], prior);
                        }
                    }
                }
                // Scalar tail for the remaining (num_classes % 4) classes.
                for (; cls < num_classes; ++cls) {
                    const float score = *confidence_ptr_float;
                    if (score > confidence_threshold) {
                        label_to_scores[cls].emplace_back(score, prior);
                    }
                    ++confidence_ptr_float;
                }
            }
        } else {
            // Generic path: strided access with per-element conversion to float.
            for (int prior = 0; prior < num_of_priors; ++prior) {
                for (int cls = 0; cls < num_classes; ++cls) {
                    const float score = static_cast<float>(confidence_data[idx]);
                    if (score > confidence_threshold) {
                        label_to_scores[cls].emplace_back(score, prior);
                    }
                    idx += stride;
                }
            }
        }
    }
}

template <typename dtype>
void extract_confidences_per_image_mxnet(const detection_output_inst& instance,
std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences,
const int num_of_priors,
std::vector<std::vector<std::pair<float, std::pair<int, int>>>>& scoreIndexPairs) {
const int num_classes = instance.argument.num_classes;

const int num_of_images = static_cast<int>(confidences.size());
Expand Down Expand Up @@ -675,7 +769,11 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
}
}
// Extract confidences per image.
extract_confidences_per_image<dtype>(instance, confidences, num_of_priors, scoreIndexPairs);
if (nms_type == CAFFE) {
extract_confidences_per_image_caffe<dtype>(instance, confidences, num_of_priors);
} else {
extract_confidences_per_image_mxnet<dtype>(instance, confidences, num_of_priors, scoreIndexPairs);
}
}

event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, detection_output_inst& instance) override {
Expand All @@ -686,18 +784,16 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
auto ev = instance.get_network().get_engine().create_user_event(instance.get_network().get_id(), false);

const int num_of_images = instance.location_memory().get_layout().size.batch[0]; // batch size
std::vector<std::vector<std::vector<bounding_box>>> bboxes(
num_of_images); // Per image : label -> decoded bounding boxes.
std::vector<std::vector<std::vector<std::pair<float, int>>>> confidences(
num_of_images); // Per image : class -> confidences per bounding box.
// Per image : label -> decoded bounding boxes.
std::vector<std::vector<std::vector<bounding_box>>> bboxes(num_of_images);
// Per image : class -> confidences per bounding box.
std::vector<std::vector<std::vector<std::pair<float, int>>>> confidences(num_of_images);
std::vector<std::vector<std::pair<float, std::pair<int, int>>>> scoreIndexPairs;
if (instance.location_memory().get_layout().data_type == data_types::f32) {
prepare_data<data_type_to_type<data_types::f32>::type>(instance, bboxes, confidences, scoreIndexPairs);

generate_detections<data_type_to_type<data_types::f32>::type>(instance, num_of_images, bboxes, confidences, scoreIndexPairs);
} else {
prepare_data<data_type_to_type<data_types::f16>::type>(instance, bboxes, confidences, scoreIndexPairs);

generate_detections<data_type_to_type<data_types::f16>::type>(instance, num_of_images, bboxes, confidences, scoreIndexPairs);
}

Expand Down

0 comments on commit a40cf0e

Please sign in to comment.