Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/3.4' into merge-3.4
Browse files Browse the repository at this point in the history
  • Loading branch information
alalek committed Jul 16, 2020
2 parents d17ab27 + b2ebd37 commit 9b7b22e
Show file tree
Hide file tree
Showing 15 changed files with 717 additions and 127 deletions.
4 changes: 2 additions & 2 deletions cmake/OpenCVDetectInferenceEngine.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ endif()

if(INF_ENGINE_TARGET)
if(NOT INF_ENGINE_RELEASE)
message(WARNING "InferenceEngine version has not been set, 2020.3 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
message(WARNING "InferenceEngine version has not been set, 2020.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
endif()
set(INF_ENGINE_RELEASE "2020030000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
set(INF_ENGINE_RELEASE "2020040000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
)
Expand Down
95 changes: 95 additions & 0 deletions modules/dnn/perf/perf_layer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "perf_precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>

namespace opencv_test {

// Performance fixture for the DNN "Slice" layer, parameterized over
// (backend, target) combinations supplied by the test instantiation.
struct Layer_Slice : public TestBaseWithParam<tuple<Backend, Target> >
{
    // Builds a single-layer "Slice" network over a DIMS-dimensional input
    // and times Net::forward(). `begin`/`end` give the per-axis slice
    // bounds (end is exclusive), each array holding DIMS entries.
    template<int DIMS>
    void test_slice(const int* inputShape, const int* begin, const int* end)
    {
        const int backendId = get<0>(GetParam());
        const int targetId = get<1>(GetParam());

        // Fill the input with a deterministic non-constant pattern so the
        // output norm check below cannot trivially pass on zeros.
        Mat input(DIMS, inputShape, CV_32FC1, Scalar::all(0));
        float* inputData = input.ptr<float>();
        const int totalElems = (int)input.total();
        for (int idx = 0; idx < totalElems; ++idx)
            inputData[idx] = (float)(idx & 4095);

        // Reference ranges for the (disabled) correctness comparison below.
        std::vector<Range> range(DIMS);
        for (int axis = 0; axis < DIMS; ++axis)
            range[axis] = Range(begin[axis], end[axis]);

        LayerParams lp;
        lp.type = "Slice";
        lp.name = "testLayer";
        lp.set("begin", DictValue::arrayInt<int*>((int*)&begin[0], DIMS));
        lp.set("end", DictValue::arrayInt<int*>((int*)&end[0], DIMS));

        Net net;
        net.addLayerToPrev(lp.name, lp.type, lp);

        // warmup: one untimed forward pass plus a basic sanity check
        {
            net.setInput(input);
            net.setPreferableBackend(backendId);
            net.setPreferableTarget(targetId);
            Mat out = net.forward();

            EXPECT_GT(cv::norm(out, NORM_INF), 0);
#if 0
            //normAssert(out, input(range));
            cout << input(range).clone().reshape(1, 1) << endl;
            cout << out.reshape(1, 1) << endl;
#endif
        }

        TEST_CYCLE()
        {
            Mat res = net.forward();
        }

        SANITY_CHECK_NOTHING();
    }
};



// Slice pattern from the first YOLOv4-tiny route layer: keep the upper
// half (channels 32..63) of a 64-channel feature map.
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_1)
{
    const int shape[4] = {1, 64, 104, 104};
    const int from[] = {0, 32, 0, 0};
    const int to[] = {1, 64, 104, 104};
    test_slice<4>(shape, from, to);
}

// Slice pattern from the second YOLOv4-tiny route layer: upper half
// (channels 64..127) of a 128-channel feature map.
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_2)
{
    const int shape[4] = {1, 128, 52, 52};
    const int from[] = {0, 64, 0, 0};
    const int to[] = {1, 128, 52, 52};
    test_slice<4>(shape, from, to);
}

// Slice pattern from the third YOLOv4-tiny route layer: upper half
// (channels 128..255) of a 256-channel feature map.
PERF_TEST_P_(Layer_Slice, YOLOv4_tiny_3)
{
    const int shape[4] = {1, 256, 26, 26};
    const int from[] = {0, 128, 0, 0};
    const int to[] = {1, 256, 26, 26};
    test_slice<4>(shape, from, to);
}


// Spatial crop from the fast-neural-style (ECCV'16) model: trims a border
// on the two innermost (height/width) axes while keeping all channels.
PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
{
    const int shape[4] = {1, 128, 80, 100};
    const int from[] = {0, 0, 2, 2};
    const int to[] = {1, 128, 76, 96};
    test_slice<4>(shape, from, to);
}

INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));

} // namespace
15 changes: 13 additions & 2 deletions modules/dnn/perf/perf_net.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,13 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) // nGraph compilation failure
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
#endif

Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
Expand All @@ -209,6 +216,12 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4)
throw SkipTestException("");
if (target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) // nGraph compilation failure
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
throw SkipTestException("Test is disabled in OpenVINO 2020.4");
#endif
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
Expand All @@ -220,8 +233,6 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv4_tiny)
{
if (backend == DNN_BACKEND_HALIDE)
throw SkipTestException("");
if (target == DNN_TARGET_MYRIAD)
throw SkipTestException("");
Mat sample = imread(findDataFile("dnn/dog416.png"));
cvtColor(sample, sample, COLOR_BGR2RGB);
Mat inp;
Expand Down
3 changes: 0 additions & 3 deletions modules/dnn/src/graph_simplifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,6 @@ int Subgraph::getInputNodeId(const Ptr<ImportGraphWrapper>& net,
{
CV_Assert(inpId < node->getNumInputs());
std::string name = node->getInputName(inpId);
// If operation produces several tensors, they are specified by index
// after ':' character. In example, "input:0".
name = name.substr(0, name.rfind(':'));
const int numNodes = net->getNumNodes();
for (int i = 0; i < numNodes; ++i)
{
Expand Down
178 changes: 145 additions & 33 deletions modules/dnn/src/layers/slice_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include "layers_common.hpp"
#include <opencv2/dnn/shape_utils.hpp>

#include <opencv2/core/utils/logger.hpp>

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif
Expand Down Expand Up @@ -204,58 +206,168 @@ class SliceLayerImpl : public SliceLayer
finalSliceRanges[i][j] = clamp(finalSliceRanges[i][j], inpShape[j]);
}
}

#if 0
std::cout << "DEBUG: DNN/Slice: " << outputs.size() << " inpShape=" << inpShape << std::endl;
for (int i = 0; i < outputs.size(); ++i)
{
for (int j = 0; j < finalSliceRanges[i].size(); ++j)
{
std::cout << finalSliceRanges[i][j];
}
std::cout << std::endl;
}
#endif
}

#ifdef HAVE_OPENCL
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
{
#if 1
// TODO fix that (brokes YOLOv4-tiny)
return false;
#else
std::vector<UMat> inputs;
std::vector<UMat> outputs;

bool use_half = (inputs_.depth() == CV_16S);
inputs_.getUMatVector(inputs);
outputs_.getUMatVector(outputs);

if (inputs[0].dims < 4 || (total(shape(outputs[0]), 0, 2) % 4 != 0) ||
(total(shape(outputs[0]), 2) % 4 != 0))
CV_Assert(outputs.size() == finalSliceRanges.size());

const UMat& input = inputs[0];
if (input.dims > 5)
{
CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
return false;
}

String opts;
if (use_half)
opts = "-DDtype=half -DDtype4=half4 -DDtype8=half8";
else
opts = "-DDtype=float -DDtype4=float4 -DDtype8=float8";
const UMat& inpMat = inputs[0];
size_t WSZ = 128;

const int dims = input.dims;
const int elemSize = (int)input.elemSize();
String opts0 = cv::format(
"-DDIMS=%d -DELEMSIZE=%d",
dims, elemSize
);
for (int d = 0; d < dims; d++)
{
opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
}
String kname = cv::format("slice_%d", dims);
for (size_t i = 0; i < outputs.size(); i++)
{
int groups = outputs[i].size[0];
int channels = outputs[i].size[1];
int rows = outputs[i].size[2];
int cols = outputs[i].size[3];
ocl::Kernel kernel("slice", ocl::dnn::slice_oclsrc, opts);
size_t local[] = { 128 };
size_t global[] = { (size_t)groups * channels / 4 * local[0] };
int idx = 0;
kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inpMat));
kernel.set(idx++, (int)(inpMat.size[2] * inpMat.size[3]));
kernel.set(idx++, (int)(rows * cols));
kernel.set(idx++, (int)inpMat.size[3]);
kernel.set(idx++, (int)cols);
kernel.set(idx++, (int)finalSliceRanges[i][2].start);
kernel.set(idx++, (int)finalSliceRanges[i][3].start);
kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i]));
bool ret = kernel.run(1, global, local, false);
UMat& output = outputs[i];
const std::vector<Range>& range = finalSliceRanges[i];

String opts = opts0;

CV_CheckEQ(output.dims, dims, "");
for (int d = 0; d < dims; d++)
{
opts += cv::format(" -DDST_STEP_%d=%d -DDST_SZ_%d=%d -DSRC_START_%d=%d",
d, (int)output.step[dims - 1 - d],
d, (int)output.size[dims - 1 - d],
d, (int)range[dims - 1 - d].start
);
CV_CheckEQ(range[d].size(), (int)output.size[d], "");
}

int block_dims = 0;
size_t block_size = elemSize;
for (int i = dims - 1; i >= 0; --i)
{
if (input.step[i] != output.step[i])
break;
block_size *= output.size[i];
block_dims++;
}

const size_t total = output.total() * elemSize;
size_t num_blocks = total / block_size;

if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
{
// use 1D copy mode
opts += cv::format(" -DUSE_COPY_1D=1");

opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims);
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);

opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
}
else
{
// use 2D copy mode
int block_cols = block_size;
int block_dims_contiguous = block_dims;
size_t input_base_step = input.step[dims - 1 - block_dims_contiguous];
size_t output_base_step = output.step[dims - 1 - block_dims_contiguous];

size_t block_rows = 1;
for (int i = dims - 1 - block_dims_contiguous; i >= 0; --i)
{
if (input.step[i] * output_base_step != output.step[i] * input_base_step)
break;
block_rows *= output.size[i];
block_dims++;
}

block_size *= block_rows;

num_blocks = total / block_size;

if (block_rows > 1)
{
opts += cv::format(" -DBLOCK_DIMS=%d", block_dims);
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);

opts += cv::format(" -DBLOCK_COLS=%d", (int)block_cols);

opts += cv::format(" -DBLOCK_ROWS=%d", (int)block_rows);
opts += cv::format(" -DBLOCK_SRC_STRIDE=%d", (int)input_base_step);
}
else
{
// use 1D copy mode
opts += cv::format(" -DUSE_COPY_1D=1");

opts += cv::format(" -DBLOCK_DIMS=%d", block_dims_contiguous);
opts += cv::format(" -DBLOCK_DIMS_CONTIGUOUS=%d", block_dims_contiguous);
opts += cv::format(" -DBLOCK_SIZE=%d", (int)block_size);

opts += cv::format(" -DBLOCK_COLS=%d", (int)block_size);
}
}

const size_t MIN_WORK_ITEMS = 16;
if (block_size <= 4 * MIN_WORK_ITEMS)
WSZ = 4;
else if (block_size <= 8 * MIN_WORK_ITEMS)
WSZ = 8;
else if (block_size <= 16 * MIN_WORK_ITEMS)
WSZ = 16;
else if (block_size <= 32 * MIN_WORK_ITEMS)
WSZ = 32;
else if (block_size <= 64 * MIN_WORK_ITEMS)
WSZ = 64;

opts += cv::format(" -DWSZ=%d", (int)WSZ);

size_t local[] = { WSZ, 1 };
size_t global[] = { WSZ, num_blocks };

ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
if (kernel.empty())
return false;
bool ret = kernel.args(
ocl::KernelArg::PtrReadOnly(input),
ocl::KernelArg::PtrWriteOnly(output)
)
.run(2, global, local, false);
if (!ret)
return false;
}
} // for outputs.size()

return true;
#endif
}
#endif

Expand Down
8 changes: 4 additions & 4 deletions modules/dnn/src/op_inf_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
#define INF_ENGINE_RELEASE_2020_4 2020040000

#ifndef INF_ENGINE_RELEASE
#warning("IE version have not been provided via command-line. Using 2020.3 by default")
#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2020_3
#warning("IE version have not been provided via command-line. Using 2020.4 by default")
#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2020_4
#endif

#define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
Expand All @@ -44,7 +44,7 @@
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif

#ifdef HAVE_DNN_IE_NN_BUILDER_2019
#if defined(HAVE_DNN_IE_NN_BUILDER_2019) || INF_ENGINE_VER_MAJOR_EQ(INF_ENGINE_RELEASE_2020_4)
//#define INFERENCE_ENGINE_DEPRECATED // turn off deprecation warnings from IE
//there is no way to suppress warnings from IE only at this moment, so we are forced to suppress warnings globally
#if defined(__GNUC__)
Expand All @@ -53,7 +53,7 @@
#ifdef _MSC_VER
#pragma warning(disable: 4996) // was declared deprecated
#endif
#endif // HAVE_DNN_IE_NN_BUILDER_2019
#endif

#if defined(__GNUC__) && INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2020_1)
#pragma GCC visibility push(default)
Expand Down
Loading

0 comments on commit 9b7b22e

Please sign in to comment.