OpenVX HIP GPU backend - add support for scalar output sync (ROCm#523)

- This PR fixes the Graph.VirtualArray conformance test for the HIP backend. The test uses num_corners (the scalar output value) of the FastCorner node, which was not supported in the HIP backend.
eidenyoshida · Jun 15, 2021 · 8ca699c · 8ca699c
1 parent be9dcaa
commit 8ca699c
Show file tree

Hide file tree

Showing 6 changed files with 29 additions and 11 deletions.
diff --git a/amd_openvx/openvx/ago/ago_haf_gpu_corners.cpp b/amd_openvx/openvx/ago/ago_haf_gpu_corners.cpp
@@ -55,13 +55,13 @@ int HafGpu_FastCorners_XY_U8(AgoNode * node)
 	node->opencl_param_atomic_mask = (1 << 0);
 	node->opencl_local_buffer_usage_mask = 0;
 	node->opencl_local_buffer_size_in_bytes = 0;
-	node->opencl_scalar_array_output_sync.enable = false;
+	node->gpu_scalar_array_output_sync.enable = false;
 	if (numCorners) {
 		// discard the scalar argument and inform the framework that it needs to be synched with array output numitems
 		node->opencl_param_discard_mask = (1 << 1);
-		node->opencl_scalar_array_output_sync.enable = true;
-		node->opencl_scalar_array_output_sync.paramIndexArray = 0;
-		node->opencl_scalar_array_output_sync.paramIndexScalar = 1;
+		node->gpu_scalar_array_output_sync.enable = true;
+		node->gpu_scalar_array_output_sync.paramIndexArray = 0;
+		node->gpu_scalar_array_output_sync.paramIndexScalar = 1;
 	}
 
 	if (useNonMax)

diff --git a/amd_openvx/openvx/ago/ago_internal.h b/amd_openvx/openvx/ago/ago_internal.h
@@ -606,12 +606,12 @@ struct AgoNode {
     vx_perf_t perf;
     vx_bool local_data_change_is_enabled;
     vx_bool local_data_set_by_implementation;
+    struct { bool enable; int paramIndexScalar; int paramIndexArray; } gpu_scalar_array_output_sync;
 #if ENABLE_OPENCL
     vx_uint32 opencl_type;
     char opencl_name[VX_MAX_KERNEL_NAME];
     std::string opencl_code;
     std::string opencl_build_options;
-    struct { bool enable; int paramIndexScalar; int paramIndexArray; } opencl_scalar_array_output_sync;
     vx_uint32 opencl_param_mem2reg_mask;
     vx_uint32 opencl_param_discard_mask;
     vx_uint32 opencl_param_as_value_mask;

diff --git a/amd_openvx/openvx/ago/ago_kernel_api.cpp b/amd_openvx/openvx/ago/ago_kernel_api.cpp
@@ -16642,6 +16642,11 @@ int agoKernel_FastCorners_XY_U8_Supression(AgoNode * node, AgoKernelCommand cmd)
         AgoData * oNumCorners = node->paramList[1];
         AgoData * iImg = node->paramList[2];
         vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f;
+        if (oNumCorners) {
+            node->gpu_scalar_array_output_sync.enable = true;
+            node->gpu_scalar_array_output_sync.paramIndexArray = 0;
+            node->gpu_scalar_array_output_sync.paramIndexScalar = 1;
+        }
         if (HipExec_FastCorners_XY_U8_Supression(node->hip_stream0, (vx_uint32)oXY->u.arr.capacity, oXY->hip_memory, oXY->gpu_buffer_offset,
             iImg->u.img.width, iImg->u.img.height, iImg->hip_memory + iImg->gpu_buffer_offset, iImg->u.img.stride_in_bytes, strength_threshold)) {
             status = VX_FAILURE;
@@ -16714,6 +16719,11 @@ int agoKernel_FastCorners_XY_U8_NoSupression(AgoNode * node, AgoKernelCommand cm
         AgoData * oNumCorners = node->paramList[1];
         AgoData * iImg = node->paramList[2];
         vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f;
+        if (oNumCorners) {
+            node->gpu_scalar_array_output_sync.enable = true;
+            node->gpu_scalar_array_output_sync.paramIndexArray = 0;
+            node->gpu_scalar_array_output_sync.paramIndexScalar = 1;
+        }
         if (HipExec_FastCorners_XY_U8_NoSupression(node->hip_stream0, (vx_uint32)oXY->u.arr.capacity, oXY->hip_memory, oXY->gpu_buffer_offset,
             iImg->u.img.width, iImg->u.img.height, iImg->hip_memory + iImg->gpu_buffer_offset, iImg->u.img.stride_in_bytes, strength_threshold)) {
             status = VX_FAILURE;

diff --git a/amd_openvx/openvx/ago/ago_util.cpp b/amd_openvx/openvx/ago/ago_util.cpp
@@ -3245,9 +3245,9 @@ AgoNode::AgoNode()
     memset(&paramListForAgeDelay, 0, sizeof(paramListForAgeDelay));
     memset(&funcExchange, 0, sizeof(funcExchange));
     memset(&perf, 0, sizeof(perf));
+    memset(&gpu_scalar_array_output_sync, 0, sizeof(gpu_scalar_array_output_sync));
 #if ENABLE_OPENCL
     memset(&opencl_name, 0, sizeof(opencl_name));
-    memset(&opencl_scalar_array_output_sync, 0, sizeof(opencl_scalar_array_output_sync));
     memset(&opencl_global_work, 0, sizeof(opencl_global_work));
     memset(&opencl_local_work, 0, sizeof(opencl_local_work));
 #endif

diff --git a/amd_openvx/openvx/ago/ago_util_hip.cpp b/amd_openvx/openvx/ago/ago_util_hip.cpp
@@ -757,6 +757,14 @@ int agoGpuHipSingleNodeWait(AgoGraph * graph, AgoNode * node)
         }
     }
 
+    if (node->gpu_scalar_array_output_sync.enable &&
+        node->paramList[node->gpu_scalar_array_output_sync.paramIndexScalar] &&
+        node->paramList[node->gpu_scalar_array_output_sync.paramIndexArray]) {
+        // updated scalar with numitems of array
+        node->paramList[node->gpu_scalar_array_output_sync.paramIndexScalar]->u.scalar.u.s =
+            node->paramList[node->gpu_scalar_array_output_sync.paramIndexArray]->u.arr.numitems;
+    }
+
     // The num items in an array should not exceed the capacity unless kernels need it for reporting number of items detected (ex. FAST corners)
     for (size_t index = 0; index < node->paramCount; index++) {
         if (node->paramList[index]) {

diff --git a/amd_openvx/openvx/ago/ago_util_opencl.cpp b/amd_openvx/openvx/ago/ago_util_opencl.cpp
@@ -2431,13 +2431,13 @@ int agoGpuOclSingleNodeWait(AgoGraph * graph, AgoNode * node)
 #endif
         }
     }
-    if (node->opencl_scalar_array_output_sync.enable && 
-        node->paramList[node->opencl_scalar_array_output_sync.paramIndexScalar] && 
-        node->paramList[node->opencl_scalar_array_output_sync.paramIndexArray])
+    if (node->gpu_scalar_array_output_sync.enable &&
+        node->paramList[node->gpu_scalar_array_output_sync.paramIndexScalar] &&
+        node->paramList[node->gpu_scalar_array_output_sync.paramIndexArray])
     {
         // updated scalar with numitems of array
-        node->paramList[node->opencl_scalar_array_output_sync.paramIndexScalar]->u.scalar.u.s =
-            node->paramList[node->opencl_scalar_array_output_sync.paramIndexArray]->u.arr.numitems;
+        node->paramList[node->gpu_scalar_array_output_sync.paramIndexScalar]->u.scalar.u.s =
+            node->paramList[node->gpu_scalar_array_output_sync.paramIndexArray]->u.arr.numitems;
     }
 
     // The num items in an array should not exceed the capacity unless kernels need it for reporting number of items detected (ex. FAST corners)