Skip to content

Commit

Permalink
OpenVX HIP GPU backend - add support for scalar output sync (ROCm#523)
Browse files Browse the repository at this point in the history
- This PR fixes the Graph.VirtualArray conformance test for the HIP backend. This
test makes use of num_corners (the scalar output value) of the FastCorner node, which
was not supported in the HIP backend.
  • Loading branch information
AryanSalmanpour authored Jun 15, 2021
1 parent be9dcaa commit 8ca699c
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 11 deletions.
8 changes: 4 additions & 4 deletions amd_openvx/openvx/ago/ago_haf_gpu_corners.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ int HafGpu_FastCorners_XY_U8(AgoNode * node)
node->opencl_param_atomic_mask = (1 << 0);
node->opencl_local_buffer_usage_mask = 0;
node->opencl_local_buffer_size_in_bytes = 0;
node->opencl_scalar_array_output_sync.enable = false;
node->gpu_scalar_array_output_sync.enable = false;
if (numCorners) {
// discard the scalar argument and inform the framework that it needs to be synched with array output numitems
node->opencl_param_discard_mask = (1 << 1);
node->opencl_scalar_array_output_sync.enable = true;
node->opencl_scalar_array_output_sync.paramIndexArray = 0;
node->opencl_scalar_array_output_sync.paramIndexScalar = 1;
node->gpu_scalar_array_output_sync.enable = true;
node->gpu_scalar_array_output_sync.paramIndexArray = 0;
node->gpu_scalar_array_output_sync.paramIndexScalar = 1;
}

if (useNonMax)
Expand Down
2 changes: 1 addition & 1 deletion amd_openvx/openvx/ago/ago_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -606,12 +606,12 @@ struct AgoNode {
vx_perf_t perf;
vx_bool local_data_change_is_enabled;
vx_bool local_data_set_by_implementation;
struct { bool enable; int paramIndexScalar; int paramIndexArray; } gpu_scalar_array_output_sync;
#if ENABLE_OPENCL
vx_uint32 opencl_type;
char opencl_name[VX_MAX_KERNEL_NAME];
std::string opencl_code;
std::string opencl_build_options;
struct { bool enable; int paramIndexScalar; int paramIndexArray; } opencl_scalar_array_output_sync;
vx_uint32 opencl_param_mem2reg_mask;
vx_uint32 opencl_param_discard_mask;
vx_uint32 opencl_param_as_value_mask;
Expand Down
10 changes: 10 additions & 0 deletions amd_openvx/openvx/ago/ago_kernel_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16642,6 +16642,11 @@ int agoKernel_FastCorners_XY_U8_Supression(AgoNode * node, AgoKernelCommand cmd)
AgoData * oNumCorners = node->paramList[1];
AgoData * iImg = node->paramList[2];
vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f;
if (oNumCorners) {
node->gpu_scalar_array_output_sync.enable = true;
node->gpu_scalar_array_output_sync.paramIndexArray = 0;
node->gpu_scalar_array_output_sync.paramIndexScalar = 1;
}
if (HipExec_FastCorners_XY_U8_Supression(node->hip_stream0, (vx_uint32)oXY->u.arr.capacity, oXY->hip_memory, oXY->gpu_buffer_offset,
iImg->u.img.width, iImg->u.img.height, iImg->hip_memory + iImg->gpu_buffer_offset, iImg->u.img.stride_in_bytes, strength_threshold)) {
status = VX_FAILURE;
Expand Down Expand Up @@ -16714,6 +16719,11 @@ int agoKernel_FastCorners_XY_U8_NoSupression(AgoNode * node, AgoKernelCommand cm
AgoData * oNumCorners = node->paramList[1];
AgoData * iImg = node->paramList[2];
vx_float32 strength_threshold = node->paramList[3]->u.scalar.u.f;
if (oNumCorners) {
node->gpu_scalar_array_output_sync.enable = true;
node->gpu_scalar_array_output_sync.paramIndexArray = 0;
node->gpu_scalar_array_output_sync.paramIndexScalar = 1;
}
if (HipExec_FastCorners_XY_U8_NoSupression(node->hip_stream0, (vx_uint32)oXY->u.arr.capacity, oXY->hip_memory, oXY->gpu_buffer_offset,
iImg->u.img.width, iImg->u.img.height, iImg->hip_memory + iImg->gpu_buffer_offset, iImg->u.img.stride_in_bytes, strength_threshold)) {
status = VX_FAILURE;
Expand Down
2 changes: 1 addition & 1 deletion amd_openvx/openvx/ago/ago_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3245,9 +3245,9 @@ AgoNode::AgoNode()
memset(&paramListForAgeDelay, 0, sizeof(paramListForAgeDelay));
memset(&funcExchange, 0, sizeof(funcExchange));
memset(&perf, 0, sizeof(perf));
memset(&gpu_scalar_array_output_sync, 0, sizeof(gpu_scalar_array_output_sync));
#if ENABLE_OPENCL
memset(&opencl_name, 0, sizeof(opencl_name));
memset(&opencl_scalar_array_output_sync, 0, sizeof(opencl_scalar_array_output_sync));
memset(&opencl_global_work, 0, sizeof(opencl_global_work));
memset(&opencl_local_work, 0, sizeof(opencl_local_work));
#endif
Expand Down
8 changes: 8 additions & 0 deletions amd_openvx/openvx/ago/ago_util_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,14 @@ int agoGpuHipSingleNodeWait(AgoGraph * graph, AgoNode * node)
}
}

if (node->gpu_scalar_array_output_sync.enable &&
node->paramList[node->gpu_scalar_array_output_sync.paramIndexScalar] &&
node->paramList[node->gpu_scalar_array_output_sync.paramIndexArray]) {
// updated scalar with numitems of array
node->paramList[node->gpu_scalar_array_output_sync.paramIndexScalar]->u.scalar.u.s =
node->paramList[node->gpu_scalar_array_output_sync.paramIndexArray]->u.arr.numitems;
}

// The num items in an array should not exceed the capacity unless kernels need it for reporting number of items detected (ex. FAST corners)
for (size_t index = 0; index < node->paramCount; index++) {
if (node->paramList[index]) {
Expand Down
10 changes: 5 additions & 5 deletions amd_openvx/openvx/ago/ago_util_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2431,13 +2431,13 @@ int agoGpuOclSingleNodeWait(AgoGraph * graph, AgoNode * node)
#endif
}
}
if (node->opencl_scalar_array_output_sync.enable &&
node->paramList[node->opencl_scalar_array_output_sync.paramIndexScalar] &&
node->paramList[node->opencl_scalar_array_output_sync.paramIndexArray])
if (node->gpu_scalar_array_output_sync.enable &&
node->paramList[node->gpu_scalar_array_output_sync.paramIndexScalar] &&
node->paramList[node->gpu_scalar_array_output_sync.paramIndexArray])
{
// updated scalar with numitems of array
node->paramList[node->opencl_scalar_array_output_sync.paramIndexScalar]->u.scalar.u.s =
node->paramList[node->opencl_scalar_array_output_sync.paramIndexArray]->u.arr.numitems;
node->paramList[node->gpu_scalar_array_output_sync.paramIndexScalar]->u.scalar.u.s =
node->paramList[node->gpu_scalar_array_output_sync.paramIndexArray]->u.arr.numitems;
}

// The num items in an array should not exceed the capacity unless kernels need it for reporting number of items detected (ex. FAST corners)
Expand Down

0 comments on commit 8ca699c

Please sign in to comment.