Skip to content

Commit

Permalink
Gl integration (AlexeyAB#8833)
Browse files Browse the repository at this point in the history
* Start adding support for running darknet from OpenGL textures

* Get CUDA-GL interop working (at least without segfaults...)

* Copy the GL texture to the CUDA input buffer

* Remove printf statements

* Code cleanup

* Fix a function call

* Remove the #ifdef GPU guard in darknet.h

* Use the width and height from the network instead of hard-coding

* Install libgles2-mesa-dev for some ubuntu targets

* Get the build working on Windows

* Add an OpenGL-CUDA integration option for CMake

* Bump the version-date in vcpkg.json

* Remove ifdef in darknet.h for the network_predict_gl_texture() function

* Use target_compile_definitions to set CUDA_OPENGL_INTEGRATION flag

* Re-add the #ifdef CUDA_OPENGL_INTEGRATION guard

---------

Co-authored-by: Brian Schwind <[email protected]>
  • Loading branch information
cenit and bschwind authored Aug 26, 2023
1 parent dfc70de commit 2f4ab08
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 12 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ccpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- name: Update apt
run: sudo apt update
- name: Install dependencies
run: sudo apt install libopencv-dev
run: sudo apt install libopencv-dev libgles2-mesa-dev

- name: 'Install CUDA'
run: ${{ github.workspace }}/scripts/deploy-cuda.sh
Expand Down Expand Up @@ -293,7 +293,7 @@ jobs:
- name: Update apt
run: sudo apt update
- name: Install dependencies
run: sudo apt install libopencv-dev
run: sudo apt install libopencv-dev libgles2-mesa-dev

- uses: lukka/get-cmake@latest

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
- name: Update apt
run: sudo apt update
- name: Install dependencies
run: sudo apt install libopencv-dev
run: sudo apt install libopencv-dev libgles2-mesa-dev

- name: 'Install CUDA'
run: ${{ github.workspace }}/scripts/deploy-cuda.sh
Expand Down Expand Up @@ -215,7 +215,7 @@ jobs:
- name: Update apt
run: sudo apt update
- name: Install dependencies
run: sudo apt install libopencv-dev
run: sudo apt install libopencv-dev libgles2-mesa-dev

- uses: lukka/get-cmake@latest

Expand Down
16 changes: 16 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ option(BUILD_USELIB_TRACK "Build uselib_track" ON)
option(MANUALLY_EXPORT_TRACK_OPTFLOW "Manually export the TRACK_OPTFLOW=1 define" OFF)
option(ENABLE_OPENCV "Enable OpenCV integration" ON)
option(ENABLE_CUDA "Enable CUDA support" ON)
cmake_dependent_option(ENABLE_CUDA_OPENGL_INTEGRATION "Build darknet with support for running networks straight from OpenGL textures" ON "ENABLE_CUDA" OFF)
option(ENABLE_CUDNN "Enable CUDNN" ON)
option(ENABLE_CUDNN_HALF "Enable CUDNN Half precision" ON)
option(ENABLE_ZED_CAMERA "Enable ZED Camera support" ON)
Expand Down Expand Up @@ -327,6 +328,11 @@ endif()
if(ENABLE_CUDA)
if(MSVC)
set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DGPU")

if(ENABLE_CUDA_OPENGL_INTEGRATION)
set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DCUDA_OPENGL_INTEGRATION")
endif()

if(CUDNN_FOUND)
set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DCUDNN")
endif()
Expand All @@ -337,6 +343,11 @@ if(ENABLE_CUDA)
set(CUDA_HOST_COMPILER_FLAGS "-Wno-deprecated-declarations -Xcompiler=\"${ADDITIONAL_CXX_FLAGS_COMMA_SEPARATED}\"")
else()
set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DGPU")

if(ENABLE_CUDA_OPENGL_INTEGRATION)
set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DCUDA_OPENGL_INTEGRATION")
endif()

if(CUDNN_FOUND)
set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DCUDNN")
endif()
Expand Down Expand Up @@ -544,6 +555,11 @@ if(ENABLE_CUDA)
target_compile_definitions(dark PUBLIC -DGPU)
endif()

if(ENABLE_CUDA_OPENGL_INTEGRATION)
target_compile_definitions(darknet PRIVATE -DCUDA_OPENGL_INTEGRATION)
target_compile_definitions(dark PUBLIC -DCUDA_OPENGL_INTEGRATION)
endif()

if(USE_INTEGRATED_LIBS AND WIN32)
target_compile_definitions(darknet PRIVATE -D_TIMESPEC_DEFINED)
target_compile_definitions(dark PRIVATE -D_TIMESPEC_DEFINED)
Expand Down
4 changes: 4 additions & 0 deletions include/darknet.h
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,10 @@ LIB_API void diounms_sort(detection *dets, int total, int classes, float thresh,
// network.h
LIB_API float *network_predict(network net, float *input);
LIB_API float *network_predict_ptr(network *net, float *input);
#ifdef CUDA_OPENGL_INTEGRATION
LIB_API float *network_predict_gl_texture(network *net, uint32_t texture_id);
#endif // CUDA_OPENGL_INTEGRATION

LIB_API void set_batch_network(network *net, int b);
LIB_API detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter);
LIB_API det_num_pair* network_predict_batch(network *net, image im, int batch_size, int w, int h, float thresh, float hier, int *map, int relative, int letter);
Expand Down
12 changes: 6 additions & 6 deletions src/blas.c
Original file line number Diff line number Diff line change
Expand Up @@ -342,32 +342,32 @@ void fill_cpu(int N, float ALPHA, float *X, int INCX)
}
}

void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUTPUT)
{
int i, j;
int index = 0;
for(j = 0; j < B; ++j) {
for(i = 0; i < NX; ++i){
if(X) X[j*NX + i] += OUT[index];
if(X) X[j*NX + i] += OUTPUT[index];
++index;
}
for(i = 0; i < NY; ++i){
if(Y) Y[j*NY + i] += OUT[index];
if(Y) Y[j*NY + i] += OUTPUT[index];
++index;
}
}
}

void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUTPUT)
{
int i, j;
int index = 0;
for(j = 0; j < B; ++j) {
for(i = 0; i < NX; ++i){
OUT[index++] = X[j*NX + i];
OUTPUT[index++] = X[j*NX + i];
}
for(i = 0; i < NY; ++i){
OUT[index++] = Y[j*NY + i];
OUTPUT[index++] = Y[j*NY + i];
}
}
}
Expand Down
12 changes: 12 additions & 0 deletions src/dark_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,18 @@ extern int gpu_index;
#include <curand.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#ifdef CUDA_OPENGL_INTEGRATION
// On Windows, we need to include <windows.h> before
// including OpenGL headers or else we will get various
// compiler errors due to missing macros.
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif // _WIN32

#include <cuda_gl_interop.h>
#endif // CUDA_OPENGL_INTEGRATION
//#include <driver_types.h>

#ifdef CUDNN
Expand Down
15 changes: 15 additions & 0 deletions src/network.c
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,21 @@ float *network_predict(network net, float *input)
return out;
}

#ifdef CUDA_OPENGL_INTEGRATION
float *network_predict_gl_texture(network *net, uint32_t texture_id)
{
if(net->batch != 1) {
set_batch_network(net, 1);
}

if(gpu_index >= 0) {
return network_predict_gpu_gl_texture(*net, texture_id);
}

return NULL;
}
#endif // CUDA_OPENGL_INTEGRATION

int num_detections(network *net, float thresh)
{
int i;
Expand Down
1 change: 1 addition & 0 deletions src/network.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ float train_networks(network *nets, int n, data d, int interval);
void sync_nets(network *nets, int n, int interval);
float train_network_datum_gpu(network net, float *x, float *y);
float *network_predict_gpu(network net, float *input);
float *network_predict_gpu_gl_texture(network net, uint32_t texture_id);
float * get_network_output_gpu_layer(network net, int i);
float * get_network_delta_gpu_layer(network net, int i);
float *get_network_output_gpu(network net);
Expand Down
89 changes: 89 additions & 0 deletions src/network_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -747,3 +747,92 @@ float *network_predict_gpu(network net, float *input)
//cuda_free(state.input); // will be freed in the free_network()
return out;
}

#ifdef CUDA_OPENGL_INTEGRATION
float *network_predict_gpu_gl_texture(network net, uint32_t texture_id)
{
if (net.gpu_index != cuda_get_device())
cuda_set_device(net.gpu_index);
int size = get_network_input_size(net) * net.batch;

// Map the OpenGL texture resource so CUDA can access it.
cudaGraphicsResource_t graphics_resource = NULL;
unsigned int flags = cudaGraphicsRegisterFlagsReadOnly;
CHECK_CUDA(cudaGraphicsGLRegisterImage(&graphics_resource, texture_id, GL_TEXTURE_2D, flags));
CHECK_CUDA(cudaGraphicsMapResources(1, &graphics_resource, 0));

void* dev_ptr = NULL;
cudaArray_t dev_array = NULL;
CHECK_CUDA(cudaGraphicsSubResourceGetMappedArray(&dev_array, graphics_resource, 0, 0));

size_t width = net.w;
size_t height = net.h;
size_t pitch = width * sizeof(float);

CHECK_CUDA(cudaMemcpy2DFromArray(
net.input_state_gpu, // dst
pitch, // dst_pitch
dev_array, // src
0, // width offset
0, // height offset
width * sizeof(float), // width (in bytes)
height * net.c, // height (in rows)
cudaMemcpyDeviceToDevice // Transfer type
));

network_state state;
state.index = 0;
state.net = net;
state.input = net.input_state_gpu;
state.truth = 0;
state.train = 0;
state.delta = 0;

//cudaGraphExec_t instance = (cudaGraphExec_t)net.cuda_graph_exec;
static cudaGraphExec_t instance;

if ((*net.cuda_graph_ready) == 0) {
static cudaGraph_t graph;
if (net.use_cuda_graph == 1) {
int i;
for (i = 0; i < 16; ++i) switch_stream(i);

cudaStream_t stream0 = switch_stream(0);
CHECK_CUDA(cudaDeviceSynchronize());
printf("Try to capture graph... \n");
//cudaGraph_t graph = (cudaGraph_t)net.cuda_graph;
CHECK_CUDA(cudaStreamBeginCapture(stream0, cudaStreamCaptureModeGlobal));
}

// cuda_push_array(state.input, net.input_pinned_cpu, size);
forward_network_gpu(net, state);

if (net.use_cuda_graph == 1) {
cudaStream_t stream0 = switch_stream(0);
CHECK_CUDA(cudaStreamEndCapture(stream0, &graph));
CHECK_CUDA(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
(*net.cuda_graph_ready) = 1;
printf(" graph is captured... \n");
CHECK_CUDA(cudaDeviceSynchronize());
}
CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
}
else {
cudaStream_t stream0 = switch_stream(0);
//printf(" cudaGraphLaunch \n");
CHECK_CUDA( cudaGraphLaunch(instance, stream0) );
CHECK_CUDA( cudaStreamSynchronize(stream0) );
//printf(" ~cudaGraphLaunch \n");
}

float *out = get_network_output_gpu(net);
reset_wait_stream_events();
//cuda_free(state.input); // will be freed in the free_network()

// Unmap the OpenGL texture.
cudaGraphicsUnmapResources(1, &graphics_resource, 0);
cudaGraphicsUnregisterResource(graphics_resource);

return out;
}
#endif // CUDA_OPENGL_INTEGRATION
12 changes: 10 additions & 2 deletions vcpkg.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "darknet",
"version": "0.2.5.5",
"version": "0.2.5.6",
"description": "Darknet is an open source neural network framework written in C and CUDA. You only look once (YOLO) is a state-of-the-art, real-time object detection system, best example of darknet functionalities.",
"homepage": "https://github.com/alexeyab/darknet",
"dependencies": [
Expand All @@ -21,6 +21,13 @@
"cudnn"
]
},
"cuda-opengl-integration": {
"description": "Build darknet with support for running networks straight from OpenGL textures",
"dependencies": [
"cuda",
"opengl"
]
},
"full": {
"description": "Build darknet fully featured",
"dependencies": [
Expand All @@ -29,7 +36,8 @@
"features": [
"cuda",
"cudnn",
"opencv-cuda"
"opencv-cuda",
"cuda-opengl-integration"
]
}
]
Expand Down

0 comments on commit 2f4ab08

Please sign in to comment.