Gl integration (AlexeyAB#8833)

* Start adding support for running darknet from OpenGL textures * Get CUDA-GL interop working (at least without segfaults...) * Copy the GL texture to the CUDA input buffer * Remove printf statements * Code cleanup * Fix a function call * Remove the #ifdef GPU guard in darknet.h * Use the width and height from the network instead of hard-coding * Install libgles2-mesa-dev for some ubuntu targets * Get the build working on Windows * Add an OpenGL-CUDA integration option for CMake * Bump the version-date in vcpkg.json * Remove ifdef in darknet.h for the network_predict_gl_texture() function * Use target_compile_definitions to set CUDA_OPENGL_INTEGRATION flag * Re-add the #ifdef CUDA_OPENGL_INTEGRATION guard --------- Co-authored-by: Brian Schwind <[email protected]>
yview-develop · Aug 26, 2023 · 2f4ab08 · 2f4ab08
1 parent dfc70de
commit 2f4ab08
Show file tree

Hide file tree

Showing 10 changed files with 157 additions and 12 deletions.
diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
@@ -24,7 +24,7 @@ jobs:
     - name: Update apt
       run: sudo apt update
     - name: Install dependencies
-      run: sudo apt install libopencv-dev
+      run: sudo apt install libopencv-dev libgles2-mesa-dev
 
     - name: 'Install CUDA'
       run: ${{ github.workspace }}/scripts/deploy-cuda.sh
@@ -293,7 +293,7 @@ jobs:
     - name: Update apt
       run: sudo apt update
     - name: Install dependencies
-      run: sudo apt install libopencv-dev
+      run: sudo apt install libopencv-dev libgles2-mesa-dev
 
     - uses: lukka/get-cmake@latest
 

diff --git a/.github/workflows/on_pr.yml b/.github/workflows/on_pr.yml
@@ -14,7 +14,7 @@ jobs:
     - name: Update apt
       run: sudo apt update
     - name: Install dependencies
-      run: sudo apt install libopencv-dev
+      run: sudo apt install libopencv-dev libgles2-mesa-dev
 
     - name: 'Install CUDA'
       run: ${{ github.workspace }}/scripts/deploy-cuda.sh
@@ -215,7 +215,7 @@ jobs:
     - name: Update apt
       run: sudo apt update
     - name: Install dependencies
-      run: sudo apt install libopencv-dev
+      run: sudo apt install libopencv-dev libgles2-mesa-dev
 
     - uses: lukka/get-cmake@latest
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -9,6 +9,7 @@ option(BUILD_USELIB_TRACK "Build uselib_track" ON)
 option(MANUALLY_EXPORT_TRACK_OPTFLOW "Manually export the TRACK_OPTFLOW=1 define" OFF)
 option(ENABLE_OPENCV "Enable OpenCV integration" ON)
 option(ENABLE_CUDA "Enable CUDA support" ON)
+cmake_dependent_option(ENABLE_CUDA_OPENGL_INTEGRATION "Build darknet with support for running networks straight from OpenGL textures" ON "ENABLE_CUDA" OFF)
 option(ENABLE_CUDNN "Enable CUDNN" ON)
 option(ENABLE_CUDNN_HALF "Enable CUDNN Half precision" ON)
 option(ENABLE_ZED_CAMERA "Enable ZED Camera support" ON)
@@ -327,6 +328,11 @@ endif()
 if(ENABLE_CUDA)
   if(MSVC)
     set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DGPU")
+
+    if(ENABLE_CUDA_OPENGL_INTEGRATION)
+      set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DCUDA_OPENGL_INTEGRATION")
+    endif()
+
     if(CUDNN_FOUND)
       set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} /DCUDNN")
     endif()
@@ -337,6 +343,11 @@ if(ENABLE_CUDA)
     set(CUDA_HOST_COMPILER_FLAGS "-Wno-deprecated-declarations -Xcompiler=\"${ADDITIONAL_CXX_FLAGS_COMMA_SEPARATED}\"")
   else()
     set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DGPU")
+
+    if(ENABLE_CUDA_OPENGL_INTEGRATION)
+      set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DCUDA_OPENGL_INTEGRATION")
+    endif()
+
     if(CUDNN_FOUND)
       set(ADDITIONAL_CXX_FLAGS "${ADDITIONAL_CXX_FLAGS} -DCUDNN")
     endif()
@@ -544,6 +555,11 @@ if(ENABLE_CUDA)
   target_compile_definitions(dark PUBLIC -DGPU)
 endif()
 
+if(ENABLE_CUDA_OPENGL_INTEGRATION)
+  target_compile_definitions(darknet PRIVATE -DCUDA_OPENGL_INTEGRATION)
+  target_compile_definitions(dark PUBLIC -DCUDA_OPENGL_INTEGRATION)
+endif()
+
 if(USE_INTEGRATED_LIBS AND WIN32)
   target_compile_definitions(darknet PRIVATE -D_TIMESPEC_DEFINED)
   target_compile_definitions(dark PRIVATE -D_TIMESPEC_DEFINED)

diff --git a/include/darknet.h b/include/darknet.h
@@ -1032,6 +1032,10 @@ LIB_API void diounms_sort(detection *dets, int total, int classes, float thresh,
 // network.h
 LIB_API float *network_predict(network net, float *input);
 LIB_API float *network_predict_ptr(network *net, float *input);
+#ifdef CUDA_OPENGL_INTEGRATION
+LIB_API float *network_predict_gl_texture(network *net, uint32_t texture_id);
+#endif // CUDA_OPENGL_INTEGRATION
+
 LIB_API void set_batch_network(network *net, int b);
 LIB_API detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter);
 LIB_API det_num_pair* network_predict_batch(network *net, image im, int batch_size, int w, int h, float thresh, float hier, int *map, int relative, int letter);

diff --git a/src/blas.c b/src/blas.c
@@ -342,32 +342,32 @@ void fill_cpu(int N, float ALPHA, float *X, int INCX)
     }
 }
 
-void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUTPUT)
 {
     int i, j;
     int index = 0;
     for(j = 0; j < B; ++j) {
         for(i = 0; i < NX; ++i){
-            if(X) X[j*NX + i] += OUT[index];
+            if(X) X[j*NX + i] += OUTPUT[index];
             ++index;
         }
         for(i = 0; i < NY; ++i){
-            if(Y) Y[j*NY + i] += OUT[index];
+            if(Y) Y[j*NY + i] += OUTPUT[index];
             ++index;
         }
     }
 }
 
-void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUTPUT)
 {
     int i, j;
     int index = 0;
     for(j = 0; j < B; ++j) {
         for(i = 0; i < NX; ++i){
-            OUT[index++] = X[j*NX + i];
+            OUTPUT[index++] = X[j*NX + i];
         }
         for(i = 0; i < NY; ++i){
-            OUT[index++] = Y[j*NY + i];
+            OUTPUT[index++] = Y[j*NY + i];
         }
     }
 }

diff --git a/src/dark_cuda.h b/src/dark_cuda.h
@@ -25,6 +25,18 @@ extern int gpu_index;
 #include <curand.h>
 #include <cublas_v2.h>
 #include <cuda_runtime_api.h>
+
+#ifdef CUDA_OPENGL_INTEGRATION
+// On Windows, we need to include <windows.h> before
+// including OpenGL headers or else we will get various
+// compiler errors due to missing macros.
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif // _WIN32
+
+#include <cuda_gl_interop.h>
+#endif // CUDA_OPENGL_INTEGRATION
 //#include <driver_types.h>
 
 #ifdef CUDNN

diff --git a/src/network.c b/src/network.c
@@ -776,6 +776,21 @@ float *network_predict(network net, float *input)
     return out;
 }
 
+#ifdef CUDA_OPENGL_INTEGRATION
+float *network_predict_gl_texture(network *net, uint32_t texture_id)
+{
+    if(net->batch != 1) {
+        set_batch_network(net, 1);
+    }
+
+    if(gpu_index >= 0) {
+        return network_predict_gpu_gl_texture(*net, texture_id);
+    }
+
+    return NULL;
+}
+#endif // CUDA_OPENGL_INTEGRATION
+
 int num_detections(network *net, float thresh)
 {
     int i;

diff --git a/src/network.h b/src/network.h
@@ -107,6 +107,7 @@ float train_networks(network *nets, int n, data d, int interval);
 void sync_nets(network *nets, int n, int interval);
 float train_network_datum_gpu(network net, float *x, float *y);
 float *network_predict_gpu(network net, float *input);
+float *network_predict_gpu_gl_texture(network net, uint32_t texture_id);
 float * get_network_output_gpu_layer(network net, int i);
 float * get_network_delta_gpu_layer(network net, int i);
 float *get_network_output_gpu(network net);

diff --git a/src/network_kernels.cu b/src/network_kernels.cu
@@ -747,3 +747,92 @@ float *network_predict_gpu(network net, float *input)
     //cuda_free(state.input);   // will be freed in the free_network()
     return out;
 }
+
+#ifdef CUDA_OPENGL_INTEGRATION
+float *network_predict_gpu_gl_texture(network net, uint32_t texture_id)
+{
+    if (net.gpu_index != cuda_get_device())
+        cuda_set_device(net.gpu_index);
+    int size = get_network_input_size(net) * net.batch;
+
+    // Map the OpenGL texture resource so CUDA can access it.
+    cudaGraphicsResource_t graphics_resource = NULL;
+    unsigned int flags = cudaGraphicsRegisterFlagsReadOnly;
+    CHECK_CUDA(cudaGraphicsGLRegisterImage(&graphics_resource, texture_id, GL_TEXTURE_2D, flags));
+    CHECK_CUDA(cudaGraphicsMapResources(1, &graphics_resource, 0));
+
+    void* dev_ptr = NULL;
+    cudaArray_t dev_array = NULL;
+    CHECK_CUDA(cudaGraphicsSubResourceGetMappedArray(&dev_array, graphics_resource, 0, 0));
+
+    size_t width = net.w;
+    size_t height = net.h;
+    size_t pitch = width * sizeof(float);
+
+    CHECK_CUDA(cudaMemcpy2DFromArray(
+            net.input_state_gpu,     // dst
+            pitch,                   // dst_pitch
+            dev_array,               // src
+            0,                       // width offset
+            0,                       // height offset
+            width * sizeof(float),   // width (in bytes)
+            height * net.c,          // height (in rows)
+            cudaMemcpyDeviceToDevice // Transfer type
+    ));
+
+    network_state state;
+    state.index = 0;
+    state.net = net;
+    state.input = net.input_state_gpu;
+    state.truth = 0;
+    state.train = 0;
+    state.delta = 0;
+
+    //cudaGraphExec_t instance = (cudaGraphExec_t)net.cuda_graph_exec;
+    static cudaGraphExec_t instance;
+
+    if ((*net.cuda_graph_ready) == 0) {
+        static cudaGraph_t graph;
+        if (net.use_cuda_graph == 1) {
+            int i;
+            for (i = 0; i < 16; ++i) switch_stream(i);
+
+            cudaStream_t stream0 = switch_stream(0);
+            CHECK_CUDA(cudaDeviceSynchronize());
+            printf("Try to capture graph... \n");
+            //cudaGraph_t graph = (cudaGraph_t)net.cuda_graph;
+            CHECK_CUDA(cudaStreamBeginCapture(stream0, cudaStreamCaptureModeGlobal));
+        }
+
+        // cuda_push_array(state.input, net.input_pinned_cpu, size);
+        forward_network_gpu(net, state);
+
+        if (net.use_cuda_graph == 1) {
+            cudaStream_t stream0 = switch_stream(0);
+            CHECK_CUDA(cudaStreamEndCapture(stream0, &graph));
+            CHECK_CUDA(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
+            (*net.cuda_graph_ready) = 1;
+            printf(" graph is captured... \n");
+            CHECK_CUDA(cudaDeviceSynchronize());
+        }
+        CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
+    }
+    else {
+        cudaStream_t stream0 = switch_stream(0);
+        //printf(" cudaGraphLaunch \n");
+        CHECK_CUDA( cudaGraphLaunch(instance, stream0) );
+        CHECK_CUDA( cudaStreamSynchronize(stream0) );
+        //printf(" ~cudaGraphLaunch \n");
+    }
+
+    float *out = get_network_output_gpu(net);
+    reset_wait_stream_events();
+    //cuda_free(state.input);   // will be freed in the free_network()
+
+    // Unmap the OpenGL texture.
+    cudaGraphicsUnmapResources(1, &graphics_resource, 0);
+    cudaGraphicsUnregisterResource(graphics_resource);
+
+    return out;
+}
+#endif // CUDA_OPENGL_INTEGRATION
diff --git a/vcpkg.json b/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "darknet",
-  "version": "0.2.5.5",
+  "version": "0.2.5.6",
   "description": "Darknet is an open source neural network framework written in C and CUDA. You only look once (YOLO) is a state-of-the-art, real-time object detection system, best example of darknet functionalities.",
   "homepage": "https://github.com/alexeyab/darknet",
   "dependencies": [
@@ -21,6 +21,13 @@
         "cudnn"
       ]
     },
+    "cuda-opengl-integration": {
+      "description": "Build darknet with support for running networks straight from OpenGL textures",
+      "dependencies": [
+        "cuda",
+        "opengl"
+      ]
+    },
     "full": {
       "description": "Build darknet fully featured",
       "dependencies": [
@@ -29,7 +36,8 @@
           "features": [
             "cuda",
             "cudnn",
-            "opencv-cuda"
+            "opencv-cuda",
+            "cuda-opengl-integration"
           ]
         }
       ]