Skip to content

Commit

Permalink
cuda: fix context for direct copy and desktop gl 0-copy
Browse files Browse the repository at this point in the history
  • Loading branch information
wang-bin committed Mar 15, 2016
1 parent 8aa8652 commit 1c6b1ea
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 45 deletions.
44 changes: 28 additions & 16 deletions src/codec/video/SurfaceInteropCUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@
namespace QtAV {
namespace cuda {

// Constructs an InteropResource and zero-fills the res array.
// NOTE(review): this span shows two signatures and two initializer lists back to back;
// they appear to be the removed (three-argument) and added (zero-argument) sides of the
// diff. In the zero-argument form every handle starts at 0 and is supplied later through
// setDevice()/setShareContext()/setDecoder()/setLock() declared in SurfaceInteropCUDA.h.
// NOTE(review): share_ctx is not part of either initializer list, yet ~InteropResource()
// evaluates `!share_ctx && ctx`. Until setShareContext() or a subclass assigns it, the
// flag holds an indeterminate value; consider initializing it to false here.
InteropResource::InteropResource(CUdevice d, CUvideodecoder decoder, CUvideoctxlock declock)
InteropResource::InteropResource()
: cuda_api()
, dev(d)
, ctx(NULL)
, dec(decoder)
, lock(declock)
, dev(0)
, ctx(0)
, dec(0)
, lock(0)
{
// Clear every entry of res so later checks on its fields start from zero.
memset(res, 0, sizeof(res));
}
Expand All @@ -53,7 +53,8 @@ InteropResource::~InteropResource()
CUDA_WARN(cuStreamDestroy(res[1].stream));

// FIXME: we own the context. But why crash to destroy ctx? CUDA_ERROR_INVALID_VALUE
//CUDA_ENSURE(cuCtxDestroy(ctx));
if (!share_ctx && ctx)
CUDA_ENSURE(cuCtxDestroy(ctx));
}

void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS &param, int width, int height, int coded_height)
Expand Down Expand Up @@ -94,23 +95,25 @@ void* InteropResource::mapToHost(const VideoFormat &format, void *handle, int pi
}

#ifndef QT_NO_OPENGL
// Constructs a HostInteropResource with an empty host buffer description.
// NOTE(review): the three-argument signature with its `ctx(0)` initializer and the
// zero-argument signature appear to be the removed and added sides of the diff.
HostInteropResource::HostInteropResource(CUdevice d, CUvideodecoder decoder, CUvideoctxlock lk)
: InteropResource(d, decoder, lk)
, ctx(0)
HostInteropResource::HostInteropResource()
: InteropResource()
{
// Zero the whole host_mem struct, so host_mem.data starts as a null pointer.
memset(&host_mem, 0, sizeof(host_mem));
// map() compares host_mem.index with the requested picIndex and stores picIndex after a
// copy; -1 marks that no picture has been copied yet.
host_mem.index = -1;
}

// Releases host_mem.data with cuMemFreeHost. When ctx is set, ctx is pushed before the
// free and popped afterwards.
HostInteropResource::~HostInteropResource()
{
// NOTE(review): the two consecutive `if (ctx) {` lines appear to be the removed and added
// sides of the diff; only one of them belongs in the real file.
if (ctx) { //cuMemFreeHost need the context of mem allocated
if (ctx) { //cuMemFreeHost needs the context the memory was allocated in: either the shared context or our own context
CUDA_WARN(cuCtxPushCurrent(ctx));
}
// NOTE(review): elsewhere in this file CUDA_ENSURE is given a second argument that looks
// like a return value, which suggests the macro returns on failure. If so, a failed
// cuMemFreeHost leaves host_mem.data set and skips the cuCtxPopCurrent below -- confirm.
if (host_mem.data) { //FIXME: CUDA_ERROR_INVALID_VALUE
CUDA_ENSURE(cuMemFreeHost(host_mem.data));
host_mem.data = NULL;
}
// Undo the push made above.
if (ctx) {
CUDA_WARN(cuCtxPopCurrent(NULL));
}
}

bool HostInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
Expand All @@ -119,6 +122,7 @@ bool HostInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint
if (host_mem.index != picIndex || !host_mem.data) {
AutoCtxLock locker((cuda_api*)this, lock);
Q_UNUSED(locker);

CUdeviceptr devptr;
unsigned int pitch;
//qDebug("index: %d=>%d, plane: %d", host_mem.index, picIndex, plane);
Expand All @@ -127,6 +131,7 @@ bool HostInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint
Q_UNUSED(unmapper);
if (!ensureResource(pitch, H)) //copy height is coded height
return false;
// the same thread (context) as cuMemAllocHost, so no context switch is needed
CUDA_ENSURE(cuMemcpyDtoH(host_mem.data, devptr, pitch*H*3/2), NULL);
host_mem.index = picIndex;
}
Expand Down Expand Up @@ -163,9 +168,14 @@ bool HostInteropResource::ensureResource(int pitch, int height)
if (!ctx) {
CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
CUDA_WARN(cuCtxPopCurrent(&ctx));
share_ctx = false;
}
if (!share_ctx) // cuMemFreeHost will be called in dtor which is not the current thread.
CUDA_WARN(cuCtxPushCurrent(ctx));
// NV12
CUDA_ENSURE(cuMemAllocHost((void**)&host_mem.data, pitch*height*3/2), NULL);
if (!share_ctx)
CUDA_WARN(cuCtxPopCurrent(NULL)); //can be null or &ctx
return true;
}
#endif //QT_NO_OPENGL
Expand Down Expand Up @@ -242,8 +252,8 @@ class EGL {
#endif //EGL_VERSION_1_5
};

EGLInteropResource::EGLInteropResource(CUdevice d, CUvideodecoder decoder, CUvideoctxlock declock)
: InteropResource(d, decoder, declock)
EGLInteropResource::EGLInteropResource()
: InteropResource()
, egl(new EGL())
, dll9(NULL)
, d3d9(NULL)
Expand All @@ -254,6 +264,8 @@ EGLInteropResource::EGLInteropResource(CUdevice d, CUvideodecoder decoder, CUvid
, surface9_nv12(NULL)
, query9(NULL)
{
ctx = NULL; //need a context created with d3d (TODO: check it?)
share_ctx = false;
}

EGLInteropResource::~EGLInteropResource()
Expand Down Expand Up @@ -337,6 +349,10 @@ bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H)
TexRes &r = res[0];// 1 NV12 texture
if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes)
return true;
if (share_ctx) {
share_ctx = false;
ctx = NULL;
}
if (!ctx) {
// TODO: how to use pop/push decoder's context without the context in opengl context
if (!ensureD3DDevice())
Expand Down Expand Up @@ -589,10 +605,6 @@ bool EGLInteropResource::map(IDirect3DSurface9* surface, GLuint tex, int w, int
#if QTAV_HAVE(CUDA_GL)
namespace QtAV {
namespace cuda {
GLInteropResource::GLInteropResource(CUdevice d, CUvideodecoder decoder, CUvideoctxlock lk)
: InteropResource(d, decoder, lk)
{}

bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
AutoCtxLock locker((cuda_api*)this, lock);
Expand Down
18 changes: 12 additions & 6 deletions src/codec/video/SurfaceInteropCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,15 @@ namespace cuda {
class InteropResource : protected cuda_api
{
public:
InteropResource(CUdevice d, CUvideodecoder decoder, CUvideoctxlock declock);
~InteropResource();
InteropResource();
/// Stores the CUDA device handle in dev.
void setDevice(CUdevice d)
{
    dev = d;
}
/*!
 * \brief Stores \a c as this resource's context.
 * share_ctx becomes true only when \a c is non-null. ~InteropResource() calls
 * cuCtxDestroy(ctx) only when share_ctx is false and ctx is set.
 */
void setShareContext(CUcontext c)
{
    share_ctx = c ? true : false;
    ctx = c;
}
/// Stores the video decoder handle in dec.
void setDecoder(CUvideodecoder d)
{
    dec = d;
}
/// Stores the video context lock in lock.
void setLock(CUvideoctxlock l)
{
    lock = l;
}
virtual ~InteropResource();
/// copy from gpu (optimized if possible) and convert to target format if necessary
// mapToHost
/*!
Expand All @@ -64,6 +71,7 @@ class InteropResource : protected cuda_api
/// copy from gpu and convert to target format if necessary. used by VideoCapture
void* mapToHost(const VideoFormat &format, void *handle, int picIndex, const CUVIDPROCPARAMS &param, int width, int height, int surface_height);
protected:
bool share_ctx;
CUdevice dev;
CUcontext ctx;
CUvideodecoder dec;
Expand Down Expand Up @@ -112,14 +120,13 @@ class SurfaceInteropCUDA Q_DECL_FINAL: public VideoSurfaceInterop
class HostInteropResource Q_DECL_FINAL: public InteropResource
{
public:
HostInteropResource(CUdevice d, CUvideodecoder decoder, CUvideoctxlock lk);
HostInteropResource();
~HostInteropResource();
bool map(int picIndex, const CUVIDPROCPARAMS& param, GLuint tex, int w, int h, int H, int plane) Q_DECL_OVERRIDE;
bool unmap(GLuint) Q_DECL_OVERRIDE;
private:
bool ensureResource(int pitch, int height);

CUcontext ctx;
struct {
int index;
uchar* data;
Expand All @@ -140,7 +147,7 @@ class EGL;
class EGLInteropResource Q_DECL_FINAL: public InteropResource
{
public:
EGLInteropResource(CUdevice d, CUvideodecoder decoder, CUvideoctxlock declock);
EGLInteropResource();
~EGLInteropResource();
bool map(int picIndex, const CUVIDPROCPARAMS& param, GLuint tex, int w, int h, int H, int plane) Q_DECL_OVERRIDE;
private:
Expand Down Expand Up @@ -168,7 +175,6 @@ class EGLInteropResource Q_DECL_FINAL: public InteropResource
class GLInteropResource Q_DECL_FINAL: public InteropResource
{
public:
GLInteropResource(CUdevice d, CUvideodecoder decoder, CUvideoctxlock lk);
bool map(int picIndex, const CUVIDPROCPARAMS& param, GLuint tex, int w, int h, int H, int plane) Q_DECL_OVERRIDE;
bool unmap(GLuint tex) Q_DECL_OVERRIDE;
private:
Expand Down
58 changes: 35 additions & 23 deletions src/codec/video/VideoDecoderCUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ class VideoDecoderCUDAPrivate Q_DECL_FINAL: public VideoDecoderPrivate
return;
if (!isLoaded()) //cuda_api
return;
interop_res = cuda::InteropResourcePtr();
}
~VideoDecoderCUDAPrivate() {
if (bitstream_filter_ctx)
Expand All @@ -198,6 +199,30 @@ class VideoDecoderCUDAPrivate Q_DECL_FINAL: public VideoDecoderPrivate
bool initCuda();
bool releaseCuda();
bool createCUVIDDecoder(cudaVideoCodec cudaCodec, int cw, int ch);
// Creates the interop resource that matches copy_mode and passes it the decoder's CUDA
// state (cudev, cuctx, dec, vid_ctx_lock).
//  - ZeroCopy: GLInteropResource when OpenGLHelper::isOpenGLES() is false (compiled only
//    with QTAV_HAVE(CUDA_GL)); EGLInteropResource when it is true (compiled only with
//    QTAV_HAVE(CUDA_EGL)).
//  - DirectCopy: HostInteropResource, compiled only when QT_NO_OPENGL is not defined.
// When no branch assigns a resource, interop_res keeps its previous value; the setters at
// the end run only if interop_res is non-null.
void createInterop() {
if (copy_mode == VideoDecoderCUDA::ZeroCopy) {
#if QTAV_HAVE(CUDA_GL)
if (!OpenGLHelper::isOpenGLES())
interop_res = cuda::InteropResourcePtr(new cuda::GLInteropResource());
#endif //QTAV_HAVE(CUDA_GL)
#if QTAV_HAVE(CUDA_EGL)
if (OpenGLHelper::isOpenGLES())
interop_res = cuda::InteropResourcePtr(new cuda::EGLInteropResource());
#endif //QTAV_HAVE(CUDA_EGL)
}
#ifndef QT_NO_OPENGL
// This `else if` pairs with the ZeroCopy `if` above; it is dropped entirely when
// QT_NO_OPENGL is defined.
else if (copy_mode == VideoDecoderCUDA::DirectCopy) {
interop_res = cuda::InteropResourcePtr(new cuda::HostInteropResource());
}
#endif //QT_NO_OPENGL
// Nothing to configure when no interop resource exists.
if (!interop_res)
return;
interop_res->setDevice(cudev);
interop_res->setShareContext(cuctx); //if the context is not shared, the interop resource will create its own context; context switch is slow
interop_res->setDecoder(dec);
interop_res->setLock(vid_ctx_lock);
}

bool createCUVIDParser();
bool flushParser();
bool processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame = 0);
Expand Down Expand Up @@ -243,6 +268,7 @@ class VideoDecoderCUDAPrivate Q_DECL_FINAL: public VideoDecoderPrivate
// how about parser.ulMaxNumDecodeSurfaces? recreate?
AVCodecID codec = mapCodecToFFmpeg(cuvidfmt->codec);
p->setBSF(codec);
p->createInterop();
}
//TODO: lavfilter
return 1;
Expand Down Expand Up @@ -470,11 +496,15 @@ bool VideoDecoderCUDAPrivate::open()
if (!isLoaded()) //cuda_api
return false;
if (!cuctx)
available = initCuda();
initCuda();
setBSF(codec_ctx->codec_id);
// max decoder surfaces is computed in createCUVIDDecoder. createCUVIDParser use the value
return createCUVIDDecoder(mapCodecFromFFmpeg(codec_ctx->codec_id), codec_ctx->coded_width, codec_ctx->coded_height)
&& createCUVIDParser();
if (!createCUVIDDecoder(mapCodecFromFFmpeg(codec_ctx->codec_id), codec_ctx->coded_width, codec_ctx->coded_height))
return false;
if (!createCUVIDParser())
return false;
available = true;
return true;
}

bool VideoDecoderCUDAPrivate::initCuda()
Expand All @@ -491,7 +521,7 @@ bool VideoDecoderCUDAPrivate::initCuda()
description = QStringLiteral("CUDA device: %1 %2.%3 %4 MHz @%5").arg(QLatin1String((const char*)devname)).arg(major).arg(minor).arg(clockRate/1000).arg(cudev);

// cuD3DCtxCreate > cuGLCtxCreate(deprecated) > cuCtxCreate (fallback if d3d and gl return status is failed)
CUDA_ENSURE(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev), false); //CU_CTX_SCHED_AUTO?
CUDA_ENSURE(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev), false); //CU_CTX_SCHED_AUTO: slower in my test
CUDA_ENSURE(cuCtxPopCurrent(&cuctx), false);
CUDA_ENSURE(cuvidCtxLockCreate(&vid_ctx_lock, cuctx), 0);
{
Expand All @@ -510,7 +540,7 @@ bool VideoDecoderCUDAPrivate::releaseCuda()
{
available = false;
if (cuctx)
CUDA_WARN(cuCtxPushCurrent(cuctx)); //cuMemFreeHost need the context
CUDA_WARN(cuCtxPushCurrent(cuctx)); //cuMemFreeHost need the context of cuMemAllocHost which was called in VideoThread, while releaseCuda() in dtor can be called in any thread
if (!can_load)
return true;
if (dec) {
Expand Down Expand Up @@ -584,24 +614,6 @@ bool VideoDecoderCUDAPrivate::createCUVIDDecoder(cudaVideoCodec cudaCodec, int c
available = false;
CUDA_ENSURE(cuvidCreateDecoder(&dec, &dec_create_info), false);
available = true;
if (copy_mode == VideoDecoderCUDA::ZeroCopy) {
#if QTAV_HAVE(CUDA_GL)
// TODO: runtime gles check
if (!OpenGLHelper::isOpenGLES())
interop_res = cuda::InteropResourcePtr(new cuda::GLInteropResource(cudev, dec, vid_ctx_lock));
#endif //QTAV_HAVE(CUDA_GL)
#if QTAV_HAVE(CUDA_EGL)
// TODO: runtime gles check
if (OpenGLHelper::isOpenGLES())
interop_res = cuda::InteropResourcePtr(new cuda::EGLInteropResource(cudev, dec, vid_ctx_lock));
#endif //QTAV_HAVE(CUDA_EGL)

}
#ifndef QT_NO_OPENGL
else if (copy_mode == VideoDecoderCUDA::DirectCopy) {
interop_res = cuda::InteropResourcePtr(new cuda::HostInteropResource(cudev, dec, vid_ctx_lock));
}
#endif //QT_NO_OPENGL
return true;
}

Expand Down

0 comments on commit 1c6b1ea

Please sign in to comment.