Skip to content

Commit

Permalink
fast copy from USWC using sse4.1. not supported by va
Browse files Browse the repository at this point in the history
TODO: add memcpy like function. configurable cache size
  • Loading branch information
wang-bin committed May 3, 2014
1 parent 1d3cff1 commit ddf0a59
Show file tree
Hide file tree
Showing 9 changed files with 393 additions and 0 deletions.
2 changes: 2 additions & 0 deletions QtAV.pro
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ OTHER_FILES += \

EssentialDepends = avutil avcodec avformat swscale
OptionalDepends = \
sse2 \
sse4_1 \
swresample \
avresample \
gl
Expand Down
52 changes: 52 additions & 0 deletions config.tests/sse2/sse2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/****************************************************************************
**
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
** Contact: http://www.qt-project.org/legal
**
** This file is part of the config.tests of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and Digia. For licensing terms and
** conditions see http://qt.digia.com/licensing. For further information
** use the contact form at http://qt.digia.com/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Digia gives you certain additional
** rights. These rights are described in the Digia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 3.0 as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU General Public License version 3.0 requirements will be
** met: http://www.gnu.org/copyleft/gpl.html.
**
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include <emmintrin.h>
#if defined(__GNUC__) && __GNUC__ < 4 && __GNUC_MINOR__ < 3
#error GCC < 3.2 is known to create internal compiler errors with our MMX code
#endif

// SSE2 configure probe: references SSE2 integer intrinsics from <emmintrin.h>
// so that the file only builds when the compiler accepts them.
// NOTE(review): presumably this is only compiled/linked by the configure
// step, never executed -- confirm.
int main(int, char**)
{
    const __m128i zero = _mm_setzero_si128();
    // Data and mask operands are both all-zero vectors; the destination is a null pointer.
    _mm_maskmoveu_si128(zero, zero, 0);
    return 0;
}
7 changes: 7 additions & 0 deletions config.tests/sse2/sse2.pro
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Configure probe: builds sse2.cpp to find out whether the compiler supports SSE2.
SOURCES = sse2.cpp
# Build a plain debug console program: no Qt, no dylib, no release/debug_and_release.
CONFIG -= qt dylib release debug_and_release
CONFIG += debug console
# Original note: Qt 5 only has gcc, qcc, vc and Linux icc (clang is an open question).
# NOTE(review): presumably this refers to which mkspecs define QMAKE_CFLAGS_SSE2 -- confirm.
# For win32-icc the flag is added here (*= adds it only if it is not already present).
win32-icc: QMAKE_CFLAGS_SSE2 *= -arch:SSE2
# Abort the probe when no SSE2 flag is known; otherwise compile the probe with it.
isEmpty(QMAKE_CFLAGS_SSE2):error("This compiler does not support SSE2")
else:QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE2
51 changes: 51 additions & 0 deletions config.tests/sse4_1/sse4_1.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/****************************************************************************
**
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
** Contact: http://www.qt-project.org/legal
**
** This file is part of the config.tests of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and Digia. For licensing terms and
** conditions see http://qt.digia.com/licensing. For further information
** use the contact form at http://qt.digia.com/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Digia gives you certain additional
** rights. These rights are described in the Digia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 3.0 as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU General Public License version 3.0 requirements will be
** met: http://www.gnu.org/copyleft/gpl.html.
**
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include <smmintrin.h>

// SSE4.1 configure probe: references one floating-point and one integer
// SSE4.1 intrinsic from <smmintrin.h> so that the file only builds when the
// compiler accepts them.
int main(int, char**)
{
    const __m128 zeros = _mm_setzero_ps();
    _mm_ceil_ps(zeros);

    const __m128i lhs = _mm_set1_epi32(42);
    const __m128i rhs = _mm_set1_epi32(64);
    __m128i product = _mm_mullo_epi32(lhs, rhs);
    (void)product; // silence the unused-variable warning
    return 0;
}
7 changes: 7 additions & 0 deletions config.tests/sse4_1/sse4_1.pro
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Configure probe: builds sse4_1.cpp to find out whether the compiler supports SSE4.1.
SOURCES = sse4_1.cpp
# Build a plain debug console program: no Qt, no dylib, no release/debug_and_release.
CONFIG -= qt dylib release debug_and_release
CONFIG += debug console
# Original note: Qt 5 only has gcc, qcc, vc and Linux icc (clang is an open question).
# NOTE(review): presumably this refers to which mkspecs define QMAKE_CFLAGS_SSE4_1 -- confirm.
# For win32-icc the flag is added here (*= adds it only if it is not already present).
win32-icc: QMAKE_CFLAGS_SSE4_1 *= -arch:SSE4.1
# Abort the probe when no SSE4.1 flag is known; otherwise compile the probe with it.
isEmpty(QMAKE_CFLAGS_SSE4_1):error("This compiler does not support SSE4.1")
else:QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE4_1
11 changes: 11 additions & 0 deletions src/libQtAV.pro
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ win32 {
OTHER_FILES += $$RC_FILE
TRANSLATIONS = $${PROJECTROOT}/i18n/QtAV_zh_CN.ts

# Enable the SSE4.1 code path when the sse4_1 config test passed or the target
# architecture advertises sse4.1: define QTAV_HAVE_SSE4_1 and add the compiler flag.
# NOTE(review): QMAKE_CXXFLAGS is project-wide, so the flag is applied to every
# C++ source of the library, not only to utils/GPUMemCopy.cpp -- confirm this is intended.
config_sse4_1|contains(TARGET_ARCH_SUB, sse4.1) {
DEFINES += QTAV_HAVE_SSE4_1=1
QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE4_1 #gcc -msse4.1
}
# Enable the SSE2 code path when the sse2 config test passed or the target
# architecture advertises sse2: define QTAV_HAVE_SSE2 and add the compiler flag.
config_sse2|contains(TARGET_ARCH_SUB, sse2) {
DEFINES += QTAV_HAVE_SSE2=1
QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE2 #gcc -msse2
}

*msvc* {
#link FFmpeg and portaudio which are built by gcc need /SAFESEH:NO
QMAKE_LFLAGS += /SAFESEH:NO
Expand Down Expand Up @@ -166,6 +175,7 @@ config_libcedarv {
SOURCES += \
QtAV_Compat.cpp \
QtAV_Global.cpp \
utils/GPUMemCopy.cpp \
AudioThread.cpp \
AVThread.cpp \
AudioDecoder.cpp \
Expand Down Expand Up @@ -256,6 +266,7 @@ SDK_HEADERS *= \

HEADERS *= \
$$SDK_HEADERS \
utils/GPUMemCopy.h \
QtAV/prepost.h \
QtAV/AVDemuxThread.h \
QtAV/AVThread.h \
Expand Down
224 changes: 224 additions & 0 deletions src/utils/GPUMemCopy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
#include "GPUMemCopy.h"

#include <stdlib.h>
#include <string.h>
#include <stddef.h>

#include <algorithm>

#include "QtAV/QtAV_Compat.h"

// for mingw gcc
#if QTAV_HAVE(SSE4_1)
#include <smmintrin.h> //stream load
#endif


/* Branch prediction */
#ifdef __GNUC__
# define likely(p) __builtin_expect(!!(p), 1)
# define unlikely(p) __builtin_expect(!!(p), 0)
#else
# define likely(p) (!!(p))
# define unlikely(p) (!!(p))
#endif

namespace QtAV {

#if QTAV_HAVE(SSE2) //FIXME
// from vlc_common.h begin
#ifdef __MINGW32__
# define Memalign(align, size) (__mingw_aligned_malloc(size, align))
# define Free(base) (__mingw_aligned_free(base))
#elif defined(_MSC_VER)
# define Memalign(align, size) (_aligned_malloc(size, align))
# define Free(base) (_aligned_free(base))
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
/*
 * Fallback aligned allocator built on plain malloc().
 *
 * Over-allocates by `align` bytes, advances the pointer to the next multiple
 * of `align` strictly above the malloc() result and records the distance
 * (1..align) in the byte just before the returned address so that Free() can
 * recover the original pointer.
 *
 * align: power of two, at most 128 (the distance must fit in one unsigned byte).
 * size:  number of usable bytes requested.
 * Returns the aligned pointer, or NULL on allocation failure or size overflow.
 * Release the result with Free(), never with free().
 */
static inline void *Memalign(size_t align, size_t size)
{
    unsigned char *raw;
    unsigned char *aligned;
    size_t diff;

    /* Reject requests whose padded size would wrap around. */
    if (size > (size_t)-1 - align)
        return NULL;

    raw = (unsigned char *)malloc(size + align);
    if (!raw)
        return NULL;

    /* ~addr == -addr - 1, so this is the distance to the next boundary: 1..align. */
    diff = (~(size_t)raw & (align - 1)) + 1;
    aligned = raw + diff;
    /* Stored unsigned so that a distance of 128 is not read back as a negative value. */
    aligned[-1] = (unsigned char)diff;
    return aligned;
}

/*
 * Releases a pointer obtained from Memalign(). NULL is accepted and ignored.
 */
static void Free(void *ptr)
{
    if (ptr) {
        unsigned char *aligned = (unsigned char *)ptr;
        /* The byte before the block holds the distance back to the malloc() pointer. */
        free(aligned - aligned[-1]);
    }
}
#else
/*
 * Aligned allocation through posix_memalign().
 * Returns the allocated block, or NULL when posix_memalign() reports an error.
 * The matching Free() for this branch is plain free().
 */
static inline void *Memalign(size_t align, size_t size)
{
    void *mem = NULL;
    const int err = posix_memalign(&mem, align, size);
    /* Failure is the unexpected case; never hand back whatever mem holds then. */
    return unlikely(err) ? NULL : mem;
}
# define Free(base) free(base)
#endif
#endif //QTAV_HAVE(SSE2)

// from vlc_common.h end

// from https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers
/*
* 1. Fill a 4K byte cached (WB) memory buffer from the USWC video frame
* 2. Copy the 4K byte cache contents to the destination WB frame
* 3. Repeat steps 1 and 2 until the whole frame buffer has been copied.
*
* The _mm_store_si128 and _mm_load_si128 intrinsics compile to the MOVDQA instruction; the _mm_stream_load_si128 and _mm_stream_si128 intrinsics compile to the MOVNTDQA and MOVNTDQ instructions, respectively.
*
* The copy uses the same pitch for source and destination (the pitch is assumed to be a multiple of 64 bytes) and expects 64-byte alignment of every row of the source, of the cached 4K buffer and of the destination buffer.
* The MOVNTDQA streaming load instruction and the MOVNTDQ streaming store instruction require at least 16-byte alignment of their memory addresses.
*/
// CopyFrame( )
//
// COPIES VIDEO FRAMES FROM USWC MEMORY TO WB SYSTEM MEMORY VIA CACHED BUFFER
// ASSUMES PITCH IS A MULTIPLE OF 64B CACHE LINE SIZE, WIDTH MAY NOT BE

#define CACHED_BUFFER_SIZE 4096
typedef unsigned int UINT;
void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void * pCacheBlock, UINT width, UINT height, UINT pitch);

/*
 * Reports whether copyFrame() can actually perform the accelerated copy.
 *
 * Two conditions must hold:
 *  - the SSE4.1 code path was compiled in (QTAV_HAVE(SSE4_1)); otherwise
 *    CopyGPUFrame_SSE4_1() is an empty function and must not be advertised;
 *  - the CPU the program is running on reports SSE4.1 support.
 *
 * The CPU flags are queried once and cached in a function-local static.
 */
bool GPUMemCopy::isAvailable()
{
#if QTAV_HAVE(SSE4_1)
    static const bool is_sse41 = !!(av_get_cpu_flags() & AV_CPU_FLAG_SSE4);
    return is_sse41;
#else
    return false;
#endif
}

/*
 * Creates a copier without a cache buffer; call initCache() before copyFrame().
 *
 * The cache fields are cleared unconditionally: copyFrame() reads
 * mCache.buffer regardless of QTAV_HAVE(SSE2), so it must never be left
 * uninitialized.
 */
GPUMemCopy::GPUMemCopy()
{
    mCache.buffer = 0;
    mCache.size = 0;
}

/* Releases the cache buffer, if any, through cleanCache(). */
GPUMemCopy::~GPUMemCopy()
{
    this->cleanCache();
}

bool GPUMemCopy::initCache(unsigned width)
{
#if QTAV_HAVE(SSE2)
mCache.size = std::max<size_t>((width + 0x0f) & ~ 0x0f, CACHED_BUFFER_SIZE);
mCache.buffer = (unsigned char*)Memalign(16, mCache.size);
return !!mCache.buffer;
#endif
return false;
}

/*
 * Frees the cache buffer when one is allocated and resets the cache to the
 * empty state (buffer 0, size 0). Does nothing when SSE2 support is compiled out.
 */
void GPUMemCopy::cleanCache()
{
#if QTAV_HAVE(SSE2)
    if (mCache.buffer)
        Free(mCache.buffer);
    mCache.size = 0;
    mCache.buffer = 0;
#endif
}

/*
 * Copies a frame from pSrc to pDest by forwarding to CopyGPUFrame_SSE4_1(),
 * passing this object's cache buffer as the intermediate block.
 * width, height and pitch are handed through unchanged.
 */
void GPUMemCopy::copyFrame(void *pSrc, void *pDest, unsigned width, unsigned height, unsigned pitch)
{
    void *const cacheBlock = mCache.buffer;
    CopyGPUFrame_SSE4_1(pSrc, pDest, cacheBlock, width, height, pitch);
}

#if QTAV_HAVE(SSE4_1)
// Streams `bytes` (processed in whole 64-byte cache lines) from `src` into
// `cache` with streaming loads and regular aligned stores.
// Returns the number of __m128i elements consumed from / written to each side.
static inline size_t StreamLoadToCache(__m128i *src, __m128i *cache, size_t bytes)
{
    size_t n = 0;
    for (size_t x = 0; x < bytes; x += 64, n += 4) {
        const __m128i x0 = _mm_stream_load_si128(src + n + 0);
        const __m128i x1 = _mm_stream_load_si128(src + n + 1);
        const __m128i x2 = _mm_stream_load_si128(src + n + 2);
        const __m128i x3 = _mm_stream_load_si128(src + n + 3);

        _mm_store_si128(cache + n + 0, x0);
        _mm_store_si128(cache + n + 1, x1);
        _mm_store_si128(cache + n + 2, x2);
        _mm_store_si128(cache + n + 3, x3);
    }
    return n;
}

// Writes `bytes` (processed in whole 64-byte cache lines) from `cache` to
// `dst` with regular aligned loads and streaming stores.
// Returns the number of __m128i elements consumed from / written to each side.
static inline size_t StreamStoreFromCache(__m128i *cache, __m128i *dst, size_t bytes)
{
    size_t n = 0;
    for (size_t x = 0; x < bytes; x += 64, n += 4) {
        const __m128i x0 = _mm_load_si128(cache + n + 0);
        const __m128i x1 = _mm_load_si128(cache + n + 1);
        const __m128i x2 = _mm_load_si128(cache + n + 2);
        const __m128i x3 = _mm_load_si128(cache + n + 3);

        _mm_stream_si128(dst + n + 0, x0);
        _mm_stream_si128(dst + n + 1, x1);
        _mm_stream_si128(dst + n + 2, x2);
        _mm_stream_si128(dst + n + 3, x3);
    }
    return n;
}
#endif //QTAV_HAVE(SSE4_1)

/*
 * Copies a video frame from pSrc to pDest through the intermediate buffer
 * pCacheBlock, CACHED_BUFFER_SIZE bytes at a time.
 *
 * pSrc, pDest: frame buffers; every row must be 16-byte aligned.
 * pCacheBlock: 16-byte aligned scratch buffer of at least CACHED_BUFFER_SIZE bytes.
 * width:       bytes per row to copy; rounded up to a multiple of 64.
 * height:      number of rows.
 * pitch:       row stride in bytes, shared by source and destination and
 *              assumed to be a multiple of 64.
 *
 * Differences from the original routine:
 *  - null pointers and zero width/height/pitch return immediately (a zero
 *    pitch used to divide by zero);
 *  - a pitch larger than CACHED_BUFFER_SIZE used to give rowsPerBlock == 0,
 *    so the outer loop never advanced; such frames are now copied row by
 *    row in CACHED_BUFFER_SIZE segments;
 *  - the rounded-up width is clamped to the pitch so that `pitch - width64`
 *    cannot wrap around.
 *
 * When the SSE4.1 path is compiled out the function does nothing.
 */
void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void *pCacheBlock, UINT width, UINT height, UINT pitch)
{
#if QTAV_HAVE(SSE4_1)
    if (!pSrc || !pDest || !pCacheBlock || !width || !height || !pitch)
        return;

    __m128i *const cache = (__m128i *)pCacheBlock;

    UINT width64 = (width + 63) & ~0x03f;
    if (width64 > pitch)
        width64 = pitch;

    UINT rowsPerBlock = CACHED_BUFFER_SIZE / pitch;

    if (rowsPerBlock == 0) {
        // A single row does not fit into the cached block: walk each row in
        // segments of at most CACHED_BUFFER_SIZE bytes.
        unsigned char *srcRow = (unsigned char *)pSrc;
        unsigned char *dstRow = (unsigned char *)pDest;
        for (UINT y = 0; y < height; y++) {
            for (size_t off = 0; off < width64; off += CACHED_BUFFER_SIZE) {
                const size_t chunk = std::min<size_t>(CACHED_BUFFER_SIZE, width64 - off);
                _mm_mfence();
                StreamLoadToCache((__m128i *)(srcRow + off), cache, chunk);
                _mm_mfence();
                StreamStoreFromCache(cache, (__m128i *)(dstRow + off), chunk);
            }
            srcRow += pitch;
            dstRow += pitch;
        }
        return;
    }

    // Row padding that is loaded into the cache but not stored, in __m128i units.
    const size_t extraPitch = (pitch - width64) / 16;

    __m128i *pLoad = (__m128i *)pSrc;
    __m128i *pStore = (__m128i *)pDest;

    // COPY THROUGH 4KB CACHED BUFFER
    for (UINT y = 0; y < height; y += rowsPerBlock) {
        // ROWS LEFT TO COPY AT END
        if (rowsPerBlock > height - y)
            rowsPerBlock = height - y;

        __m128i *pCache = cache;

        _mm_mfence();

        // LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK
        for (UINT row = 0; row < rowsPerBlock; row++) {
            const size_t n = StreamLoadToCache(pLoad, pCache, pitch);
            pLoad += n;
            pCache += n;
        }

        _mm_mfence();

        pCache = cache;

        // STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK
        for (UINT row = 0; row < rowsPerBlock; row++) {
            const size_t n = StreamStoreFromCache(pCache, pStore, width64);
            // Skip the padding between the copied width and the pitch on both sides.
            pCache += n + extraPitch;
            pStore += n + extraPitch;
        }
    }
#else
    Q_UNUSED(pSrc);
    Q_UNUSED(pDest);
    Q_UNUSED(pCacheBlock);
    Q_UNUSED(width);
    Q_UNUSED(height);
    Q_UNUSED(pitch);
#endif //QTAV_HAVE(SSE4_1)
}

} //namespace QtAV
Loading

0 comments on commit ddf0a59

Please sign in to comment.