forked from wang-bin/QtAV
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fast copy from USWC using sse4.1. not supported by va
TODO: add memcpy like function. configurable cache size
- Loading branch information
Showing
9 changed files
with
393 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
/**************************************************************************** | ||
** | ||
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies). | ||
** Contact: http://www.qt-project.org/legal | ||
** | ||
** This file is part of the config.tests of the Qt Toolkit. | ||
** | ||
** $QT_BEGIN_LICENSE:LGPL$ | ||
** Commercial License Usage | ||
** Licensees holding valid commercial Qt licenses may use this file in | ||
** accordance with the commercial license agreement provided with the | ||
** Software or, alternatively, in accordance with the terms contained in | ||
** a written agreement between you and Digia. For licensing terms and | ||
** conditions see http://qt.digia.com/licensing. For further information | ||
** use the contact form at http://qt.digia.com/contact-us. | ||
** | ||
** GNU Lesser General Public License Usage | ||
** Alternatively, this file may be used under the terms of the GNU Lesser | ||
** General Public License version 2.1 as published by the Free Software | ||
** Foundation and appearing in the file LICENSE.LGPL included in the | ||
** packaging of this file. Please review the following information to | ||
** ensure the GNU Lesser General Public License version 2.1 requirements | ||
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. | ||
** | ||
** In addition, as a special exception, Digia gives you certain additional | ||
** rights. These rights are described in the Digia Qt LGPL Exception | ||
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. | ||
** | ||
** GNU General Public License Usage | ||
** Alternatively, this file may be used under the terms of the GNU | ||
** General Public License version 3.0 as published by the Free Software | ||
** Foundation and appearing in the file LICENSE.GPL included in the | ||
** packaging of this file. Please review the following information to | ||
** ensure the GNU General Public License version 3.0 requirements will be | ||
** met: http://www.gnu.org/copyleft/gpl.html. | ||
** | ||
** | ||
** $QT_END_LICENSE$ | ||
** | ||
****************************************************************************/ | ||
|
||
#include <emmintrin.h> | ||
#if defined(__GNUC__) && __GNUC__ < 4 && __GNUC_MINOR__ < 3 | ||
#error GCC < 3.2 is known to create internal compiler errors with our MMX code | ||
#endif | ||
|
||
// Config-test probe: calls two intrinsics declared in <emmintrin.h> so that a | ||
// successful build shows the compiler accepts them with the configured flags. | ||
// NOTE(review): presumably this program is only compiled, never executed - confirm. | ||
int main(int, char**) | ||
{ | ||
__m128i a = _mm_setzero_si128(); | ||
// The mask operand is all zeroes and the destination pointer is 0. | ||
_mm_maskmoveu_si128(a, _mm_setzero_si128(), 0); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Project file for the SSE2 probe: builds sse2.cpp as a debug console program without Qt. | ||
SOURCES = sse2.cpp | ||
CONFIG -= qt dylib release debug_and_release | ||
CONFIG += debug console | ||
#qt5 only has gcc, qcc, vc, linux icc. clang? | ||
# Supply the flag for win32-icc, then fail if no SSE2 flag is known for this mkspec. | ||
win32-icc: QMAKE_CFLAGS_SSE2 *= -arch:SSE2 | ||
isEmpty(QMAKE_CFLAGS_SSE2):error("This compiler does not support SSE2") | ||
else:QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/**************************************************************************** | ||
** | ||
** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies). | ||
** Contact: http://www.qt-project.org/legal | ||
** | ||
** This file is part of the config.tests of the Qt Toolkit. | ||
** | ||
** $QT_BEGIN_LICENSE:LGPL$ | ||
** Commercial License Usage | ||
** Licensees holding valid commercial Qt licenses may use this file in | ||
** accordance with the commercial license agreement provided with the | ||
** Software or, alternatively, in accordance with the terms contained in | ||
** a written agreement between you and Digia. For licensing terms and | ||
** conditions see http://qt.digia.com/licensing. For further information | ||
** use the contact form at http://qt.digia.com/contact-us. | ||
** | ||
** GNU Lesser General Public License Usage | ||
** Alternatively, this file may be used under the terms of the GNU Lesser | ||
** General Public License version 2.1 as published by the Free Software | ||
** Foundation and appearing in the file LICENSE.LGPL included in the | ||
** packaging of this file. Please review the following information to | ||
** ensure the GNU Lesser General Public License version 2.1 requirements | ||
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. | ||
** | ||
** In addition, as a special exception, Digia gives you certain additional | ||
** rights. These rights are described in the Digia Qt LGPL Exception | ||
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. | ||
** | ||
** GNU General Public License Usage | ||
** Alternatively, this file may be used under the terms of the GNU | ||
** General Public License version 3.0 as published by the Free Software | ||
** Foundation and appearing in the file LICENSE.GPL included in the | ||
** packaging of this file. Please review the following information to | ||
** ensure the GNU General Public License version 3.0 requirements will be | ||
** met: http://www.gnu.org/copyleft/gpl.html. | ||
** | ||
** | ||
** $QT_END_LICENSE$ | ||
** | ||
****************************************************************************/ | ||
|
||
#include <smmintrin.h> | ||
|
||
// Config-test probe: calls intrinsics declared in <smmintrin.h> (a float rounding | ||
// op and a 32-bit integer multiply) so that a successful build shows the compiler | ||
// accepts them with the configured flags. | ||
// NOTE(review): presumably this program is only compiled, never executed - confirm. | ||
int main(int, char**) | ||
{ | ||
__m128 a = _mm_setzero_ps(); | ||
_mm_ceil_ps(a); | ||
__m128i result = _mm_mullo_epi32(_mm_set1_epi32(42), _mm_set1_epi32(64)); | ||
// Cast to void to mark the value as intentionally unused. | ||
(void)result; | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Project file for the SSE4.1 probe: builds sse4_1.cpp as a debug console program without Qt. | ||
SOURCES = sse4_1.cpp | ||
CONFIG -= qt dylib release debug_and_release | ||
CONFIG += debug console | ||
#qt5 only has gcc, qcc, vc, linux icc. clang? | ||
# Supply the flag for win32-icc, then fail if no SSE4.1 flag is known for this mkspec. | ||
win32-icc: QMAKE_CFLAGS_SSE4_1 *= -arch:SSE4.1 | ||
isEmpty(QMAKE_CFLAGS_SSE4_1):error("This compiler does not support SSE4.1") | ||
else:QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE4_1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,224 @@ | ||
#include "GPUMemCopy.h" | ||
|
||
#include <stdlib.h> | ||
#include <string.h> | ||
#include <stddef.h> | ||
|
||
#include <algorithm> | ||
|
||
#include "QtAV/QtAV_Compat.h" | ||
|
||
// for mingw gcc | ||
#if QTAV_HAVE(SSE4_1) | ||
#include <smmintrin.h> //stream load | ||
#endif | ||
|
||
|
||
/* Branch prediction */ | ||
#ifdef __GNUC__ | ||
# define likely(p) __builtin_expect(!!(p), 1) | ||
# define unlikely(p) __builtin_expect(!!(p), 0) | ||
#else | ||
# define likely(p) (!!(p)) | ||
# define unlikely(p) (!!(p)) | ||
#endif | ||
|
||
namespace QtAV { | ||
|
||
#if QTAV_HAVE(SSE2) //FIXME | ||
// from vlc_common.h begin | ||
#ifdef __MINGW32__ | ||
# define Memalign(align, size) (__mingw_aligned_malloc(size, align)) | ||
# define Free(base) (__mingw_aligned_free(base)) | ||
#elif defined(_MSC_VER) | ||
# define Memalign(align, size) (_aligned_malloc(size, align)) | ||
# define Free(base) (_aligned_free(base)) | ||
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6) | ||
/*
 * Aligned allocation built on plain malloc(), for platforms without a native
 * aligned allocator.
 *
 * Layout of one allocation:
 *
 *     raw ... [padding] [raw pointer copy] [aligned user block of `size` bytes]
 *
 * The pointer returned by malloc() is saved immediately in front of the
 * aligned block so that Free() can recover it regardless of how large the
 * alignment is. (Storing the padding length in a single char, as was done
 * previously, silently truncated it for alignments of 128 and above.)
 *
 * align must be a non-zero power of two.
 * Returns NULL if align is invalid, the padded size would overflow, or
 * malloc() fails. Memory obtained here must be released with Free().
 */
static inline void *Memalign(size_t align, size_t size)
{
    const size_t header = sizeof(char *);
    char *raw;
    char *aligned;
    size_t misalign;

    if (align == 0 || (align & (align - 1)) != 0)
        return NULL;
    /* Refuse requests whose padded size would wrap around size_t. */
    if (align > (size_t)-1 - header || size > (size_t)-1 - header - align)
        return NULL;

    raw = (char *)malloc(size + align + header);
    if (!raw)
        return NULL;

    /* Reserve room for the saved pointer, then round up to a multiple of align. */
    aligned = raw + header;
    misalign = (size_t)aligned & (align - 1);
    if (misalign)
        aligned += align - misalign;

    /* memcpy avoids any alignment requirement on the header slot itself. */
    memcpy(aligned - header, &raw, sizeof raw);
    return aligned;
}

/*
 * Release a block obtained from Memalign(). A null pointer is ignored.
 */
static void Free(void *ptr)
{
    char *raw;

    if (!ptr)
        return;
    memcpy(&raw, (char *)ptr - sizeof raw, sizeof raw);
    free(raw);
}
#else | ||
static inline void *Memalign(size_t align, size_t size) | ||
{ | ||
void *base; | ||
if (unlikely(posix_memalign(&base, align, size))) | ||
base = NULL; | ||
return base; | ||
} | ||
# define Free(base) free(base) | ||
#endif | ||
#endif //QTAV_HAVE(SSE2) | ||
|
||
// from vlc_common.h end | ||
|
||
// from https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers | ||
/* | ||
* 1. Fill a 4K byte cached (WB) memory buffer from the USWC video frame | ||
* 2. Copy the 4K byte cache contents to the destination WB frame | ||
* 3. Repeat steps 1 and 2 until the whole frame buffer has been copied. | ||
* | ||
* _mm_store_si128 and _mm_load_si128 intrinsics will compile to the MOVDQA instruction, _mm_stream_load_si128 and _mm_stream_si128 intrinsics compile to the MOVNTDQA and MOVNTDQ instructions | ||
* | ||
* The source and destination are assumed to use the same pitch (a multiple of 64 bytes), and every row of the source, the cached 4K buffer and the destination buffer is expected to be 64-byte aligned. | ||
* The MOVNTDQA streaming load instruction and the MOVNTDQ streaming store instruction require at least 16 byte alignment in their memory addresses. | ||
*/ | ||
// CopyFrame( ) | ||
// | ||
// COPIES VIDEO FRAMES FROM USWC MEMORY TO WB SYSTEM MEMORY VIA CACHED BUFFER | ||
// ASSUMES PITCH IS A MULTIPLE OF 64B CACHE LINE SIZE, WIDTH MAY NOT BE | ||
|
||
#define CACHED_BUFFER_SIZE 4096 | ||
typedef unsigned int UINT; | ||
void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void * pCacheBlock, UINT width, UINT height, UINT pitch); | ||
|
||
/*!
 * Reports whether copyFrame() can actually perform the accelerated copy.
 *
 * Two conditions must both hold:
 *  - the SSE4.1 code path was compiled in (QTAV_HAVE(SSE4_1)); without it
 *    CopyGPUFrame_SSE4_1() is an empty function and would copy nothing;
 *  - the CPU this process runs on reports the SSE4 flag at run time.
 *
 * The CPU query runs once and its result is kept in a function-local static.
 */
bool GPUMemCopy::isAvailable()
{
#if QTAV_HAVE(SSE4_1)
    static const bool cpuHasSse41 = (av_get_cpu_flags() & AV_CPU_FLAG_SSE4) != 0;
    return cpuHasSse41;
#else
    return false;
#endif
}
|
||
/*!
 * Creates a copier with no staging buffer; initCache() allocates one.
 */
GPUMemCopy::GPUMemCopy()
{
#if QTAV_HAVE(SSE2)
    mCache.size = 0;
    mCache.buffer = 0;
#endif
}
|
||
/*!
 * Releases the staging buffer, if one was allocated.
 */
GPUMemCopy::~GPUMemCopy()
{
    this->cleanCache();
}
|
||
bool GPUMemCopy::initCache(unsigned width) | ||
{ | ||
#if QTAV_HAVE(SSE2) | ||
mCache.size = std::max<size_t>((width + 0x0f) & ~ 0x0f, CACHED_BUFFER_SIZE); | ||
mCache.buffer = (unsigned char*)Memalign(16, mCache.size); | ||
return !!mCache.buffer; | ||
#endif | ||
return false; | ||
} | ||
|
||
/*!
 * Frees the staging buffer (if any) and resets the cache to the empty state.
 */
void GPUMemCopy::cleanCache()
{
#if QTAV_HAVE(SSE2)
    if (mCache.buffer != 0)
        Free(mCache.buffer);

    mCache.size = 0;
    mCache.buffer = 0;
#endif
}
|
||
void GPUMemCopy::copyFrame(void *pSrc, void *pDest, unsigned width, unsigned height, unsigned pitch) | ||
{ | ||
CopyGPUFrame_SSE4_1(pSrc, pDest, mCache.buffer, width, height, pitch); | ||
} | ||
|
||
void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void *pCacheBlock, UINT width, UINT height, UINT pitch) | ||
{ | ||
#if QTAV_HAVE(SSE4_1) | ||
__m128i x0, x1, x2, x3; | ||
__m128i *pLoad; | ||
__m128i *pStore; | ||
__m128i *pCache; | ||
UINT x, y, yLoad, yStore; | ||
UINT rowsPerBlock; | ||
UINT width64; | ||
UINT extraPitch; | ||
|
||
rowsPerBlock = CACHED_BUFFER_SIZE / pitch; | ||
width64 = (width + 63) & ~0x03f; | ||
extraPitch = (pitch - width64) / 16; | ||
|
||
pLoad = (__m128i *)pSrc; | ||
pStore = (__m128i *)pDest; | ||
|
||
// COPY THROUGH 4KB CACHED BUFFER | ||
for (y = 0; y < height; y += rowsPerBlock) { | ||
// ROWS LEFT TO COPY AT END | ||
if (y + rowsPerBlock > height) | ||
rowsPerBlock = height - y; | ||
|
||
pCache = (__m128i *)pCacheBlock; | ||
|
||
_mm_mfence(); | ||
|
||
// LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK | ||
for (yLoad = 0; yLoad < rowsPerBlock; yLoad++) { | ||
// COPY A ROW, CACHE LINE AT A TIME | ||
for (x = 0; x < pitch; x +=64) { | ||
x0 = _mm_stream_load_si128( pLoad +0 ); | ||
x1 = _mm_stream_load_si128( pLoad +1 ); | ||
x2 = _mm_stream_load_si128( pLoad +2 ); | ||
x3 = _mm_stream_load_si128( pLoad +3 ); | ||
|
||
_mm_store_si128( pCache +0, x0 ); | ||
_mm_store_si128( pCache +1, x1 ); | ||
_mm_store_si128( pCache +2, x2 ); | ||
_mm_store_si128( pCache +3, x3 ); | ||
|
||
pCache += 4; | ||
pLoad += 4; | ||
} | ||
} | ||
|
||
_mm_mfence(); | ||
|
||
pCache = (__m128i *)pCacheBlock; | ||
|
||
// STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK | ||
for (yStore = 0; yStore < rowsPerBlock; yStore++) { | ||
// copy a row, cache line at a time | ||
for (x = 0; x < width64; x +=64) { | ||
x0 = _mm_load_si128( pCache ); | ||
x1 = _mm_load_si128( pCache +1 ); | ||
x2 = _mm_load_si128( pCache +2 ); | ||
x3 = _mm_load_si128( pCache +3 ); | ||
|
||
_mm_stream_si128( pStore, x0 ); | ||
_mm_stream_si128( pStore +1, x1 ); | ||
_mm_stream_si128( pStore +2, x2 ); | ||
_mm_stream_si128( pStore +3, x3 ); | ||
|
||
pCache += 4; | ||
pStore += 4; | ||
} | ||
pCache += extraPitch; | ||
pStore += extraPitch; | ||
} | ||
} | ||
#else | ||
Q_UNUSED(pSrc); | ||
Q_UNUSED(pDest); | ||
Q_UNUSED(pCacheBlock); | ||
Q_UNUSED(width); | ||
Q_UNUSED(height); | ||
Q_UNUSED(pitch); | ||
#endif //QTAV_HAVE(SSE4_1) | ||
} | ||
|
||
} //namespace QtAV |
Oops, something went wrong.