From ddf0a591309641cd0755e4781d79b8457e90fe0d Mon Sep 17 00:00:00 2001
From: wang-bin
Date: Sat, 3 May 2014 01:23:13 +0800
Subject: [PATCH] fast copy from USWC using sse4.1. not supported by va

TODO: add memcpy like function. configurable cache size
---
 QtAV.pro                       |   2 +
 config.tests/sse2/sse2.cpp     |  52 ++++++++
 config.tests/sse2/sse2.pro     |   7 ++
 config.tests/sse4_1/sse4_1.cpp |  51 ++++++++
 config.tests/sse4_1/sse4_1.pro |   7 ++
 src/libQtAV.pro                |  11 ++
 src/utils/GPUMemCopy.cpp       | 224 +++++++++++++++++++++++++++++++++
 src/utils/GPUMemCopy.h         |  30 +++++
 tests/arch/arch.pro            |   9 ++
 9 files changed, 393 insertions(+)
 create mode 100644 config.tests/sse2/sse2.cpp
 create mode 100644 config.tests/sse2/sse2.pro
 create mode 100644 config.tests/sse4_1/sse4_1.cpp
 create mode 100644 config.tests/sse4_1/sse4_1.pro
 create mode 100644 src/utils/GPUMemCopy.cpp
 create mode 100644 src/utils/GPUMemCopy.h

diff --git a/QtAV.pro b/QtAV.pro
index c06d1d18e..e180b8b27 100644
--- a/QtAV.pro
+++ b/QtAV.pro
@@ -32,6 +32,8 @@ OTHER_FILES += \
 
 EssentialDepends = avutil avcodec avformat swscale
 OptionalDepends = \
+    sse2 \
+    sse4_1 \
     swresample \
     avresample \
     gl
diff --git a/config.tests/sse2/sse2.cpp b/config.tests/sse2/sse2.cpp
new file mode 100644
index 000000000..6a93916e4
--- /dev/null
+++ b/config.tests/sse2/sse2.cpp
@@ -0,0 +1,52 @@
+/****************************************************************************
+**
+** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the config.tests of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia. For licensing terms and
+** conditions see http://qt.digia.com/licensing. For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights. These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include <emmintrin.h> // SSE2 intrinsics: __m128i, _mm_setzero_si128, _mm_maskmoveu_si128
+#if defined(__GNUC__) && __GNUC__ < 4 && __GNUC_MINOR__ < 3
+#error GCC < 3.2 is known to create internal compiler errors with our MMX code
+#endif
+
+int main(int, char**) // config.tests probe: uses two SSE2 intrinsics and returns 0
+{
+    __m128i a = _mm_setzero_si128();
+    _mm_maskmoveu_si128(a, _mm_setzero_si128(), 0); // all-zero mask selects no bytes to store
+    return 0;
+}
diff --git a/config.tests/sse2/sse2.pro b/config.tests/sse2/sse2.pro
new file mode 100644
index 000000000..dadc18ed8
--- /dev/null
+++ b/config.tests/sse2/sse2.pro
@@ -0,0 +1,7 @@
+SOURCES = sse2.cpp
+CONFIG -= qt dylib release debug_and_release
+CONFIG += debug console
+#qt5 only has gcc, qcc, vc, linux icc. clang?
+win32-icc: QMAKE_CFLAGS_SSE2 *= -arch:SSE2
+isEmpty(QMAKE_CFLAGS_SSE2):error("This compiler does not support SSE2")
+else:QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE2
diff --git a/config.tests/sse4_1/sse4_1.cpp b/config.tests/sse4_1/sse4_1.cpp
new file mode 100644
index 000000000..7746bb147
--- /dev/null
+++ b/config.tests/sse4_1/sse4_1.cpp
@@ -0,0 +1,51 @@
+/****************************************************************************
+**
+** Copyright (C) 2013 Digia Plc and/or its subsidiary(-ies).
+** Contact: http://www.qt-project.org/legal
+**
+** This file is part of the config.tests of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and Digia. For licensing terms and
+** conditions see http://qt.digia.com/licensing. For further information
+** use the contact form at http://qt.digia.com/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Digia gives you certain additional
+** rights. These rights are described in the Digia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include <smmintrin.h> // SSE4.1 intrinsics: _mm_ceil_ps, _mm_mullo_epi32
+
+int main(int, char**) // config.tests probe: uses two SSE4.1 intrinsics and returns 0
+{
+    __m128 a = _mm_setzero_ps();
+    _mm_ceil_ps(a); // result deliberately dropped
+    __m128i result = _mm_mullo_epi32(_mm_set1_epi32(42), _mm_set1_epi32(64));
+    (void)result; // marks the value as used
+    return 0;
+}
diff --git a/config.tests/sse4_1/sse4_1.pro b/config.tests/sse4_1/sse4_1.pro
new file mode 100644
index 000000000..7fbcebaf2
--- /dev/null
+++ b/config.tests/sse4_1/sse4_1.pro
@@ -0,0 +1,7 @@
+SOURCES = sse4_1.cpp
+CONFIG -= qt dylib release debug_and_release
+CONFIG += debug console
+#qt5 only has gcc, qcc, vc, linux icc. clang?
+win32-icc: QMAKE_CFLAGS_SSE4_1 *= -arch:SSE4.1
+isEmpty(QMAKE_CFLAGS_SSE4_1):error("This compiler does not support SSE4.1")
+else:QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE4_1
diff --git a/src/libQtAV.pro b/src/libQtAV.pro
index de79c1923..293e4c033 100644
--- a/src/libQtAV.pro
+++ b/src/libQtAV.pro
@@ -32,6 +32,15 @@ win32 {
 OTHER_FILES += $$RC_FILE
 TRANSLATIONS = $${PROJECTROOT}/i18n/QtAV_zh_CN.ts
 
+config_sse4_1|contains(TARGET_ARCH_SUB, sse4.1) {
+    DEFINES += QTAV_HAVE_SSE4_1=1
+    QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE4_1 #gcc -msse4.1
+}
+config_sse2|contains(TARGET_ARCH_SUB, sse2) {
+    DEFINES += QTAV_HAVE_SSE2=1
+    QMAKE_CXXFLAGS += $$QMAKE_CFLAGS_SSE2 #gcc -msse2
+}
+
 *msvc* {
 #link FFmpeg and portaudio which are built by gcc need /SAFESEH:NO
     QMAKE_LFLAGS += /SAFESEH:NO
@@ -166,6 +175,7 @@ config_libcedarv {
 SOURCES += \
     QtAV_Compat.cpp \
     QtAV_Global.cpp \
+    utils/GPUMemCopy.cpp \
     AudioThread.cpp \
     AVThread.cpp \
     AudioDecoder.cpp \
@@ -256,6 +266,7 @@ SDK_HEADERS *= \
 
 HEADERS *= \
     $$SDK_HEADERS \
+    utils/GPUMemCopy.h \
     QtAV/prepost.h \
     QtAV/AVDemuxThread.h \
     QtAV/AVThread.h \
diff --git a/src/utils/GPUMemCopy.cpp b/src/utils/GPUMemCopy.cpp
new file mode 100644
index 000000000..e01110ae7
--- /dev/null
+++ b/src/utils/GPUMemCopy.cpp
@@ -0,0 +1,290 @@
+#include "GPUMemCopy.h"
+
+#include <stdlib.h>    // malloc, free, posix_memalign
+#include <string.h>    // memcpy
+#include <algorithm>   // std::max
+
+extern "C" {
+#include <libavutil/cpu.h> // av_get_cpu_flags, AV_CPU_FLAG_SSE4
+}
+
+#include "QtAV/QtAV_Compat.h"
+
+// for mingw gcc
+#if QTAV_HAVE(SSE4_1)
+#include <smmintrin.h> //stream load
+#endif
+
+
+/* Branch prediction */
+#ifdef __GNUC__
+#   define likely(p)   __builtin_expect(!!(p), 1)
+#   define unlikely(p) __builtin_expect(!!(p), 0)
+#else
+#   define likely(p)   (!!(p))
+#   define unlikely(p) (!!(p))
+#endif
+
+namespace QtAV {
+
+#if QTAV_HAVE(SSE2) //FIXME
+// from vlc_common.h begin
+// Memalign(align, size): allocate size bytes aligned to align; the function variants return NULL on failure.
+// Free(base): release a block obtained from Memalign().
+#ifdef __MINGW32__
+#   define Memalign(align, size) (__mingw_aligned_malloc(size, align))
+#   define Free(base)            (__mingw_aligned_free(base))
+#elif defined(_MSC_VER)
+#   define Memalign(align, size) (_aligned_malloc(size, align))
+#   define Free(base)            (_aligned_free(base))
+#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
+// Hand-rolled variant: over-allocates by align bytes and keeps the offset in the byte before the result.
+static inline void *Memalign(size_t align, size_t size)
+{
+    long diff;
+    void *ptr;
+
+    ptr = malloc(size+align);
+    if(!ptr)
+        return ptr;
+    diff = ((-(long)ptr - 1)&(align-1)) + 1; // 1..align bytes up to the next boundary
+    ptr = (char*)ptr + diff;
+    ((char*)ptr)[-1]= diff; // read back by Free() to find the malloc() pointer
+    return ptr;
+}
+
+static void Free(void *ptr)
+{
+    if (ptr)
+        free((char*)ptr - ((char*)ptr)[-1]);
+}
+#else
+static inline void *Memalign(size_t align, size_t size)
+{
+    void *base;
+    if (unlikely(posix_memalign(&base, align, size)))
+        base = NULL;
+    return base;
+}
+#   define Free(base) free(base)
+#endif
+#endif //QTAV_HAVE(SSE2)
+
+// from vlc_common.h end
+
+// from https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers
+/*
+ * 1. Fill a 4K byte cached (WB) memory buffer from the USWC video frame
+ * 2. Copy the 4K byte cache contents to the destination WB frame
+ * 3. Repeat steps 1 and 2 until the whole frame buffer has been copied.
+ *
+ * _mm_store_si128 and _mm_load_si128 intrinsics will compile to the MOVDQA instruction, _mm_stream_load_si128 and _mm_stream_si128 intrinsics compile to the MOVNTDQA and MOVNTDQ instructions
+ *
+ * using the same pitch (which is assumed to be a multiple of 64 bytes), and expecting 64 byte alignment of every row of the source, cached 4K buffer and destination buffers.
+ * The MOVNTDQA streaming load instruction and the MOVNTDQ streaming store instruction require at least 16 byte alignment in their memory addresses.
+ */
+//  CopyFrame( )
+//
+//  COPIES VIDEO FRAMES FROM USWC MEMORY TO WB SYSTEM MEMORY VIA CACHED BUFFER
+//  ASSUMES PITCH IS A MULTIPLE OF 64B CACHE LINE SIZE, WIDTH MAY NOT BE
+
+#define CACHED_BUFFER_SIZE 4096
+typedef unsigned int UINT;
+void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void * pCacheBlock, UINT width, UINT height, UINT pitch);
+
+/*
+ * Reports whether the streaming copy behind copyFrame() can be used: the build
+ * must contain the SSE4.1 code path and av_get_cpu_flags() must report
+ * AV_CPU_FLAG_SSE4 for the running CPU.
+ */
+bool GPUMemCopy::isAvailable()
+{
+#if QTAV_HAVE(SSE4_1)
+    // Compiler support is not CPU support: query once and keep the answer.
+    static const bool is_sse41 = !!(av_get_cpu_flags() & AV_CPU_FLAG_SSE4);
+    return is_sse41;
+#else
+    // CopyGPUFrame_SSE4_1() is an empty stub in this build.
+    return false;
+#endif
+}
+
+/* Starts without a cache block; initCache() allocates one. */
+GPUMemCopy::GPUMemCopy()
+{
+    // copyFrame() reads mCache in every configuration, so always initialise it.
+    mCache.buffer = 0;
+    mCache.size = 0;
+}
+
+/* Releases the cache block, if any. */
+GPUMemCopy::~GPUMemCopy()
+{
+    cleanCache();
+}
+
+/*
+ * (Re)allocates the cached staging block used by copyFrame().
+ * width: bytes per row; the block holds the larger of width rounded up to 16
+ *        and CACHED_BUFFER_SIZE.
+ * Returns true when a block is available afterwards, false if the allocation
+ * failed or the build has no SSE2 support.
+ */
+bool GPUMemCopy::initCache(unsigned width)
+{
+#if QTAV_HAVE(SSE2)
+    cleanCache(); // a repeated call must not leak the previous block
+    const size_t size = std::max<size_t>((width + 0x0f) & ~ 0x0f, CACHED_BUFFER_SIZE);
+    // 64 = cache line size; also covers the 16 bytes the aligned SSE accesses need
+    mCache.buffer = (unsigned char*)Memalign(64, size);
+    if (!mCache.buffer)
+        return false;
+    mCache.size = size; // record a size only once the block really exists
+    return true;
+#else
+    Q_UNUSED(width);
+    return false;
+#endif
+}
+
+/* Frees the cache block and resets the bookkeeping; safe to call repeatedly. */
+void GPUMemCopy::cleanCache()
+{
+#if QTAV_HAVE(SSE2)
+    if (mCache.buffer) {
+        Free(mCache.buffer);
+    }
+#endif
+    mCache.buffer = 0;
+    mCache.size = 0;
+}
+
+/*
+ * Copies height rows from pSrc to pDest; both buffers use the same pitch.
+ * width: payload bytes per row, pitch: bytes between row starts (width <= pitch).
+ * The streaming SSE4.1 routine runs when isAvailable() holds, the cache is
+ * allocated, both pointers are 16-byte aligned and pitch is a multiple of 64
+ * not larger than CACHED_BUFFER_SIZE; otherwise rows are copied with memcpy.
+ * Invalid arguments (null pointer, empty frame, width > pitch) are ignored.
+ */
+void GPUMemCopy::copyFrame(void *pSrc, void *pDest, unsigned width, unsigned height, unsigned pitch)
+{
+    if (!pSrc || !pDest || !width || !height || width > pitch)
+        return;
+#if QTAV_HAVE(SSE4_1)
+    const size_t misaligned = (reinterpret_cast<size_t>(pSrc) | reinterpret_cast<size_t>(pDest)) & 0x0f;
+    if (isAvailable() && mCache.buffer && !misaligned && !(pitch & 0x3f) && pitch <= CACHED_BUFFER_SIZE) {
+        CopyGPUFrame_SSE4_1(pSrc, pDest, mCache.buffer, width, height, pitch);
+        return;
+    }
+#endif
+    // Plain fallback: slower on USWC memory, but cannot hang or overrun the cache.
+    const unsigned char *src = static_cast<const unsigned char*>(pSrc);
+    unsigned char *dst = static_cast<unsigned char*>(pDest);
+    for (unsigned row = 0; row < height; ++row)
+        memcpy(dst + (size_t)row * pitch, src + (size_t)row * pitch, width);
+}
+
+/*
+ * Streaming copy of a frame out of USWC memory through the cached block.
+ * pSrc, pDest: 16-byte aligned frame buffers sharing the same pitch
+ * pCacheBlock: 16-byte aligned scratch block of at least CACHED_BUFFER_SIZE bytes
+ * width:       payload bytes per row; stores are rounded up to 64 bytes
+ * pitch:       row stride in bytes, a multiple of 64 and <= CACHED_BUFFER_SIZE
+ * Returns without copying when the build lacks SSE4.1 or when an argument
+ * would make the loops divide by zero, never advance, overrun or wrap around.
+ */
+void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void *pCacheBlock, UINT width, UINT height, UINT pitch)
+{
+#if QTAV_HAVE(SSE4_1)
+    __m128i x0, x1, x2, x3;
+    __m128i *pLoad;
+    __m128i *pStore;
+    __m128i *pCache;
+    UINT x, y, yLoad, yStore;
+    UINT rowsPerBlock;
+    UINT width64;
+    UINT extraPitch;
+
+    // pitch == 0 would divide by zero; a pitch above CACHED_BUFFER_SIZE leaves rowsPerBlock
+    // at 0 (the outer loop never advances); rows must be whole 64-byte cache lines.
+    if (!pSrc || !pDest || !pCacheBlock || pitch == 0 || (pitch & 0x3f) || pitch > CACHED_BUFFER_SIZE)
+        return;
+
+    rowsPerBlock = CACHED_BUFFER_SIZE / pitch;
+    width64 = (width + 63) & ~0x03f;
+    if (width64 > pitch)
+        return; // the unsigned subtraction below would wrap around
+    extraPitch = (pitch - width64) / 16;
+
+    pLoad = (__m128i *)pSrc;
+    pStore = (__m128i *)pDest;
+
+    // COPY THROUGH 4KB CACHED BUFFER
+    for (y = 0; y < height; y += rowsPerBlock) {
+        // ROWS LEFT TO COPY AT END
+        if (y + rowsPerBlock > height)
+            rowsPerBlock = height - y;
+
+        pCache = (__m128i *)pCacheBlock;
+
+        _mm_mfence();
+
+        // LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK
+        for (yLoad = 0; yLoad < rowsPerBlock; yLoad++) {
+            // COPY A ROW, CACHE LINE AT A TIME
+            for (x = 0; x < pitch; x +=64) {
+                x0 = _mm_stream_load_si128( pLoad +0 );
+                x1 = _mm_stream_load_si128( pLoad +1 );
+                x2 = _mm_stream_load_si128( pLoad +2 );
+                x3 = _mm_stream_load_si128( pLoad +3 );
+
+                _mm_store_si128( pCache +0, x0 );
+                _mm_store_si128( pCache +1, x1 );
+                _mm_store_si128( pCache +2, x2 );
+                _mm_store_si128( pCache +3, x3 );
+
+                pCache += 4;
+                pLoad += 4;
+            }
+        }
+
+        _mm_mfence();
+
+        pCache = (__m128i *)pCacheBlock;
+
+        // STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK
+        for (yStore = 0; yStore < rowsPerBlock; yStore++) {
+            // copy a row, cache line at a time
+            for (x = 0; x < width64; x +=64) {
+                x0 = _mm_load_si128( pCache );
+                x1 = _mm_load_si128( pCache +1 );
+                x2 = _mm_load_si128( pCache +2 );
+                x3 = _mm_load_si128( pCache +3 );
+
+                _mm_stream_si128( pStore, x0 );
+                _mm_stream_si128( pStore +1, x1 );
+                _mm_stream_si128( pStore +2, x2 );
+                _mm_stream_si128( pStore +3, x3 );
+
+                pCache += 4;
+                pStore += 4;
+            }
+            pCache += extraPitch; // skip the row padding between width64 and pitch
+            pStore += extraPitch;
+        }
+    }
+
+    // Streaming stores are weakly ordered: fence before the caller hands the frame on.
+    _mm_mfence();
+#else
+    Q_UNUSED(pSrc);
+    Q_UNUSED(pDest);
+    Q_UNUSED(pCacheBlock);
+    Q_UNUSED(width);
+    Q_UNUSED(height);
+    Q_UNUSED(pitch);
+#endif //QTAV_HAVE(SSE4_1)
+}
+
+} //namespace QtAV
diff --git a/src/utils/GPUMemCopy.h b/src/utils/GPUMemCopy.h
new file mode 100644
index 000000000..7d087c54c
--- /dev/null
+++ b/src/utils/GPUMemCopy.h
@@ -0,0 +1,38 @@
+#ifndef GPUMemCopy_H
+#define GPUMemCopy_H
+
+#include <stddef.h> // size_t
+
+namespace QtAV {
+
+/*
+ * Copies video frames with the SSE4.1 streaming routine implemented in
+ * GPUMemCopy.cpp; that path needs a cache block from a successful initCache().
+ */
+class GPUMemCopy
+{
+public:
+    // true if the SSE4.1 copy path is compiled in and the running CPU reports SSE4.1
+    static bool isAvailable();
+
+    GPUMemCopy();
+    ~GPUMemCopy();
+
+    // allocate the staging block for rows of width bytes; false on failure
+    bool initCache(unsigned int width);
+    // release the staging block; initCache() may be called again afterwards
+    void cleanCache();
+    // copy height rows of width bytes; source and destination share pitch
+    void copyFrame(void *pSrc, void *pDest, unsigned int width, unsigned int height, unsigned int pitch);
+    //memcpy
+private:
+    typedef struct {
+        unsigned char* buffer; // staging block, 0 while not allocated
+        size_t size;           // bytes allocated for buffer, 0 while not allocated
+    } cache_t;
+    cache_t mCache;
+};
+
+} //namespace QtAV
+
+#endif // GPUMemCopy_H
diff --git a/tests/arch/arch.pro b/tests/arch/arch.pro
index 8c67b47a1..a91c62e48 100644
--- a/tests/arch/arch.pro
+++ b/tests/arch/arch.pro
@@ -3,6 +3,15 @@ SOURCES = arch.cpp
 CONFIG -= qt dylib release debug_and_release
 CONFIG += debug console warn_on
 
+win32-icc {
+    QMAKE_CXXFLAGS *= -arch:SSE4.1 #AVX
+} *msvc* {
+
+} else {
+## gcc like. can not add here otherwise other archs can not be detected
+#  QMAKE_CXXFLAGS *= -msse4.1
+}
+
 arch_pp.target = preprocess
 arch_pp.commands = $$QMAKE_CXX \$< #for gnu make
 arch_pp.depends = $$PWD/arch.h #TODO: win path. shell_path()?