CopyFrame_SSE2.cpp
/******************************************************************************
    QtAV: Media play library based on Qt and FFmpeg
    Copyright (C) 2015 Wang Bin <[email protected]>

*   This file is part of QtAV

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
******************************************************************************/
#ifndef INC_FROM_NAMESPACE
#include <stdint.h> //intptr_t
#include <string.h>
#include <emmintrin.h>
#endif
// from https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers
// modified by wang-bin to support unaligned src/dest and sse2
/*
 * 1. Fill a 4K byte cached (WB) memory buffer from the USWC video frame.
 * 2. Copy the 4K byte cache contents to the destination WB frame.
 * 3. Repeat steps 1 and 2 until the whole frame buffer has been copied.
 *
 * The _mm_store_si128 and _mm_load_si128 intrinsics compile to the MOVDQA instruction;
 * the _mm_stream_load_si128 and _mm_stream_si128 intrinsics compile to MOVNTDQA and MOVNTDQ.
 *
 * Source, cached 4K buffer and destination use the same pitch (assumed to be a multiple of
 * 64 bytes), and every row of each is expected to be 64-byte aligned.
 * The MOVNTDQA streaming load and the MOVNTDQ streaming store instructions require at least
 * 16-byte aligned memory addresses.
 */
//
// COPIES VIDEO FRAMES FROM USWC MEMORY TO WB SYSTEM MEMORY VIA CACHED BUFFER
// ASSUMES PITCH IS A MULTIPLE OF 64B CACHE LINE SIZE, WIDTH MAY NOT BE
#ifndef STREAM_LOAD_SI128
#define STREAM_LOAD_SI128(x) _mm_load_si128(x)
#endif //STREAM_LOAD_SI128
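// Note (an assumption about how this file is reused, not stated in the original): the
// INC_FROM_NAMESPACE guard above suggests this file is meant to be #included from another
// translation unit. A build targeting SSE4.1 could override the fallback macro before
// including it, so the loads below become true streaming loads (MOVNTDQA), e.g.:
//   #include <smmintrin.h>
//   #define STREAM_LOAD_SI128(x) _mm_stream_load_si128(x)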
#define CACHED_BUFFER_SIZE 4096
#define UINT unsigned int
// copy plane
//QT_FUNCTION_TARGET("sse2")
void CopyFrame_SSE2(void *pSrc, void *pDest, void *pCacheBlock, UINT width, UINT height, UINT pitch)
{
    //assert(((intptr_t)pCacheBlock & 0x0f) == 0 && (pitch & 0x0f) == 0);
    __m128i x0, x1, x2, x3;
    __m128i *pCache;
    UINT x, y, yLoad, yStore;
    // pitch is assumed to be <= CACHED_BUFFER_SIZE, otherwise rowsPerBlock would be 0
    UINT rowsPerBlock = CACHED_BUFFER_SIZE / pitch;
    const UINT width64 = (width + 63) & ~0x03f;
    const UINT extraPitch = (pitch - width64) / 16;
    __m128i *pLoad = (__m128i*)pSrc;
    __m128i *pStore = (__m128i*)pDest;
    const bool src_unaligned = !!((intptr_t)pSrc & 0x0f);
    const bool dst_unaligned = !!((intptr_t)pDest & 0x0f);
    //if (src_unaligned || dst_unaligned)
    //    qDebug("===========unaligned: src %d, dst: %d, extraPitch: %d", src_unaligned, dst_unaligned, extraPitch);
    // COPY THROUGH 4KB CACHED BUFFER
    for (y = 0; y < height; y += rowsPerBlock) {
        // ROWS LEFT TO COPY AT END
        if (y + rowsPerBlock > height)
            rowsPerBlock = height - y;
        pCache = (__m128i *)pCacheBlock;
        _mm_mfence();
        // LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK
        for (yLoad = 0; yLoad < rowsPerBlock; yLoad++) {
            // COPY A ROW, CACHE LINE AT A TIME
            for (x = 0; x < pitch; x += 64) {
                if (src_unaligned) {
                    // movdqu: the source is not 16-byte aligned, so an aligned or streaming
                    // load would fault; fall back to unaligned loads
                    x0 = _mm_loadu_si128(pLoad + 0);
                    x1 = _mm_loadu_si128(pLoad + 1);
                    x2 = _mm_loadu_si128(pLoad + 2);
                    x3 = _mm_loadu_si128(pLoad + 3);
                } else {
                    // movntdqa (movdqa with the SSE2 fallback macro)
                    x0 = STREAM_LOAD_SI128(pLoad + 0);
                    x1 = STREAM_LOAD_SI128(pLoad + 1);
                    x2 = STREAM_LOAD_SI128(pLoad + 2);
                    x3 = STREAM_LOAD_SI128(pLoad + 3);
                }
                // movdqa: the cache block is assumed 16-byte aligned (see the assert above)
                _mm_store_si128(pCache + 0, x0);
                _mm_store_si128(pCache + 1, x1);
                _mm_store_si128(pCache + 2, x2);
                _mm_store_si128(pCache + 3, x3);
                pCache += 4;
                pLoad += 4;
            }
        }
        _mm_mfence();
        pCache = (__m128i *)pCacheBlock;
        // STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK
        for (yStore = 0; yStore < rowsPerBlock; yStore++) {
            // copy a row, cache line at a time
            for (x = 0; x < width64; x += 64) {
                // movdqa
                x0 = _mm_load_si128(pCache);
                x1 = _mm_load_si128(pCache + 1);
                x2 = _mm_load_si128(pCache + 2);
                x3 = _mm_load_si128(pCache + 3);
                if (dst_unaligned) {
                    // movdqu
                    _mm_storeu_si128(pStore, x0);
                    _mm_storeu_si128(pStore + 1, x1);
                    _mm_storeu_si128(pStore + 2, x2);
                    _mm_storeu_si128(pStore + 3, x3);
                } else {
                    // movntdq
                    _mm_stream_si128(pStore, x0);
                    _mm_stream_si128(pStore + 1, x1);
                    _mm_stream_si128(pStore + 2, x2);
                    _mm_stream_si128(pStore + 3, x3);
                }
                pCache += 4;
                pStore += 4;
            }
            // skip the padding between width64 and pitch (counted in __m128i units)
            pCache += extraPitch;
            pStore += extraPitch;
        }
    }
}
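// Illustrative usage sketch (not part of the original file; names and sizes are hypothetical).
// It shows the intended calling pattern: a small, 64-byte aligned bounce buffer is reused for
// the whole frame while one plane is copied from (ideally USWC) source memory into ordinary
// write-back memory. Guarded out so it does not affect builds that include this file.
#if 0
#include <cstdlib>
static void copy_plane_example()
{
    const UINT width = 1920, height = 1080, pitch = 1920; // pitch must be a multiple of 64 and <= 4096
    // In real use pSrc would be the decoder's USWC-mapped plane; heap buffers stand in here
    // so the sketch is self-contained.
    void* src_plane = std::malloc((size_t)pitch * height);
    void* dst_plane = std::malloc((size_t)pitch * height);
    alignas(64) static unsigned char cache_block[CACHED_BUFFER_SIZE]; // reused 4KB bounce buffer
    CopyFrame_SSE2(src_plane, dst_plane, cache_block, width, height, pitch);
    std::free(dst_plane);
    std::free(src_plane);
}
#endif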
// Taken from the QuickSync decoder by Eric Gur
// a memcpy-style function that copies data very fast from GPU tiled (write back) memory
// Performance tip: the page offset (12 lsb) of the two addresses should be different;
// optimally use a 2K offset between them.
void *memcpy_sse2(void* dst, const void* src, size_t size)
{
    static const size_t kRegsInLoop = sizeof(size_t) * 2; // 8 or 16
    if (!dst || !src)
        return NULL;
    // If memory is not aligned, use memcpy
    // TODO: only check dst aligned
    const bool isAligned = !(((size_t)(src) | (size_t)(dst)) & 0x0F);
    if (!isAligned)
        return memcpy(dst, src, size);
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
#ifdef __x86_64__
    __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
#endif
    size_t reminder = size & (kRegsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop
    size_t end = 0;
    __m128i* pTrg = (__m128i*)dst;
    __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4);
    __m128i* pSrc = (__m128i*)src;
    // Make sure source is synced - doesn't hurt if not needed.
    _mm_sfence();
    while (pTrg < pTrgEnd) {
        // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
        // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
        xmm0 = STREAM_LOAD_SI128(pSrc);
        xmm1 = STREAM_LOAD_SI128(pSrc + 1);
        xmm2 = STREAM_LOAD_SI128(pSrc + 2);
        xmm3 = STREAM_LOAD_SI128(pSrc + 3);
        xmm4 = STREAM_LOAD_SI128(pSrc + 4);
        xmm5 = STREAM_LOAD_SI128(pSrc + 5);
        xmm6 = STREAM_LOAD_SI128(pSrc + 6);
        xmm7 = STREAM_LOAD_SI128(pSrc + 7);
#ifdef __x86_64__ // Use all 16 xmm registers
        xmm8 = STREAM_LOAD_SI128(pSrc + 8);
        xmm9 = STREAM_LOAD_SI128(pSrc + 9);
        xmm10 = STREAM_LOAD_SI128(pSrc + 10);
        xmm11 = STREAM_LOAD_SI128(pSrc + 11);
        xmm12 = STREAM_LOAD_SI128(pSrc + 12);
        xmm13 = STREAM_LOAD_SI128(pSrc + 13);
        xmm14 = STREAM_LOAD_SI128(pSrc + 14);
        xmm15 = STREAM_LOAD_SI128(pSrc + 15);
#endif
        pSrc += kRegsInLoop;
        // _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store)
        // TODO: why not _mm_stream_si128? it works
        _mm_store_si128(pTrg , xmm0);
        _mm_store_si128(pTrg + 1, xmm1);
        _mm_store_si128(pTrg + 2, xmm2);
        _mm_store_si128(pTrg + 3, xmm3);
        _mm_store_si128(pTrg + 4, xmm4);
        _mm_store_si128(pTrg + 5, xmm5);
        _mm_store_si128(pTrg + 6, xmm6);
        _mm_store_si128(pTrg + 7, xmm7);
#ifdef __x86_64__ // Use all 16 xmm registers
        _mm_store_si128(pTrg + 8, xmm8);
        _mm_store_si128(pTrg + 9, xmm9);
        _mm_store_si128(pTrg + 10, xmm10);
        _mm_store_si128(pTrg + 11, xmm11);
        _mm_store_si128(pTrg + 12, xmm12);
        _mm_store_si128(pTrg + 13, xmm13);
        _mm_store_si128(pTrg + 14, xmm14);
        _mm_store_si128(pTrg + 15, xmm15);
#endif
        pTrg += kRegsInLoop;
    }
    // Copy in 16 byte steps
    if (reminder >= 16) {
        size = reminder;
        reminder = size & 15;
        end = size >> 4;
        for (size_t i = 0; i < end; ++i)
            pTrg[i] = STREAM_LOAD_SI128(pSrc + i);
    }
    // Copy last bytes - shouldn't happen as strides are a multiple of 16
    if (reminder) {
        __m128i temp = STREAM_LOAD_SI128(pSrc + end);
        char* ps = (char*)(&temp);
        char* pt = (char*)(pTrg + end);
        for (size_t i = 0; i < reminder; ++i)
            pt[i] = ps[i];
    }
    return dst;
}
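// Illustrative usage sketch (not part of the original file; buffer names and sizes are made up).
// memcpy_sse2 degrades gracefully: unaligned pointers fall back to plain memcpy, while aligned
// ones take the streaming-load path above. Guarded out so it does not affect builds that
// include this file.
#if 0
#include <cstdlib>
static void memcpy_sse2_example()
{
    const size_t size = 1920 * 1080; // one 8-bit luma plane, for example
    // In real use src would be a GPU-mapped surface; heap buffers stand in here. Following the
    // performance tip above, the two buffers could be placed with different page offsets
    // (e.g. ~2K apart).
    void* src = std::malloc(size);
    void* dst = std::malloc(size);
    memset(src, 0, size); // give the source defined contents for the sketch
    memcpy_sse2(dst, src, size);
    std::free(dst);
    std::free(src);
}
#endif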