forked from videolan/vlc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nv12_rgb.S
207 lines (174 loc) · 4.71 KB
/
nv12_rgb.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
@*****************************************************************************
@ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
@*****************************************************************************
@ Copyright (C) 2011 Sébastien Toque
@ Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
/*
 * Assembler setup: unified ARM/Thumb syntax, NEON instruction set enabled,
 * everything below is emitted into the .text section.
 */
.syntax unified
.fpu neon
.text
/*
 * ARM core register aliases used by nv12_rgb_neon.
 *
 *   O1      r0   output write pointer, upper row of the current row pair
 *                (first word loaded from the block r0 points to on entry)
 *   O2      r1   output write pointer, lower row (set to O1 + OPITCH)
 *   WIDTH   r2   width argument; rounded up to a multiple of 16 on entry
 *   HEIGHT  r3   rows still to convert; decremented by 2 per row pair
 *   Y1      r4   luma read pointer, upper row
 *   Y2      r5   luma read pointer, lower row (set to Y1 + YPITCH)
 *   U       r6   read pointer for the interleaved chroma bytes
 *   V       r7   loaded from the input block but not referenced afterwards
 *   YPITCH  r8   source pitch in bytes
 *   OPAD    r10  scratch during setup, then OPITCH - 4 * WIDTH
 *   YPAD    r11  YPITCH - WIDTH
 *   COUNT   ip   columns left in the current row pair
 *   OPITCH  lr   output pitch in bytes (second word of the output block)
 *
 * r4-r8, r10, r11 and lr are pushed on entry and popped on exit.
 */
#define O1 r0
#define O2 r1
#define WIDTH r2
#define HEIGHT r3
#define Y1 r4
#define Y2 r5
#define U r6
#define V r7
#define YPITCH r8
#define OPAD r10
#define YPAD r11
#define COUNT ip
#define OPITCH lr
/*
 * NEON register aliases used by nv12_rgb_neon.
 *
 *   coefY, coefRV, coefGU, coefGV, coefBU  (D0-D4)
 *       unsigned 8-bit multipliers, filled with vmov.u8 in the function.
 *   Rc, Gc, Bc  (Q3-Q5)
 *       signed 16-bit per-channel constants, broadcast from the
 *       coefficients table.
 *   chro_r, chro_g, chro_b  (Q6-Q8)
 *       16-bit chroma contribution per channel, shared by both rows of a
 *       row pair.
 *   red, green, blue  (Q9-Q11)
 *       16-bit luma + chroma sums before narrowing.
 *   lumi  (Q15)
 *       16-bit scaled luma.
 *   red1..alpha1 (D24-D27), red2..alpha2 (D28-D31)
 *       narrowed 8-bit channels handed to vst4.u8.
 *
 * Several aliases deliberately name the same physical register, so the
 * instruction order in the function matters:
 *   - u / v are D24 / D25, the same registers as red1 / green1; the chroma
 *     bytes are consumed before red1 / green1 are written.
 *   - y1 / y2 are D28 / D29, the same registers as red2 / green2.
 *   - lumi (Q15) is D30:D31, the same registers as blue2 / alpha2, which is
 *     why alpha2 is reloaded with 255 after every luma computation.
 *
 * Gc, Bc, chro_r and chro_g live in Q4-Q7, which the function saves with
 * vpush {q4-q7} and restores before returning.
 */
#define coefY D0
#define coefRV D1
#define coefGU D2
#define coefGV D3
#define coefBU D4
#define Rc Q3
#define Gc Q4
#define Bc Q5
#define u D24
#define v D25
#define y1 D28
#define y2 D29
#define chro_r Q6
#define chro_g Q7
#define chro_b Q8
#define red Q9
#define green Q10
#define blue Q11
#define lumi Q15
#define red1 D24
#define green1 D25
#define blue1 D26
#define alpha1 D27
#define red2 D28
#define green2 D29
#define blue2 D30
#define alpha2 D31
/*
 * Signed 16-bit constant term for each colour channel, in the order the
 * conversion routine reads them: red, green, blue. Each halfword is
 * broadcast to every lane of one Q register (Rc, Gc, Bc) and combined with
 * the scaled chroma products.
 *
 * NOTE(review): the values equal
 *     -(74*16 + 115*128) + 32        = -15872
 *     -(74*16) + (14 + 34)*128 + 32  =   4992
 *     -(74*16 + 135*128) + 32        = -18432
 * which looks like the 16 / 128 luma / chroma offsets folded together with
 * a +32 bias in the x64 fixed-point scale; confirm against the intended
 * conversion matrix before changing any multiplier.
 */
coefficients:
    .short  -15872, 4992, -18432
/*
 * nv12_luma_to_rgb ysrc, rdst, gdst, bdst
 *
 * Scales the eight luma bytes in D register ysrc by coefY, adds the chroma
 * terms already held in chro_r / chro_g / chro_b (saturating), then
 * narrows each channel to unsigned 8 bits with a rounding right shift by 6
 * (the coefficients are scaled by 64).
 *
 * Clobbers lumi, red, green and blue. lumi shares registers with
 * blue2 / alpha2, so alpha2 must be reloaded after the last use.
 */
.macro nv12_luma_to_rgb ysrc, rdst, gdst, bdst
    vmull.u8        lumi, \ysrc, coefY
    vqadd.s16       red, lumi, chro_r
    vqadd.s16       green, lumi, chro_g
    vqadd.s16       blue, lumi, chro_b
    vqrshrun.s16    \rdst, red, #6
    vqrshrun.s16    \gdst, green, #6
    vqrshrun.s16    \bdst, blue, #6
.endm

/*
 * nv12_convert_row ysrc, dst
 *
 * Converts 16 pixels of one row: loads 16 luma bytes through core register
 * ysrc (post-incremented), de-interleaved into even pixels (y1) and odd
 * pixels (y2) so that pixel pair k uses chroma lane k, converts both
 * halves, re-interleaves them with vzip and stores 64 output bytes through
 * core register dst (post-incremented) as R, G, B, 255 per pixel.
 *
 * Requires chro_r / chro_g / chro_b and alpha1 to be set up by the caller.
 */
.macro nv12_convert_row ysrc, dst
    vld2.u8         {y1,y2}, [\ysrc,:128]!
    /* even pixels: chrominance + luminance, then clamp (divide by 64) */
    nv12_luma_to_rgb y1, red1, green1, blue1
    /* odd pixels: chrominance + luminance, then clamp (divide by 64) */
    nv12_luma_to_rgb y2, red2, green2, blue2
    pld             [\ysrc]
    /* lumi overlapped alpha2, restore the opaque alpha value */
    vmov.u8         alpha2, #255
    /* put even and odd pixels back into sequential order */
    vzip.u8         red1, red2
    vzip.u8         green1, green2
    vzip.u8         blue1, blue2
    vst4.u8         {red1,green1,blue1,alpha1}, [\dst,:128]!
    vst4.u8         {red2,green2,blue2,alpha2}, [\dst,:128]!
.endm

/*
 * nv12_rgb_neon
 *
 * Converts an image with a luma plane and an interleaved chroma plane
 * (byte pairs read through U: first byte to u, second byte to v) into
 * 4 bytes per pixel stored as R, G, B, 255.
 *
 * In:
 *   r0 = address of two words   { output pointer, output pitch in bytes }
 *   r1 = address of four words  { luma pointer, chroma pointer,
 *                                 a third pointer (loaded, never used),
 *                                 source pitch in bytes }
 *   r2 = width in pixels
 *   r3 = height in rows
 *   NOTE(review): the C-side struct layouts are not visible in this file;
 *   the word order above is what the two ldmia instructions assume.
 *
 * Out: nothing is returned in a register.
 *
 * Preserved: r4-r8, r10, r11, q4-q7 (saved on the stack).
 * Clobbered: r0-r3, ip, flags, d0-d4, q3, q8-q15.
 *
 * Behaviour to be aware of:
 *   - The width is rounded up to the next multiple of 16 and columns are
 *     processed 16 at a time, so up to 15 pixels past the requested width
 *     are read and written on every row.
 *   - Rows are processed in pairs and the loop stops once the remaining
 *     height is <= 0, so an odd height converts one extra row.
 *   - The chroma pointer is advanced with the same pitch as the luma
 *     plane.
 *   - All loads and stores carry a :128 alignment hint, so the luma,
 *     chroma and output addresses must stay 16-byte aligned on every row.
 *   - Returns immediately if height <= 0 or the rounded width is <= 0.
 */
    .align  2
    .global nv12_rgb_neon
    .type   nv12_rgb_neon, %function
nv12_rgb_neon:
    push    {r4-r8,r10-r11,lr}
    vpush   {q4-q7}

    /* load arguments (r0 is both the base and a destination: no writeback) */
    ldmia   r0, {O1, OPITCH}
    ldmia   r1, {Y1, U, V, YPITCH}

    /* round the width up to a multiple of 16 (OPAD is scratch here) */
    ands    OPAD, WIDTH, #15
    sub     WIDTH, WIDTH, OPAD
    addne   WIDTH, WIDTH, #16

    /* init constants (values scaled by 64) */
    vmov.u8 coefY, #74
    vmov.u8 coefRV, #115
    vmov.u8 coefGU, #14
    vmov.u8 coefGV, #34
    vmov.u8 coefBU, #135

    /* broadcast the three 16-bit channel constants into Rc, Gc, Bc */
    adr     OPAD, coefficients
    vld1.s16 {d6[], d7[]}, [OPAD]!
    vld1.s16 {d8[], d9[]}, [OPAD]!
    vld1.s16 {d10[], d11[]}, [OPAD]!
    vmov.u8 alpha1, #255

    /*
     * init padding; the flags from this cmp are consumed at the top of the
     * row loop (the two sub instructions below do not touch them)
     */
    cmp     HEIGHT, #0
    sub     OPAD, OPITCH, WIDTH, lsl #2
    sub     YPAD, YPITCH, WIDTH

.Lrow_loop:
    /*
     * Rows remaining (gt): reload the column counter and re-derive the
     * flags from WIDTH. Otherwise the flags stay "le" and the function
     * returns below. The adds in between leave the flags alone.
     */
    movsgt  COUNT, WIDTH
    add     O2, O1, OPITCH
    add     Y2, Y1, YPITCH
    /* exit if all rows have been processed (or the width is <= 0) */
    vpople  {q4-q7}
    pople   {r4-r8,r10-r11,pc}

.Lcol_loop:
    /* chroma terms common to the upper and lower row */
    vld2.u8 {u,v}, [U,:128]!
    vmull.u8 chro_r, v, coefRV
    vmull.u8 chro_g, u, coefGU
    vmlal.u8 chro_g, v, coefGV
    vmull.u8 chro_b, u, coefBU
    vadd.s16 chro_r, Rc, chro_r
    vsub.s16 chro_g, Gc, chro_g
    vadd.s16 chro_b, Bc, chro_b
    pld     [U]

    /* Y top row */
    nv12_convert_row Y1, O1

    /* Y bottom row */
    nv12_convert_row Y2, O2

    /* next columns (x16) */
    subs    COUNT, COUNT, #16
    bgt     .Lcol_loop

    /* next rows (x2); the flags from this subs drive the row loop head */
    subs    HEIGHT, #2
    add     O1, O2, OPAD
    add     Y1, Y2, YPAD
    add     U, U, YPAD
    b       .Lrow_loop
    .size   nv12_rgb_neon, .-nv12_rgb_neon