Skip to content

Commit

Permalink
Merge branch 'rext' of https://github.com/OpenHEVC/FFmpeg.git into rext
Browse files Browse the repository at this point in the history
  • Loading branch information
mraulet committed Oct 18, 2014
2 parents fd6f960 + 44a9a1f commit ffd243f
Show file tree
Hide file tree
Showing 4 changed files with 265 additions and 0 deletions.
115 changes: 115 additions & 0 deletions libavcodec/x86/hevc_sao.asm
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

SECTION_RODATA 32

; pshufb control mask used by hevc_sao_edge_filter_main_8_8 to reorder the
; byte-packed SAO offsets: output byte i takes input byte {1, 2, 0, 3, 4}[i]
; (the same permutation as edge_idx[] in hevc_sao_sse.c).
; The remaining 11 control bytes are -1: pshufb writes zero to any lane whose
; control byte has its top bit set, so lanes 5..15 of the result are cleared.
edge_shuffle: db 1, 2, 0, 3, 4
times 11 db -1

SECTION_TEXT 32

Expand Down Expand Up @@ -362,6 +364,119 @@ cglobal hevc_sao_band_filter_0_64_8, 7, 7, 6, dst, src, dststride, srcstride, of
LOOP_END dst, dststride, src, srcstride
RET

;-----------------------------------------------------------------------
; void ff_hevc_sao_edge_filter_border_8_8(int value, uint8_t *src,
;                                         uint8_t *dst)
;
; dst[i] = clip_uint8(src[i] + value) for i = 0..7.
;
; In:    value = signed offset (low 16 bits are used)
;        src   = 8 readable bytes, no alignment required
;        dst   = 8 writable bytes, no alignment required
; Clobb: m0-m2
;
; The offset is broadcast as a 16-bit word, so the pixels are widened to
; words before the add; adding the word-splat register with paddb would
; only offset every other byte and would wrap instead of clipping.
; NOTE(review): the INIT_* line governing this function is outside the
; visible hunk; only SSE2 instructions are used here - confirm.
;-----------------------------------------------------------------------
cglobal hevc_sao_edge_filter_border_8_8, 3, 3, 3, value, src, dst
    movd          m0, valued            ; value is an int: move 32 bits only
    SPLATW        m0, m0, 0             ; m0 = value in all eight words
    pxor          m2, m2                ; m2 = 0, used for zero-extension
    movh          m1, [srcq]            ; eight source pixels
    punpcklbw     m1, m2                ; u8 -> u16
    paddsw        m1, m0                ; saturating add cannot overflow int16
    packuswb      m1, m1                ; clip to 0..255 and narrow to bytes
    movh      [dstq], m1
    RET


;-----------------------------------------------------------------------
; void ff_hevc_sao_edge_filter_border_16_8(int value, uint8_t *src,
;                                          uint8_t *dst)
;
; dst[i] = clip_uint8(src[i] + value) for i = 0..15.
;
; In:    value = signed offset (low 16 bits are used)
;        src   = 16 readable bytes, no alignment required
;        dst   = 16 writable bytes, no alignment required
; Clobb: m0-m3
;
; The offset is broadcast as a 16-bit word, so both halves of the pixel
; vector are widened to words before the add; a byte add of the word-splat
; register would only offset every other pixel and would wrap.
; NOTE(review): the INIT_* line governing this function is outside the
; visible hunk; only SSE2 instructions are used here - confirm.
;-----------------------------------------------------------------------
cglobal hevc_sao_edge_filter_border_16_8, 3, 3, 4, value, src, dst
    movd          m0, valued
    SPLATW        m0, m0, 0             ; m0 = value in all eight words
    pxor          m3, m3                ; m3 = 0, used for zero-extension
    movu          m1, [srcq]            ; sixteen source pixels
    mova          m2, m1
    punpcklbw     m1, m3                ; pixels 0..7  as words
    punpckhbw     m2, m3                ; pixels 8..15 as words
    paddsw        m1, m0
    paddsw        m2, m0
    packuswb      m1, m2                ; clip to 0..255, back to 16 bytes
    movu      [dstq], m1
    RET

INIT_XMM avx

;-----------------------------------------------------------------------
; void ff_hevc_sao_edge_filter_main_8_8(uint8_t *src0, uint8_t *src1,
;                                       uint8_t *src2, uint8_t *dst,
;                                       ptrdiff_t srcstride,
;                                       ptrdiff_t dststride,
;                                       int16_t *sao_offset_val, int height)
;
; For each of `height` rows and each of 8 pixels x:
;     idx    = 2 + sign(src0[x] - src1[x]) + sign(src0[x] - src2[x])
;     dst[x] = clip_uint8(src0[x] + lut[idx])
; where lut[i] = sao_offset_val[{1, 2, 0, 3, 4}[i]] (see edge_shuffle).
;
; In:    src0 = pixels to filter; src1/src2 = the two neighbour rows/columns
;        all three sources advance by srcstride per row, dst by dststride
; Regs:  m0 = offset LUT, m6 = 2 in every byte, m7 = 0 (all loop-invariant)
;        m1-m5 = per-row temporaries
;
; NOTE(review): the LUT load reads 16 bytes from sao_offset_val although only
; the first five int16 entries are used - confirm the caller's buffer allows it.
;-----------------------------------------------------------------------
cglobal hevc_sao_edge_filter_main_8_8, 8, 8, 8, src0, src1, src2, dst, srcstride, dststride, sao_offset_val, height
    test          heightd, heightd
    jle           .done                 ; nothing to do; also keeps dec/jnz from wrapping

    ; Build the byte LUT once: saturate the int16 offsets to int8, then
    ; permute them with edge_shuffle (lanes 5..15 become zero).
    movu          m0, [sao_offset_valq]
    packsswb      m0, m0
    movu          m1, [edge_shuffle]
    pshufb        m0, m1

    ; Loop-invariant constants, built without a scratch GPR.
    pcmpeqb       m6, m6                ; every byte = -1
    pabsb         m6, m6                ; every byte =  1
    paddb         m6, m6                ; every byte =  2
    pxor          m7, m7                ; zero, for u8 -> u16 widening

.loop:
    movq          m1, [src0q]           ; c = current pixels
    movq          m2, [src1q]           ; a = first neighbour
    movq          m3, [src2q]           ; b = second neighbour

    ; sign(c - a) = (c == min(c,a)) - (a == min(c,a)), with true encoded as -1
    pminub        m4, m1, m2
    pcmpeqb       m2, m4                ; a == min
    pcmpeqb       m4, m1                ; c == min
    psubb         m4, m2                ; m4 = sign(c - a)

    ; sign(c - b), same construction
    pminub        m5, m1, m3
    pcmpeqb       m3, m5                ; b == min
    pcmpeqb       m5, m1                ; c == min
    psubb         m5, m3                ; m5 = sign(c - b)

    paddb         m4, m5
    paddb         m4, m6                ; idx = 2 + sign(c-a) + sign(c-b), in 0..4

    pshufb        m2, m0, m4            ; per-pixel offset = lut[idx]
    pmovsxbw      m2, m2                ; sign-extend offsets to words
    punpcklbw     m1, m7                ; zero-extend pixels to words
    paddw         m2, m1
    packuswb      m2, m2                ; clip to 0..255
    movq      [dstq], m2

    add           src0q, srcstrideq
    add           src1q, srcstrideq
    add           src2q, srcstrideq
    add           dstq, dststrideq
    dec           heightd
    jnz           .loop
.done:
    RET


;#if HAVE_SSE42
;#define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16
;
;#else
;static inline __m128i _MM_CVTEPI8_EPI16(__m128i m0) {
; return _mm_unpacklo_epi8(m0, _mm_cmplt_epi8(m0, _mm_setzero_si128()));
;}
;#endif

; ff_hevc_sao_edge_filter_main_8_8_sse2(src + y_stride_src, src + y_stride_0_1, src + y_stride_1_1, dst + y_stride_dst, sao_offset_val[edge_idx[4]],
; sao_offset_val[edge_idx[3]], sao_offset_val[edge_idx[2]], sao_offset_val[edge_idx[1]], sao_offset_val[edge_idx[0]], height);
; offset0 = _mm_set_epi8(0, 0, 0, 0,
; 0, 0, 0, 0,
; 0, 0, 0, sao_offset_val[edge_idx[4]],
; sao_offset_val[edge_idx[3]], sao_offset_val[edge_idx[2]], sao_offset_val[edge_idx[1]], sao_offset_val[edge_idx[0]]);
; for (y = init_y; y < height; y++) {
; for (x = 0; x < width; x += 8) {
; x0 = _mm_loadl_epi64((__m128i *) (src + x + y_stride_src));
; cmp0 = _mm_loadl_epi64((__m128i *) (src + x + y_stride_0_1));
; cmp1 = _mm_loadl_epi64((__m128i *) (src + x + y_stride_1_1));
; r2 = _mm_min_epu8(x0, cmp0);
; x1 = _mm_cmpeq_epi8(cmp0, r2);
; x2 = _mm_cmpeq_epi8(x0, r2);
; x1 = _mm_sub_epi8(x2, x1);
; r2 = _mm_min_epu8(x0, cmp1);
; x3 = _mm_cmpeq_epi8(cmp1, r2);
; x2 = _mm_cmpeq_epi8(x0, r2);
; x3 = _mm_sub_epi8(x2, x3);
; x1 = _mm_add_epi8(x1, x3);
; x1 = _mm_add_epi8(x1, _mm_set1_epi8(2));
; r0 = _mm_shuffle_epi8(offset0, x1);
; r0 = _MM_CVTEPI8_EPI16(r0);
; x0 = _mm_unpacklo_epi8(x0, _mm_setzero_si128());
; r0 = _mm_add_epi16(r0, x0);
; r0 = _mm_packus_epi16(r0, r0);
; _mm_storel_epi64((__m128i *) (dst + x + y_stride_dst), r0);
; }
; y_stride_dst += stride_dst;
; y_stride_src += stride_src;
; y_stride_0_1 += stride_src;
; y_stride_1_1 += stride_src;
; }



INIT_YMM avx2

cglobal hevc_sao_band_filter_0_32_8, 7, 7, 6, dst, src, dststride, srcstride, offset, left, height
Expand Down
135 changes: 135 additions & 0 deletions libavcodec/x86/hevc_sao_sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -517,4 +517,139 @@ SAO_EDGE_FILTER_1(10)

SAO_EDGE_FILTER_0(12)
SAO_EDGE_FILTER_1(12)

/* Forward declaration: the shared 8-bit implementation is defined further
 * down in this file. */
void ff_hevc_sao_edge_filter_8_avx(uint8_t *_dst, uint8_t *_src,
                                   ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
                                   struct SAOParams *sao, int *borders,
                                   int _width, int _height, int c_idx,
                                   uint8_t *vert_edge, uint8_t *horiz_edge,
                                   uint8_t *diag_edge);

/**
 * Variant-0 entry point of the 8-bit SAO edge filter (AVX build).
 *
 * Thin wrapper: every argument is forwarded unchanged to
 * ff_hevc_sao_edge_filter_8_avx().
 */
void ff_hevc_sao_edge_filter_0_8_avx(uint8_t *_dst, uint8_t *_src,
                                     ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
                                     struct SAOParams *sao, int *borders,
                                     int _width, int _height, int c_idx,
                                     uint8_t *vert_edge, uint8_t *horiz_edge,
                                     uint8_t *diag_edge)
{
    ff_hevc_sao_edge_filter_8_avx(_dst, _src, _stride_dst, _stride_src,
                                  sao, borders, _width, _height, c_idx,
                                  vert_edge, horiz_edge, diag_edge);
}

/* Assembly helpers implemented in libavcodec/x86/hevc_sao.asm. */

/* Border row helper: applies the offset `value` to one group of 16 pixels
 * read from src and stores the result to dst. */
void ff_hevc_sao_edge_filter_border_16_8_sse2(int value,uint8_t *src,uint8_t *dst);

/* Same as above for one group of 8 pixels. */
void ff_hevc_sao_edge_filter_border_8_8_sse2(int value,uint8_t *src,uint8_t *dst);

/* Edge-offset kernel for an 8-pixel-wide column of `height` rows.
 * src0 points at the pixels to filter, src1/src2 at the two neighbours being
 * compared against; all three advance by stride_src per row, dst by
 * stride_dst. sao_offset points at the int16 offset table. */
void ff_hevc_sao_edge_filter_main_8_8_avx(uint8_t *src0, uint8_t *src1, uint8_t *src2, uint8_t *dst, ptrdiff_t stride_src, ptrdiff_t stride_dst,
int16_t * sao_offset, int height);


/* Apply the border offset to one full row of `width` pixels, using the
 * 16-pixel asm helper when width is a multiple of 16 and the 8-pixel helper
 * otherwise. */
static void sao_edge_border_row_8(int value, uint8_t *src, uint8_t *dst,
                                  int width)
{
    int x;

    if ((width & 15) == 0) {
        for (x = 0; x < width; x += 16)
            ff_hevc_sao_edge_filter_border_16_8_sse2(value, src + x, dst + x);
    } else {
        for (x = 0; x < width; x += 8)
            ff_hevc_sao_edge_filter_border_8_8_sse2(value, src + x, dst + x);
    }
}

/**
 * 8-bit SAO edge-offset filter built on the asm kernels.
 *
 * Processing order:
 *  1. Unless the class is horizontal: the top row (borders[1]) and/or the
 *     bottom row (borders[3]) only get sao_offset_val[0] applied and are
 *     excluded from the main pass.
 *  2. Main pass: the remaining rows are filtered in 8-pixel-wide columns by
 *     ff_hevc_sao_edge_filter_main_8_8_avx().
 *  3. Unless the class is vertical: the left column (borders[0]) and/or the
 *     right column (borders[2]) are overwritten with src + sao_offset_val[0],
 *     clipped to 8 bits.
 *
 * @param _dst        destination pixels
 * @param _src        source pixels
 * @param _stride_dst destination stride in bytes
 * @param _stride_src source stride in bytes
 * @param sao         SAO parameters; offset_val[c_idx] and eo_class[c_idx]
 *                    are read
 * @param borders     four flags: [0] left, [1] top, [2] right, [3] bottom
 * @param _width      block width in pixels
 * @param _height     block height in pixels
 * @param c_idx       colour component index into sao
 * @param vert_edge   unused here
 * @param horiz_edge  unused here
 * @param diag_edge   unused here
 *
 * NOTE(review): the asm helpers always write 8 (or 16) pixels per call, so a
 * width that is not a multiple of 8 writes past `width` - confirm callers
 * only pass suitable widths.
 */
void ff_hevc_sao_edge_filter_8_avx(
uint8_t *_dst, uint8_t *_src,
ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
struct SAOParams *sao,
int *borders, int _width, int _height, int c_idx, uint8_t *vert_edge,
uint8_t *horiz_edge, uint8_t *diag_edge) {
    int x, y;
    int16_t *sao_offset_val = sao->offset_val[c_idx];
    int sao_eo_class = sao->eo_class[c_idx];
    /* {dx, dy} of the two neighbours compared against, per edge class. */
    const int8_t pos[4][2][2] = {
        { {-1, 0}, { 1, 0} }, /* horizontal */
        { { 0,-1}, { 0, 1} }, /* vertical */
        { {-1,-1}, { 1, 1} }, /* 45 degree */
        { { 1,-1}, {-1, 1} }, /* 135 degree */
    };
    int init_y = 0, width = _width, height = _height;

    uint8_t *dst = _dst;
    uint8_t *src = _src;
    ptrdiff_t stride_dst = _stride_dst;
    ptrdiff_t stride_src = _stride_src;

    if (sao_eo_class != SAO_EO_HORIZ) {
        if (borders[1]) {
            /* Top row: offset only, then skip it in the main pass. */
            sao_edge_border_row_8(sao_offset_val[0], src, dst, width);
            init_y = 1;
        }
        if (borders[3]) {
            /* Bottom row: offset only, then drop it from the main pass. */
            ptrdiff_t y_stride_dst = stride_dst * (_height - 1);
            ptrdiff_t y_stride_src = stride_src * (_height - 1);

            sao_edge_border_row_8(sao_offset_val[0], src + y_stride_src,
                                  dst + y_stride_dst, width);
            height--;
        }
    }

    /* The asm kernel counts rows down to zero, so it must never be entered
     * with a non-positive row count. */
    if (height > init_y) {
        ptrdiff_t y_stride_dst = init_y * stride_dst;
        ptrdiff_t y_stride_src = init_y * stride_src;
        int pos_0_0 = pos[sao_eo_class][0][0];
        int pos_0_1 = pos[sao_eo_class][0][1];
        int pos_1_0 = pos[sao_eo_class][1][0];
        int pos_1_1 = pos[sao_eo_class][1][1];
        ptrdiff_t y_stride_0_1 = (init_y + pos_0_1) * stride_src + pos_0_0;
        ptrdiff_t y_stride_1_1 = (init_y + pos_1_1) * stride_src + pos_1_0;

        for (x = 0; x < width; x += 8) {
            ff_hevc_sao_edge_filter_main_8_8_avx(src + y_stride_src + x,
                                                 src + y_stride_0_1 + x,
                                                 src + y_stride_1_1 + x,
                                                 dst + y_stride_dst + x,
                                                 stride_src, stride_dst,
                                                 sao_offset_val, height - init_y);
        }
    }

    if (sao_eo_class != SAO_EO_VERT) {
        if (borders[0]) {
            /* Left column: overwrite the main-pass result with offset only. */
            ptrdiff_t idx_dst = 0;
            ptrdiff_t idx_src = 0;
            int16_t offset_val = sao_offset_val[0];
            for (y = 0; y < height; y++) {
                dst[idx_dst] = av_clip_uintp2_c(src[idx_src] + offset_val, 8);
                idx_dst += stride_dst;
                idx_src += stride_src;
            }
        }
        if (borders[2]) {
            /* Right column: same treatment as the left one. */
            ptrdiff_t idx_dst = _width - 1;
            ptrdiff_t idx_src = idx_dst;
            int16_t offset_val = sao_offset_val[0];
            for (y = 0; y < height; y++) {
                dst[idx_dst] = av_clip_uintp2_c(src[idx_src] + offset_val, 8);
                idx_dst += stride_dst;
                idx_src += stride_src;
            }
        }
    }
}























#endif //HAVE_SSE42
10 changes: 10 additions & 0 deletions libavcodec/x86/hevcdsp.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,10 @@ void ff_hevc_sao_band_filter_0_56_8_avx2( uint8_t *_dst, uint8_t *_src, ptrdiff_
void ff_hevc_sao_band_filter_0_64_8_avx2( uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,int16_t* sao_val, int8_t sao_left, int height);


/* 8-bit SAO edge filter, AVX build; defined in hevc_sao_sse.c and installed
 * as c->sao_edge_filter[0] by ff_hevc_dsp_init_x86() when OPTI_ASM is set. */
void ff_hevc_sao_edge_filter_0_8_avx(uint8_t *_dst, uint8_t *_src,
ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao,
int *borders, int _width, int _height, int c_idx,
uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);

//#ifndef OPTI_ASM
void ff_hevc_sao_edge_filter_0_8_sse(uint8_t *_dst, uint8_t *_src,
Expand All @@ -335,6 +339,12 @@ void ff_hevc_sao_edge_filter_1_8_sse(uint8_t *_dst, uint8_t *_src,
ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao,
int *borders, int _width, int _height, int c_idx,
uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);

/* 8-bit SAO edge filter, SSE2 variant; same signature as the _sse and _avx
 * entry points.
 * NOTE(review): no definition of this symbol is visible in this change -
 * confirm it exists before referencing it. */
void ff_hevc_sao_edge_filter_0_8_sse2(uint8_t *_dst, uint8_t *_src,
ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao,
int *borders, int _width, int _height, int c_idx,
uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);

//#endif
void ff_hevc_sao_edge_filter_0_10_sse(uint8_t *_dst, uint8_t *_src,
ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao,
Expand Down
5 changes: 5 additions & 0 deletions libavcodec/x86/hevcdsp_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
// c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_1_8_sse;
#endif
#ifndef OPTI_ASM

c->sao_band_filter = ff_hevc_sao_band_filter_0_8_sse;

#endif
Expand Down Expand Up @@ -747,6 +748,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;

#ifdef OPTI_ASM
c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_0_8_avx;
#endif
}
}
if (EXTERNAL_AVX2(cpu_flags)) {
Expand Down

0 comments on commit ffd243f

Please sign in to comment.