Skip to content

Commit

Permalink
deinterlace: arm64 NEON merge asm
Browse files Browse the repository at this point in the history
Approximately a factor of 2 faster.

Also adds build system support / cpu "detection" for arm64 neon.
Advanced SIMD (neon) is mandatory for general purpose ARMv8-a CPU so the
CPU feature detection is a constant 1.

Signed-off-by: Felix Paul Kühne <[email protected]>
  • Loading branch information
jannau authored and fkuehne committed Aug 13, 2016
1 parent 8de7723 commit 8df6a21
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 0 deletions.
20 changes: 20 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,26 @@ asm volatile("vqmovun.s64 d0, q1":::"d0");
])
AM_CONDITIONAL(HAVE_NEON, [test "${ac_cv_arm_neon}" = "yes"])

dnl
dnl  ARM64 (AArch64) SIMD assembly optimizations
dnl
dnl  --disable-arm64 switches them off. When the option is not given at all,
dnl  enable_arm64 defaults to "yes" only if host_cpu is "aarch64".
dnl
AC_ARG_ENABLE(arm64,
[AS_HELP_STRING([--disable-arm64],
[disable arm 64-bit optimizations (default auto)])],, [
AS_IF([test "${host_cpu}" = "aarch64"], [enable_arm64="yes"] ,[enable_arm64="no"])
])
dnl Unless disabled, probe (result cached in ac_cv_arm64) whether a program
dnl containing an AArch64 SIMD instruction in inline asm compiles.
dnl NOTE(review): the message names $CCAS, but AC_COMPILE_IFELSE runs the
dnl compiler of the current language - confirm this is the intended tool.
AS_IF([test "${enable_arm64}" != "no"], [
AC_CACHE_CHECK([if $CCAS groks ARM 64 SIMD assembly], [ac_cv_arm64], [
AC_COMPILE_IFELSE([
AC_LANG_PROGRAM(,[[
asm volatile("uhadd v0.8b, v0.8b, v1.8b":::"v0");
]])
], [
ac_cv_arm64="yes"
], [
ac_cv_arm64="no"
])
])
])
dnl HAVE_ARM64 is true only when the probe above ran and succeeded; if the
dnl probe was skipped, ac_cv_arm64 is not set here and the test is false.
AM_CONDITIONAL(HAVE_ARM64, [test "${ac_cv_arm64}" = "yes"])


AC_ARG_ENABLE(altivec,
[AS_HELP_STRING([--disable-altivec],
Expand Down
2 changes: 2 additions & 0 deletions include/vlc_cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ VLC_API unsigned vlc_CPU(void);

# elif defined (__aarch64__)
# define HAVE_FPU 1
// NEON is mandatory for general purpose ARMv8-a CPUs
# define vlc_CPU_ARM64_NEON() (1)

# elif defined (__sparc__)
# define HAVE_FPU 1
Expand Down
4 changes: 4 additions & 0 deletions modules/video_filter/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ if HAVE_NEON
libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/merge_arm.S
libdeinterlace_plugin_la_CFLAGS += -DCAN_COMPILE_ARM
endif
if HAVE_ARM64
libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/merge_arm64.S
libdeinterlace_plugin_la_CFLAGS += -DCAN_COMPILE_ARM64
endif
video_filter_LTLIBRARIES += libdeinterlace_plugin.la

libdynamicoverlay_plugin_la_SOURCES = \
Expand Down
5 changes: 5 additions & 0 deletions modules/video_filter/deinterlace/deinterlace.c
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,11 @@ int Open( vlc_object_t *p_this )
if( vlc_CPU_ARMv6() )
p_sys->pf_merge = pixel_size == 1 ? merge8_armv6 : merge16_armv6;
else
#endif
#if defined(CAN_COMPILE_ARM64)
if( vlc_CPU_ARM64_NEON() )
p_sys->pf_merge = pixel_size == 1 ? merge8_arm64_neon : merge16_arm64_neon;
else
#endif
{
p_sys->pf_merge = pixel_size == 1 ? Merge8BitGeneric : Merge16BitGeneric;
Expand Down
9 changes: 9 additions & 0 deletions modules/video_filter/deinterlace/merge.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,15 @@ void merge8_armv6 (void *, const void *, const void *, size_t);
void merge16_armv6 (void *, const void *, const void *, size_t);
#endif

#if defined(CAN_COMPILE_ARM64)
/**
* ARM64 NEON routines to blend pixels from two picture lines.
*
* Arguments, in order: destination line, first source line, second source
* line, size. The assembly implementation counts the size in bytes and
* handles it in 64-, 32- and 16-byte chunks.
* merge8 works on 8-bit samples, merge16 on 16-bit samples.
*/
void merge8_arm64_neon (void *, const void *, const void *, size_t);
void merge16_arm64_neon (void *, const void *, const void *, size_t);

#endif

/*****************************************************************************
* EndMerge routines
*****************************************************************************/
Expand Down
102 changes: 102 additions & 0 deletions modules/video_filter/deinterlace/merge_arm64.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
//*****************************************************************************
// merge_arm64.S : ARM64 NEON mean
//*****************************************************************************
// Copyright (C) 2009-2012 Rémi Denis-Courmont
// Copyright (C) 2016- Janne Grunau
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation; either version 2.1 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this program; if not, write to the Free Software Foundation,
// Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
//****************************************************************************/

.text

// Register aliases for the four arguments of the merge routines, in
// argument order (AAPCS64 passes the first integer/pointer arguments in
// x0-x3): destination, first source, second source, size.
// Keep comments off the #define lines themselves so that nothing but the
// register name can end up in the macro expansion.
#define DEST x0
#define SRC1 x1
#define SRC2 x2
#define SIZE x3

//-----------------------------------------------------------------------------
// void merge8_arm64_neon(void *dest, const void *src1, const void *src2,
//                        size_t size)
//
// dest[i] = (src1[i] + src2[i]) >> 1 for every byte (UHADD: unsigned
// halving add, truncating).
//
// ABI:    AAPCS64, leaf function, no stack use
// In:     DEST (x0), SRC1 (x1), SRC2 (x2), SIZE (x3) = byte count
// Clobb:  x0-x2, x5, x10-x12, v0-v7, flags
//
// NOTE: Offset and pitch must be multiple of 16-bytes in VLC.
// Accordingly SIZE is consumed as 64-byte blocks followed by at most one
// 32-byte and one 16-byte chunk; a remainder below 16 bytes is not processed.
//-----------------------------------------------------------------------------
        .align  2
        .global merge8_arm64_neon
        .type   merge8_arm64_neon, %function
merge8_arm64_neon:
        ands    x5, SIZE, #~63          // x5 = bytes covered by whole 64-byte blocks
        b.eq    2f                      // fewer than 64 bytes: tails only
        mov     x10, #64                // post-increment stride of the source streams
        add     x11, SRC1, #32          // x11/x12 read the upper 32 bytes of each
        add     x12, SRC2, #32          // 64-byte block of SRC1/SRC2
1:
        // One 64-byte block per iteration; loads are interleaved with the
        // arithmetic, each vector pair is averaged exactly once.
        ld1     {v0.16b,v1.16b}, [SRC1], x10
        ld1     {v4.16b,v5.16b}, [SRC2], x10
        ld1     {v2.16b,v3.16b}, [x11], x10
        uhadd   v0.16b, v0.16b, v4.16b
        ld1     {v6.16b,v7.16b}, [x12], x10
        subs    x5, x5, #64             // flags are consumed by b.gt below
        uhadd   v1.16b, v1.16b, v5.16b
        uhadd   v2.16b, v2.16b, v6.16b
        uhadd   v3.16b, v3.16b, v7.16b
        st1     {v0.16b,v1.16b}, [DEST], #32
        st1     {v2.16b,v3.16b}, [DEST], #32
        b.gt    1b
2:
        // TBZ takes a bit NUMBER: bit 5 has value 32.
        tbz     SIZE, #5, 3f            // no 32-byte chunk left
        ld1     {v0.16b,v1.16b}, [SRC1], #32
        ld1     {v4.16b,v5.16b}, [SRC2], #32
        uhadd   v0.16b, v0.16b, v4.16b
        uhadd   v1.16b, v1.16b, v5.16b
        st1     {v0.16b,v1.16b}, [DEST], #32
3:
        // Bit 4 has value 16.
        tbz     SIZE, #4, 4f            // no 16-byte chunk left
        ld1     {v0.16b}, [SRC1]
        ld1     {v4.16b}, [SRC2]
        uhadd   v0.16b, v0.16b, v4.16b
        st1     {v0.16b}, [DEST]
4:
        ret

//-----------------------------------------------------------------------------
// void merge16_arm64_neon(void *dest, const void *src1, const void *src2,
//                         size_t size)
//
// dest[i] = (src1[i] + src2[i]) >> 1 for every 16-bit element (UHADD:
// unsigned halving add, truncating).
//
// ABI:    AAPCS64, leaf function, no stack use
// In:     DEST (x0), SRC1 (x1), SRC2 (x2), SIZE (x3) = byte count
// Clobb:  x0-x2, x5, v0-v7, flags
//
// SIZE is consumed as 64-byte blocks followed by at most one 32-byte and
// one 16-byte chunk; a remainder below 16 bytes is not processed.
//-----------------------------------------------------------------------------
        .align  2
        .global merge16_arm64_neon
        .type   merge16_arm64_neon, %function
merge16_arm64_neon:
        ands    x5, SIZE, #~63          // x5 = bytes covered by whole 64-byte blocks
        b.eq    2f                      // fewer than 64 bytes: tails only
1:
        // One 64-byte block (32 elements) per iteration.
        ld1     {v0.8h,v1.8h}, [SRC1], #32
        ld1     {v4.8h,v5.8h}, [SRC2], #32
        ld1     {v2.8h,v3.8h}, [SRC1], #32
        uhadd   v0.8h, v0.8h, v4.8h
        ld1     {v6.8h,v7.8h}, [SRC2], #32
        uhadd   v1.8h, v1.8h, v5.8h
        uhadd   v2.8h, v2.8h, v6.8h
        uhadd   v3.8h, v3.8h, v7.8h
        st1     {v0.8h,v1.8h}, [DEST], #32
        st1     {v2.8h,v3.8h}, [DEST], #32
        subs    x5, x5, #64
        b.gt    1b
2:
        // TBZ takes a bit NUMBER: bit 5 has value 32.
        tbz     SIZE, #5, 3f            // no 32-byte chunk left
        ld1     {v0.8h,v1.8h}, [SRC1], #32
        ld1     {v4.8h,v5.8h}, [SRC2], #32
        uhadd   v0.8h, v0.8h, v4.8h
        uhadd   v1.8h, v1.8h, v5.8h
        st1     {v0.8h,v1.8h}, [DEST], #32
3:
        // Bit 4 has value 16.
        tbz     SIZE, #4, 4f            // no 16-byte chunk left
        ld1     {v0.8h}, [SRC1]
        ld1     {v4.8h}, [SRC2]
        uhadd   v0.8h, v0.8h, v4.8h
        st1     {v0.8h}, [DEST]
4:
        ret

0 comments on commit 8df6a21

Please sign in to comment.