Skip to content

Commit

Permalink
Fix Neon SIMD build issues with Visual Studio
Browse files Browse the repository at this point in the history
- Use the _M_ARM and _M_ARM64 macros provided by Visual Studio for
  compile-time detection of Arm builds, since __arm__ and __aarch64__
  are only present in GNU-compatible compilers.
- Neon/intrinsics: Use the _CountLeadingZeros() and
  _CountLeadingZeros64() intrinsics provided by Visual Studio, since
  __builtin_clz() and __builtin_clzl() are only present in
  GNU-compatible compilers.
- Neon/intrinsics: Since Visual Studio does not support static vector
  initialization, replace static initialization of Neon vectors with the
  appropriate intrinsics.  Compared to the static initialization
  approach, this produces identical assembly code with both GCC and
  Clang.
- Neon/intrinsics: Since Visual Studio does not support inline assembly
  code, provide alternative code paths for Visual Studio whenever inline
  assembly is used.
- Build: Set FLOATTEST appropriately for AArch64 Visual Studio builds
  (Visual Studio does not emit fused multiply-add [FMA] instructions by
  default for such builds.)
- Neon/intrinsics: Move temporary buffer allocation outside of nested
  loops.  Since Visual Studio configures Arm builds with a relatively
  small amount of stack memory, attempting to allocate those buffers
  within the inner loops caused a stack overflow.

Closes libjpeg-turbo#461
Closes libjpeg-turbo#475
  • Loading branch information
jwright-arm authored and dcommander committed Nov 25, 2020
1 parent 91dd3b2 commit eb14189
Show file tree
Hide file tree
Showing 14 changed files with 102 additions and 41 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -892,7 +892,7 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
endif()
else()
if((CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") AND
NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
NOT CMAKE_C_COMPILER_ID STREQUAL "Clang" AND NOT MSVC)
set(DEFAULT_FLOATTEST fp-contract)
else()
set(DEFAULT_FLOATTEST no-fp-contract)
Expand Down
4 changes: 3 additions & 1 deletion ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ default.
for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
been reverted.

14. The build system can now be used to generate a universal x86-64 + Armv8
14. The Arm Neon SIMD extensions can now be built using Visual Studio.

15. The build system can now be used to generate a universal x86-64 + Armv8
libjpeg-turbo SDK package for both iOS and macOS.


Expand Down
4 changes: 3 additions & 1 deletion jchuff.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* Copyright (C) 2009-2011, 2014-2016, 2018-2020, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2018, Matthias Räncker.
* Copyright (C) 2020, Arm Limited.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
Expand Down Expand Up @@ -76,7 +77,8 @@ typedef size_t bit_buf_type;
* intrinsics implementation of the Arm Neon SIMD extensions, which is why we
* retain the old Huffman encoder behavior when using the GAS implementation.
*/
#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__))
#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
defined(_M_ARM) || defined(_M_ARM64))
typedef unsigned long long simd_bit_buf_type;
#else
typedef bit_buf_type simd_bit_buf_type;
Expand Down
3 changes: 2 additions & 1 deletion jdsample.c
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,8 @@ jinit_upsampler(j_decompress_ptr cinfo)
} else if (h_in_group == h_out_group &&
v_in_group * 2 == v_out_group && do_fancy) {
/* Non-fancy upsampling is handled by the generic method */
#if defined(__arm__) || defined(__aarch64__)
#if defined(__arm__) || defined(__aarch64__) || \
defined(_M_ARM) || defined(_M_ARM64)
if (jsimd_can_h1v2_fancy_upsample())
upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
else
Expand Down
3 changes: 2 additions & 1 deletion simd/arm/aarch32/jccolext-neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPROW inptr;
/* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
/* Allocate temporary buffer for final (image_width % 8) pixels in row. */
ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];

/* Set up conversion constants. */
#ifdef HAVE_VLD1_U16_X2
Expand Down Expand Up @@ -79,7 +81,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
* buffer large enough to accommodate the vector load.
*/
if (cols_remaining < 8) {
ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
}
Expand Down
8 changes: 5 additions & 3 deletions simd/arm/aarch32/jchuff-neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include "../jchuff.h"
#include "neon-compat.h"

#include <limits.h>

Expand Down Expand Up @@ -231,8 +232,9 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));

/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
const uint8x8_t bitmap_mask =
{ 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));

row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
Expand Down Expand Up @@ -278,7 +280,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];

while (bitmap_1_32 != 0) {
r = __builtin_clz(bitmap_1_32);
r = BUILTIN_CLZ(bitmap_1_32);
i += r;
bitmap_1_32 <<= r;
nbits = block_nbits[i];
Expand All @@ -299,7 +301,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
i = 33;

while (bitmap_33_63 != 0) {
unsigned int leading_zeros = __builtin_clz(bitmap_33_63);
unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
r += leading_zeros;
i += leading_zeros;
bitmap_33_63 <<= leading_zeros;
Expand Down
4 changes: 2 additions & 2 deletions simd/arm/aarch64/jccolext-neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPROW inptr;
/* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];

/* Set up conversion constants. */
const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
Expand Down Expand Up @@ -162,7 +164,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
* (image_width % 16) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;

Expand Down Expand Up @@ -255,7 +256,6 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
* (image_width % 8) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;

Expand Down
13 changes: 9 additions & 4 deletions simd/arm/aarch64/jchuff-neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,9 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
vdupq_n_u16(0)));

/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
const uint8x8_t bitmap_mask =
{ 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));

abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
Expand Down Expand Up @@ -241,8 +242,12 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
/* Encode DC coefficient. */

/* Find nbits required to specify sign and amplitude of coefficient. */
#if defined(_MSC_VER) && !defined(__clang__)
unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0));
#else
unsigned int lz;
__asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
#endif
unsigned int nbits = 32 - lz;
/* Emit Huffman-coded symbol and additional diff bits. */
unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
Expand Down Expand Up @@ -326,7 +331,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);

while (bitmap != 0) {
r = __builtin_clzl(bitmap);
r = BUILTIN_CLZL(bitmap);
i += r;
bitmap <<= r;
nbits = block_nbits[i];
Expand Down Expand Up @@ -365,10 +370,10 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,

/* Same as above but must mask diff bits and compute nbits on demand. */
while (bitmap != 0) {
r = __builtin_clzl(bitmap);
r = BUILTIN_CLZL(bitmap);
i += r;
bitmap <<= r;
lz = __builtin_clz(block_abs[i]);
lz = BUILTIN_CLZ(block_abs[i]);
nbits = 32 - lz;
diff = (unsigned int)(block_diff[i] << lz) >> lz;
while (r > 15) {
Expand Down
14 changes: 7 additions & 7 deletions simd/arm/jccolor-neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {

/* Include inline routines for colorspace extensions. */

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
Expand All @@ -68,7 +68,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
Expand All @@ -84,7 +84,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
Expand All @@ -100,7 +100,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
Expand All @@ -116,7 +116,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
Expand All @@ -132,7 +132,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
Expand All @@ -148,7 +148,7 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#include "aarch64/jccolext-neon.c"
#else
#include "aarch32/jccolext-neon.c"
Expand Down
3 changes: 2 additions & 1 deletion simd/arm/jcgryext-neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
{
JSAMPROW inptr;
JSAMPROW outptr;
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];

while (--num_rows >= 0) {
inptr = *input_buf++;
Expand All @@ -55,7 +57,6 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
* buffer large enough to accommodate the vector load.
*/
if (cols_remaining < 16) {
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
}
Expand Down
42 changes: 36 additions & 6 deletions simd/arm/jchuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* but must not be updated permanently until we complete the MCU.
*/

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#define BIT_BUF_SIZE 64
#else
#define BIT_BUF_SIZE 32
Expand Down Expand Up @@ -54,7 +54,25 @@ typedef struct {
* directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
* encode 0xFF as 0xFF 0x00.
*/
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)

#if defined(_MSC_VER) && !defined(__clang__)
#define SPLAT() { \
buffer[0] = (JOCTET)(put_buffer >> 56); \
buffer[1] = (JOCTET)(put_buffer >> 48); \
buffer[2] = (JOCTET)(put_buffer >> 40); \
buffer[3] = (JOCTET)(put_buffer >> 32); \
buffer[4] = (JOCTET)(put_buffer >> 24); \
buffer[5] = (JOCTET)(put_buffer >> 16); \
buffer[6] = (JOCTET)(put_buffer >> 8); \
buffer[7] = (JOCTET)(put_buffer ); \
}
#else
#define SPLAT() { \
__asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
*((uint64_t *)buffer) = put_buffer; \
}
#endif

#define FLUSH() { \
if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
Expand All @@ -67,23 +85,35 @@ typedef struct {
EMIT_BYTE(put_buffer >> 8) \
EMIT_BYTE(put_buffer ) \
} else { \
__asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
*((uint64_t *)buffer) = put_buffer; \
SPLAT() \
buffer += 8; \
} \
}

#else

#if defined(_MSC_VER) && !defined(__clang__)
#define SPLAT() { \
buffer[0] = (JOCTET)(put_buffer >> 24); \
buffer[1] = (JOCTET)(put_buffer >> 16); \
buffer[2] = (JOCTET)(put_buffer >> 8); \
buffer[3] = (JOCTET)(put_buffer ); \
}
#else
#define SPLAT() { \
__asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
*((uint32_t *)buffer) = put_buffer; \
}
#endif

#define FLUSH() { \
if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
EMIT_BYTE(put_buffer >> 24) \
EMIT_BYTE(put_buffer >> 16) \
EMIT_BYTE(put_buffer >> 8) \
EMIT_BYTE(put_buffer ) \
} else { \
__asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
*((uint32_t *)buffer) = put_buffer; \
SPLAT() \
buffer += 4; \
} \
}
Expand Down
Loading

0 comments on commit eb14189

Please sign in to comment.