Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ARM Neon and scalar implementations of SIMD functions #359

Merged
merged 4 commits into from
Aug 31, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
On other platforms, emulate SSE2 SIMD calls using scalar code
  • Loading branch information
jmarshall committed Jun 26, 2022
commit ac612b6f5a079750684ef67044fd131d209cc69f
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
is.o: malloc_wrap.h
kopen.o: malloc_wrap.h
kstring.o: kstring.h malloc_wrap.h
ksw.o: ksw.h neon_sse.h malloc_wrap.h
ksw.o: ksw.h neon_sse.h scalar_sse.h malloc_wrap.h
main.o: kstring.h malloc_wrap.h utils.h
malloc_wrap.o: malloc_wrap.h
maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
Expand Down
10 changes: 10 additions & 0 deletions ksw.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
#include <emmintrin.h>
#elif defined __ARM_NEON
#include "neon_sse.h"
#else
#include "scalar_sse.h"
#endif
#include "ksw.h"

Expand Down Expand Up @@ -139,6 +141,10 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
#elif defined __ARM_NEON
#define __max_16(ret, xx) (ret) = vmaxvq_u8((xx))
#define allzero_16(xx) (vmaxvq_u8((xx)) == 0)

#else
#define __max_16(ret, xx) (ret) = m128i_max_u8((xx))
#define allzero_16(xx) (m128i_allzero((xx)))
#endif

// initialization
Expand Down Expand Up @@ -267,6 +273,10 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
#elif defined __ARM_NEON
#define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx)))
#define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0)

#else
#define __max_8(ret, xx) (ret) = m128i_max_s16((xx))
#define allzero_0f_8(xx) (m128i_allzero((xx)))
#endif

// initialization
Expand Down
119 changes: 119 additions & 0 deletions scalar_sse.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#ifndef SCALAR_SSE_H
#define SCALAR_SSE_H

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Scalar stand-in for the SSE2 __m128i vector type: 16 bytes of storage that
 * can be viewed either as byte lanes or as 16-bit lanes.  Both members
 * overlay the same memory. */
typedef union m128i {
uint8_t u8[16];  /* sixteen unsigned 8-bit lanes */
int16_t i16[8];  /* eight signed 16-bit lanes */
} __m128i;

static inline __m128i _mm_set1_epi32(int32_t n) {
assert(n >= 0 && n <= 255);
__m128i r; memset(&r, n, sizeof r); return r;
}

/* Scalar emulation of _mm_load_si128: copy the 16 bytes at ptr into a vector
 * value and return it. */
static inline __m128i _mm_load_si128(const __m128i *ptr) {
    __m128i loaded;
    memcpy(&loaded, ptr, sizeof loaded);
    return loaded;
}
/* Scalar emulation of _mm_store_si128: copy the vector value a into the
 * 16 bytes at ptr. */
static inline void _mm_store_si128(__m128i *ptr, __m128i a) {
    memcpy(ptr, &a, sizeof(__m128i));
}

/* Return 1 when every byte of a is zero, 0 otherwise. */
static inline int m128i_allzero(__m128i a) {
    int k;
    for (k = 0; k < 16; k++) {
        if (a.u8[k] != 0)
            return 0;
    }
    return 1;
}

/*
 * Scalar emulation of _mm_slli_si128: shift the whole vector towards higher
 * byte indices by n bytes, filling the vacated low bytes with zero.
 *
 * As with the SSE2 instruction, only the low 8 bits of n are used and any
 * shift count above 15 produces an all-zero vector.  Clamping here also
 * keeps the memmove length from going negative for out-of-range n.
 */
static inline __m128i _mm_slli_si128(__m128i a, int n) {
    unsigned shift = (unsigned)n & 0xffu;
    if (shift > 15) {
        memset(&a, 0, sizeof a);
        return a;
    }
    /* Regions overlap, so memmove rather than memcpy. */
    memmove(&a.u8[shift], &a.u8[0], 16 - shift);
    memset(&a.u8[0], 0, shift);
    return a;
}

/* Scalar emulation of _mm_adds_epu8: lane-wise unsigned 8-bit addition,
 * saturating at 255. */
static inline __m128i _mm_adds_epu8(__m128i a, __m128i b) {
    int lane;
    for (lane = 0; lane < 16; lane++) {
        unsigned sum = (unsigned)a.u8[lane] + b.u8[lane];
        if (sum > 255)
            sum = 255;
        a.u8[lane] = (uint8_t)sum;
    }
    return a;
}

/* Scalar emulation of _mm_max_epu8: lane-wise maximum of unsigned 8-bit
 * values. */
static inline __m128i _mm_max_epu8(__m128i a, __m128i b) {
    int lane;
    for (lane = 0; lane < 16; lane++) {
        a.u8[lane] = (b.u8[lane] > a.u8[lane]) ? b.u8[lane] : a.u8[lane];
    }
    return a;
}

/* Horizontal maximum: return the largest of the sixteen unsigned 8-bit
 * lanes of a. */
static inline uint8_t m128i_max_u8(__m128i a) {
    uint8_t best = a.u8[0];
    int lane;
    for (lane = 1; lane < 16; lane++) {
        if (a.u8[lane] > best)
            best = a.u8[lane];
    }
    return best;
}

static inline __m128i _mm_set1_epi8(int8_t n) { __m128i r; memset(&r, n, sizeof r); return r; }

/* Scalar emulation of _mm_subs_epu8: lane-wise unsigned 8-bit subtraction
 * (a - b), saturating at 0. */
static inline __m128i _mm_subs_epu8(__m128i a, __m128i b) {
    int lane;
    for (lane = 0; lane < 16; lane++) {
        if (a.u8[lane] > b.u8[lane])
            a.u8[lane] = (uint8_t)(a.u8[lane] - b.u8[lane]);
        else
            a.u8[lane] = 0;
    }
    return a;
}

/*
 * Scalar emulation of _mm_adds_epi16: lane-wise signed 16-bit addition with
 * saturation at both ends of the int16_t range.
 *
 * The sum is formed in 32 bits so it cannot overflow, then clamped to
 * [INT16_MIN, INT16_MAX] before being narrowed back to a lane.
 */
static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++) {
        int32_t sum = (int32_t)a.i16[i] + b.i16[i];
        if (sum > INT16_MAX)
            sum = INT16_MAX;
        else if (sum < INT16_MIN)
            sum = INT16_MIN;  /* negative saturation, as the SSE2 instruction does */
        a.i16[i] = (int16_t)sum;
    }
    return a;
}

/*
 * Scalar emulation of _mm_cmpgt_epi16: lane-wise signed 16-bit comparison.
 * Each result lane is all-ones when a > b in that lane and all-zeros
 * otherwise.
 *
 * The all-ones mask is written as -1: int16_t is two's complement, so -1 has
 * every bit set, and this avoids converting the out-of-range constant 0xffff
 * to a signed 16-bit type.
 */
static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++)
        a.i16[i] = (a.i16[i] > b.i16[i]) ? (int16_t)-1 : (int16_t)0;
    return a;
}

/* Scalar emulation of _mm_max_epi16: lane-wise maximum of signed 16-bit
 * values. */
static inline __m128i _mm_max_epi16(__m128i a, __m128i b) {
    int lane;
    for (lane = 0; lane < 8; lane++) {
        a.i16[lane] = (b.i16[lane] > a.i16[lane]) ? b.i16[lane] : a.i16[lane];
    }
    return a;
}

/* Scalar emulation of _mm_set1_epi16: set each of the eight 16-bit lanes
 * to n. */
static inline __m128i _mm_set1_epi16(int16_t n) {
    __m128i filled;
    int lane;
    for (lane = 0; lane < 8; lane++)
        filled.i16[lane] = n;
    return filled;
}

/* Horizontal maximum: return the largest of the eight signed 16-bit lanes
 * of a. */
static inline int16_t m128i_max_s16(__m128i a) {
    int16_t best = a.i16[0];
    int lane;
    for (lane = 1; lane < 8; lane++) {
        if (a.i16[lane] > best)
            best = a.i16[lane];
    }
    return best;
}

/*
 * Scalar emulation of _mm_subs_epu16: lane-wise UNSIGNED 16-bit subtraction
 * (a - b), saturating at 0.
 *
 * The union only exposes signed 16-bit lanes, so the lanes are copied out
 * into uint16_t arrays with memcpy, subtracted as unsigned values, and
 * copied back.  Results are the same as a signed computation whenever both
 * operands are non-negative as int16_t, and additionally correct for lane
 * values of 0x8000 and above.
 */
static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) {
    uint16_t ua[8], ub[8];
    int i;
    memcpy(ua, &a, sizeof ua);
    memcpy(ub, &b, sizeof ub);
    for (i = 0; i < 8; i++)
        ua[i] = (ua[i] > ub[i]) ? (uint16_t)(ua[i] - ub[i]) : 0;
    memcpy(&a, ua, sizeof ua);
    return a;
}

#endif