Skip to content

Commit

Permalink
ggml : add SSE3 and fp16 conversion lookup table (ggerganov#368)
Browse files Browse the repository at this point in the history
* Improves WASM performance:
  On MacBook M1 Pro, I observe 25% faster using Firefox and 35% faster using Chrome

* Add support for SSE3 SIMD

* Add SSE3 to system information

* Add Imath support for fp16-fp32 conversions

* Add Imath to system information

* Wrap Imath calls to avoid static function warnings

* Drop Imath; Add lookup table for f16 -> f32 conversions

* Remove TODO comments

* Update SSE3 to new macro arguments

* Correct updated macro definitions

* Prefer static inline where possible

* ggml : static inlines + add public f16 <-> f32 conversions

Co-authored-by: Georgi Gerganov <[email protected]>
  • Loading branch information
abitofevrything and ggerganov authored Jan 6, 2023
1 parent 1944e7c commit a62170c
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 21 deletions.
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifneq (,$(findstring f16c,$(F16C_M)))
CFLAGS += -mf16c
endif
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
ifneq (,$(findstring sse3,$(SSE3_M)))
CFLAGS += -msse3
endif
else ifeq ($(UNAME_S),Haiku)
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
ifneq (,$(findstring avx,$(AVX1_M)))
Expand Down
162 changes: 141 additions & 21 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,8 @@ typedef double ggml_float;
//
#include <arm_neon.h>

float ggml_fp16_to_fp32(ggml_fp16_t x) {
return x;
}

ggml_fp16_t ggml_fp32_to_fp16(float x) {
return x;
}
#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)

#define GGML_FP16_TO_FP32(x) (x)
#define GGML_FP32_TO_FP16(x) (x)
Expand All @@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
#endif

#ifdef __F16C__
float ggml_fp16_to_fp32(ggml_fp16_t h) {
return _cvtsh_ss(h);
}
ggml_fp16_t ggml_fp32_to_fp16(float f) {
return _cvtss_sh(f, 0);
}

#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)

#else

Expand All @@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
return fp32.as_bits;
}

float ggml_fp16_to_fp32(ggml_fp16_t h) {
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
Expand All @@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
return fp32_from_bits(result);
}

ggml_fp16_t ggml_fp32_to_fp16(float f) {
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
Expand All @@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}

#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#endif // __F16C__

Expand All @@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
// precomputed exp table for f16 (128 KB)
static ggml_fp16_t table_exp_f16[1 << 16];

// precomputed f32 table for f16 (256 KB)
static float table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)

inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return table_f32_f16[s];
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

#endif

// note: do not use these inside ggml.c
// these are meant to be used via the ggml.h API
float ggml_fp16_to_fp32(ggml_fp16_t x) {
return GGML_FP16_TO_FP32(x);
}

ggml_fp16_t ggml_fp32_to_fp16(float x) {
return GGML_FP32_TO_FP16(x);
}

//
// timing
//
Expand Down Expand Up @@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE

#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD _mm_loadu_ps
#define GGML_F32x4_STORE _mm_storeu_ps
#if defined(__FMA__)
// TODO: Does this work?
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD _mm_add_ps
#define GGML_F32x4_MUL _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x) \
{ \
for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
} \
for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
} \
for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
} \
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}
// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 SSE

#define GGML_F16_STEP 32
#define GGML_F16_EPR 4

static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
float tmp[4];

tmp[0] = GGML_FP16_TO_FP32(x[0]);
tmp[1] = GGML_FP16_TO_FP32(x[1]);
tmp[2] = GGML_FP16_TO_FP32(x[2]);
tmp[3] = GGML_FP16_TO_FP32(x[3]);

return _mm_loadu_ps(tmp);
}

static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
float arr[4];

_mm_storeu_ps(arr, y);

x[0] = GGML_FP32_TO_FP16(arr[0]);
x[1] = GGML_FP32_TO_FP16(arr[1]);
x[2] = GGML_FP32_TO_FP16(arr[2]);
x[3] = GGML_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD _mm_add_ps
#define GGML_F32Cx4_MUL _mm_mul_ps
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE

#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE

#endif

// GGML_F32_ARR / GGML_F16_ARR
Expand Down Expand Up @@ -1269,15 +1381,15 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
static bool is_first_call = true;

if (is_first_call) {
// initialize GELU and EXP tables
// initialize GELU, EXP and F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

ggml_fp16_t ii;
for (int i = 0; i < (1 << 16); ++i) {
uint16_t ui = i;
memcpy(&ii, &ui, sizeof(ii));
const float f = GGML_FP16_TO_FP32(ii);
const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
}
Expand Down Expand Up @@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
#endif
}

int ggml_cpu_has_sse3(void) {
#if defined(__SSE3__)
return 1;
#else
return 0;
#endif
}

int ggml_cpu_has_vsx(void) {
#if defined(__POWER9_VECTOR__)
return 1;
Expand Down
1 change: 1 addition & 0 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,7 @@ int ggml_cpu_has_f16c(void);
int ggml_cpu_has_fp16_va(void);
int ggml_cpu_has_wasm_simd(void);
int ggml_cpu_has_blas(void);
int ggml_cpu_has_sse3(void);
int ggml_cpu_has_vsx(void);

#ifdef __cplusplus
Expand Down
1 change: 1 addition & 0 deletions whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2582,6 +2582,7 @@ const char * whisper_print_system_info(void) {
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

return s.c_str();
Expand Down

0 comments on commit a62170c

Please sign in to comment.