Skip to content

Commit

Permalink
[L2Space] Perf improvement for dimension not of factor 4 and 16
Browse files Browse the repository at this point in the history
Currently SIMD (SSE or AVX) is used only when the dimension is a
multiple of 4 or 16; when the dimension is not an exact multiple of
4 or 16, a slower non-vectorized method is used.

To improve performance for these cases new methods are added:
`L2SqrSIMD(4|16)ExtResiduals` - they rely on the existing
`L2SqrSIMD(4|16)Ext` to compute the largest multiple-of-4 (or -16)
prefix of the dimensions, and finish the residual computation with
the scalar method `L2Sqr`.

Performance improvement compared to baseline is x3-4 times depending on
dimension. Benchmark results:

Run on (4 X 3300 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 4096 KiB (x1)
Load Average: 2.18, 2.35, 3.88
-----------------------------------------------------------
Benchmark          Time             CPU        Iterations
-----------------------------------------------------------
TstDim65        14.7 ns         14.7 ns     20 * 47128209
RefDim65        50.2 ns         50.1 ns     20 * 10373751
TstDim101       24.7 ns         24.7 ns     20 * 28064436
RefDim101       90.4 ns         90.2 ns     20 *  7592191
TstDim129       31.4 ns         31.3 ns     20 * 22397921
RefDim129        125 ns          124 ns     20 *  5548862
TstDim257       59.3 ns         59.2 ns     20 * 10856753
RefDim257        266 ns          266 ns     20 *  2630926
  • Loading branch information
2ooom committed Apr 19, 2020
1 parent 050044d commit df20d5d
Showing 1 changed file with 50 additions and 29 deletions.
79 changes: 50 additions & 29 deletions hnswlib/space_l2.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@
namespace hnswlib {

static float
L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    // Scalar (non-vectorized) squared Euclidean (L2) distance between two
    // float vectors. Serves as the generic fallback and computes the
    // residual dimensions left over by the SIMD kernels.
    //
    // pVect1v / pVect2v: pointers to float arrays of `qty` elements each
    // qty_ptr:           pointer to the element count (size_t)
    // returns:           sum over i of (pVect1[i] - pVect2[i])^2
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);

    float res = 0;
    for (size_t i = 0; i < qty; i++) {
        float t = *pVect1 - *pVect2;
        pVect1++;
        pVect2++;
        res += t * t;
    }
    return (res);
}

#if defined(USE_AVX)
Expand Down Expand Up @@ -49,10 +52,8 @@ namespace hnswlib {
}

_mm256_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];

return (res);
}
return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
}

#elif defined(USE_SSE)

Expand All @@ -62,12 +63,9 @@ namespace hnswlib {
float *pVect2 = (float *) pVect2v;
size_t qty = *((size_t *) qty_ptr);
float PORTABLE_ALIGN32 TmpRes[8];
// size_t qty4 = qty >> 2;
size_t qty16 = qty >> 4;

const float *pEnd1 = pVect1 + (qty16 << 4);
// const float* pEnd2 = pVect1 + (qty4 << 2);
// const float* pEnd3 = pVect1 + qty;

__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
Expand Down Expand Up @@ -102,10 +100,24 @@ namespace hnswlib {
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
}

_mm_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
}
#endif

return (res);
#if defined(USE_SSE) || defined(USE_AVX)
static float
L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    // Distance for dimensions that are not a multiple of 16: run the SIMD
    // kernel over the largest multiple-of-16 prefix, then finish the
    // remaining (< 16) dimensions with the scalar L2Sqr.
    size_t qty = *((size_t *) qty_ptr);
    size_t head = (qty >> 4) << 4;   // dimensions covered by the SIMD kernel
    size_t tail = qty - head;        // leftover dimensions

    float sum = L2SqrSIMD16Ext(pVect1v, pVect2v, &head);
    sum += L2Sqr((float *) pVect1v + head, (float *) pVect2v + head, &tail);
    return sum;
}
#endif

Expand All @@ -119,10 +131,9 @@ namespace hnswlib {
size_t qty = *((size_t *) qty_ptr);


// size_t qty4 = qty >> 2;
size_t qty16 = qty >> 2;
size_t qty4 = qty >> 2;

const float *pEnd1 = pVect1 + (qty16 << 2);
const float *pEnd1 = pVect1 + (qty4 << 2);

__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
Expand All @@ -136,9 +147,22 @@ namespace hnswlib {
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
}
_mm_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
}

return (res);
static float
L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    // Distance for dimensions that are not a multiple of 4: run the SIMD
    // kernel over the largest multiple-of-4 prefix, then finish the
    // remaining (< 4) dimensions with the scalar L2Sqr.
    size_t qty = *((size_t *) qty_ptr);
    size_t head = (qty >> 2) << 2;   // dimensions covered by the SIMD kernel
    size_t tail = qty - head;        // leftover dimensions

    float sum = L2SqrSIMD4Ext(pVect1v, pVect2v, &head);
    sum += L2Sqr((float *) pVect1v + head, (float *) pVect2v + head, &tail);
    return sum;
}
#endif

Expand All @@ -151,13 +175,14 @@ namespace hnswlib {
L2Space(size_t dim) {
fstdistfunc_ = L2Sqr;
#if defined(USE_SSE) || defined(USE_AVX)
if (dim % 4 == 0)
fstdistfunc_ = L2SqrSIMD4Ext;
if (dim % 16 == 0)
fstdistfunc_ = L2SqrSIMD16Ext;
/*else{
throw runtime_error("Data type not supported!");
}*/
else if (dim % 4 == 0)
fstdistfunc_ = L2SqrSIMD4Ext;
else if (dim > 16)
fstdistfunc_ = L2SqrSIMD16ExtResiduals;
else if (dim > 4)
fstdistfunc_ = L2SqrSIMD4ExtResiduals;
#endif
dim_ = dim;
data_size_ = dim * sizeof(float);
Expand Down Expand Up @@ -185,10 +210,6 @@ namespace hnswlib {
int res = 0;
unsigned char *a = (unsigned char *) pVect1;
unsigned char *b = (unsigned char *) pVect2;
/*for (int i = 0; i < qty; i++) {
int t = int((a)[i]) - int((b)[i]);
res += t*t;
}*/

qty = qty >> 2;
for (size_t i = 0; i < qty; i++) {
Expand Down Expand Up @@ -241,4 +262,4 @@ namespace hnswlib {
};


}
}

0 comments on commit df20d5d

Please sign in to comment.