Skip to content

Commit

Permalink
[L2Space] Perf improvement for dimension not of factor 4 and 16
Browse files Browse the repository at this point in the history
Currently SIMD (SSE or AVX) is used only when the dimension is a
multiple of 4 or 16; when the dimension is not an exact multiple of
4 or 16, a slower non-vectorized method is used.

To improve performance for these cases new methods are added:
`L2SqrSIMD(4|16)ExtResiduals` - they rely on the existing
`L2SqrSIMD(4|16)Ext` to compute the largest multiple-of-4 (or -16)
prefix of the dimensions, and finish the residual computation with
the scalar method `L2Sqr`.

Performance improvement compared to baseline is x3-4 times depending on
dimension. Benchmark results:

Run on (4 X 3300 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 4096 KiB (x1)
Load Average: 2.18, 2.35, 3.88
-----------------------------------------------------------
Benchmark          Time             CPU        Iterations
-----------------------------------------------------------
TstDim65        14.7 ns         14.7 ns     20 * 47128209
RefDim65        50.2 ns         50.1 ns     20 * 10373751
TstDim101       24.7 ns         24.7 ns     20 * 28064436
RefDim101       90.4 ns         90.2 ns     20 *  7592191
TstDim129       31.4 ns         31.3 ns     20 * 22397921
RefDim129        125 ns          124 ns     20 *  5548862
TstDim257       59.3 ns         59.2 ns     20 * 10856753
RefDim257        266 ns          266 ns     20 *  2630926
  • Loading branch information
2ooom committed Apr 19, 2020
1 parent 050044d commit df20d5d
Showing 1 changed file with 50 additions and 29 deletions.
79 changes: 50 additions & 29 deletions hnswlib/space_l2.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@
namespace hnswlib {

static float
L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    // Scalar (non-vectorized) squared Euclidean (L2) distance between two
    // float vectors. Serves as the generic fallback and computes the
    // residual dimensions left over by the SIMD kernels.
    //
    // pVect1v / pVect2v: pointers to float arrays of `qty` elements each
    // qty_ptr:           pointer to the element count (size_t)
    // returns:           sum over i of (pVect1[i] - pVect2[i])^2
    float *pVect1 = (float *) pVect1v;
    float *pVect2 = (float *) pVect2v;
    size_t qty = *((size_t *) qty_ptr);

    float res = 0;
    for (size_t i = 0; i < qty; i++) {
        float t = *pVect1 - *pVect2;
        pVect1++;
        pVect2++;
        res += t * t;
    }
    return (res);
}

#if defined(USE_AVX)
Expand Down Expand Up @@ -49,10 +52,8 @@ namespace hnswlib {
}

_mm256_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];

return (res);
}
return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
}

#elif defined(USE_SSE)

Expand All @@ -62,12 +63,9 @@ namespace hnswlib {
float *pVect2 = (float *) pVect2v;
size_t qty = *((size_t *) qty_ptr);
float PORTABLE_ALIGN32 TmpRes[8];
// size_t qty4 = qty >> 2;
size_t qty16 = qty >> 4;

const float *pEnd1 = pVect1 + (qty16 << 4);
// const float* pEnd2 = pVect1 + (qty4 << 2);
// const float* pEnd3 = pVect1 + qty;

__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
Expand Down Expand Up @@ -102,10 +100,24 @@ namespace hnswlib {
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
}

_mm_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
}
#endif

return (res);
#if defined(USE_SSE) || defined(USE_AVX)
static float
L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    // Distance for dimensions that are not a multiple of 16: run the SIMD
    // kernel over the largest multiple-of-16 prefix, then finish the
    // remaining (< 16) dimensions with the scalar L2Sqr.
    size_t qty = *((size_t *) qty_ptr);
    size_t head = (qty >> 4) << 4;   // dimensions covered by the SIMD kernel
    size_t tail = qty - head;        // leftover dimensions

    float sum = L2SqrSIMD16Ext(pVect1v, pVect2v, &head);
    sum += L2Sqr((float *) pVect1v + head, (float *) pVect2v + head, &tail);
    return sum;
}
#endif

Expand All @@ -119,10 +131,9 @@ namespace hnswlib {
size_t qty = *((size_t *) qty_ptr);


// size_t qty4 = qty >> 2;
size_t qty16 = qty >> 2;
size_t qty4 = qty >> 2;

const float *pEnd1 = pVect1 + (qty16 << 2);
const float *pEnd1 = pVect1 + (qty4 << 2);

__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
Expand All @@ -136,9 +147,22 @@ namespace hnswlib {
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
}
_mm_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
}

return (res);
static float
L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
    // Distance for dimensions that are not a multiple of 4: run the SIMD
    // kernel over the largest multiple-of-4 prefix, then finish the
    // remaining (< 4) dimensions with the scalar L2Sqr.
    size_t qty = *((size_t *) qty_ptr);
    size_t head = (qty >> 2) << 2;   // dimensions covered by the SIMD kernel
    size_t tail = qty - head;        // leftover dimensions

    float sum = L2SqrSIMD4Ext(pVect1v, pVect2v, &head);
    sum += L2Sqr((float *) pVect1v + head, (float *) pVect2v + head, &tail);
    return sum;
}
#endif

Expand All @@ -151,13 +175,14 @@ namespace hnswlib {
L2Space(size_t dim) {
fstdistfunc_ = L2Sqr;
#if defined(USE_SSE) || defined(USE_AVX)
if (dim % 4 == 0)
fstdistfunc_ = L2SqrSIMD4Ext;
if (dim % 16 == 0)
fstdistfunc_ = L2SqrSIMD16Ext;
/*else{
throw runtime_error("Data type not supported!");
}*/
else if (dim % 4 == 0)
fstdistfunc_ = L2SqrSIMD4Ext;
else if (dim > 16)
fstdistfunc_ = L2SqrSIMD16ExtResiduals;
else if (dim > 4)
fstdistfunc_ = L2SqrSIMD4ExtResiduals;
#endif
dim_ = dim;
data_size_ = dim * sizeof(float);
Expand Down Expand Up @@ -185,10 +210,6 @@ namespace hnswlib {
int res = 0;
unsigned char *a = (unsigned char *) pVect1;
unsigned char *b = (unsigned char *) pVect2;
/*for (int i = 0; i < qty; i++) {
int t = int((a)[i]) - int((b)[i]);
res += t*t;
}*/

qty = qty >> 2;
for (size_t i = 0; i < qty; i++) {
Expand Down Expand Up @@ -241,4 +262,4 @@ namespace hnswlib {
};


}
}

0 comments on commit df20d5d

Please sign in to comment.