Allow unaligned stocks (Menooker#25)

Jie-Lei · Aug 27, 2024 · 87d4e08 · 87d4e08
1 parent a73c1b6
commit 87d4e08
Show file tree

Hide file tree

Showing 21 changed files with 335 additions and 94 deletions.
diff --git a/Customize.md b/Customize.md
@@ -147,4 +147,13 @@ Note that `MyLib` corresponds to the directory name in `projects/`, and `"my_lib
 
 You can check the script in `projects/` for more examples using KunQuant to convert expressions to C++ source code file.
 
-More reading on operators provided by KunQuant: See [Operators.md](./Operators.md)
+More reading on operators provided by KunQuant: See [Operators.md](./Operators.md)
+
+## Performance tuning
+
+There are some configurable options of function `compileit(...)` above that may improve the performance (and maybe at the cost of accuracy).
+
+ * Input and output memory layout: `compileit(input_layout=?, output_layout=?)`. This affects how data are arranged in memory. Usually `STs` layout is faster than `TS` but may require some additional memory movement when you call the factor library.
+ * Partition factor: `compileit(partition_factor=some_int)`. A larger Partition factor will put more computations in a single generated function in C++. Enlarging Partition factor may reduce the overhead of thread-scheduling and eliminate some of the temp buffers. However, if the factor is too high, the generated C++ code will suffer from register-spilling.
+ * Blocking len: `compileit(blocking_len=some_int)`. It selects AVX2 or AVX512 instruction sets. Using AVX512 might have some slight performance gain over AVX2.
+ * Unaligned stock number: `compileit(allow_unaligned=some_bool)`. By default `True`. When `allow_unaligned` is set to false, the generated C++ code will assume the number of stocks to be aligned with the SIMD length (e.g., 8 float32 on AVX2). This will slightly improve the performance.
diff --git a/KunQuant/Driver.py b/KunQuant/Driver.py
@@ -1,13 +1,13 @@
 from KunQuant.passes import *
 from KunQuant.Stage import Function
 from KunQuant.Op import Input, Output, OpBase, CrossSectionalOp
-from typing import Dict, List
+from typing import Dict, List, Union
 import typing
 from collections import OrderedDict
 from dataclasses import dataclass
 from KunQuant.passes import Util as PassUtil
 
-required_version = "0x64100002"
+required_version = "0x64100003"
 
 def optimize(f: Function, options: dict)->Dict[str, int]:
     if PassUtil.debug_mode:
@@ -65,7 +65,7 @@ def _deprecation_check(name: str, argname: str) -> str:
         return "STs"
     return name
 
-def compileit(f: Function, module_name: str, partition_factor = 3, dtype = "float", blocking_len = None, input_layout = "STs", output_layout = "STs", options = {}):
+def compileit(f: Function, module_name: str, partition_factor = 3, dtype = "float", blocking_len = None, input_layout = "STs", output_layout = "STs", allow_unaligned: Union[bool, None] = None, options = {}):
     input_layout = _deprecation_check(input_layout, "input_layout")
     output_layout = _deprecation_check(output_layout, "input_layout")
     if dtype not in ["float", "double"]:
@@ -84,6 +84,13 @@ def compileit(f: Function, module_name: str, partition_factor = 3, dtype = "floa
 
     if stream_mode and options.get("opt_reduce", False):
         raise RuntimeError("Currently opt_reduce in stream mode is not supported.")
+    if stream_mode and allow_unaligned is None:
+        allow_unaligned = False
+    elif allow_unaligned is None:
+        allow_unaligned = True
+    if allow_unaligned and stream_mode:
+        raise RuntimeError("Currently allow_unaligned in stream mode is not supported.")
+
     input_name_to_idx: Dict[str, int] = dict()
     buffer_names: List[_Buffer] = []
     partitions: typing.OrderedDict[str, _Partition] = OrderedDict()
@@ -152,7 +159,7 @@ def set_buffer_layout(op: OpBase, buf: _Buffer):
         def query_temp_buf_id(tempname: str, window: int) -> int:
             input_windows[tempname] = window
             return insert_name_str(tempname, "TEMP").idx
-        src = codegen_cpp(func, input_name_to_idx, ins, outs, options, stream_mode, query_temp_buf_id, input_windows, dtype, blocking_len)
+        src = codegen_cpp(func, input_name_to_idx, ins, outs, options, stream_mode, query_temp_buf_id, input_windows, dtype, blocking_len, not allow_unaligned)
         impl_src.append(src)
         newparti = _Partition(func.name, len(partitions), pins, pouts)
         if len(func.ops) == 3 and isinstance(func.ops[1], CrossSectionalOp):
@@ -212,6 +219,7 @@ def query_temp_buf_id(tempname: str, window: int) -> int:
     MemoryLayout::{input_layout},
     MemoryLayout::{output_layout},
     {blocking_len},
-    Datatype::{dty}
+    Datatype::{dty},
+    {"0" if allow_unaligned else "1"}
 }};''')
     return "\n\n".join(impl_src)
diff --git a/KunQuant/passes/CodegenCpp.py b/KunQuant/passes/CodegenCpp.py
@@ -70,7 +70,7 @@ def _value_to_float(op: OpBase, dtype: str) -> str:
 
 vector_len = 8
 
-def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tuple[Input, bool]], outputs: List[Tuple[Output, bool]], options: dict, stream_mode: bool, query_temp_buffer_id, stream_window_size: Dict[str, int], elem_type: str, simd_lanes: int) -> str:
+def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tuple[Input, bool]], outputs: List[Tuple[Output, bool]], options: dict, stream_mode: bool, query_temp_buffer_id, stream_window_size: Dict[str, int], elem_type: str, simd_lanes: int, aligned: bool) -> str:
     if len(f.ops) == 3 and isinstance(f.ops[1], CrossSectionalOp):
         return f'''static auto stage_{f.name} = {f.ops[1].__class__.__name__}Stocks<Mapper{f.ops[0].attrs["layout"]}<{elem_type}, {simd_lanes}>, Mapper{f.ops[2].attrs["layout"]}<{elem_type}, {simd_lanes}>>;'''
     header = f'''static void stage_{f.name}(Context* __ctx, size_t __stock_idx, size_t __total_time, size_t __start, size_t __length) '''
@@ -92,7 +92,9 @@ def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tup
             buffer_type[inp] = f"Input{layout}<{elem_type}, {simd_lanes}>"
             code = f"Input{layout}<{elem_type}, {simd_lanes}> buf_{name}{{__ctx->buffers[{idx_in_ctx}].ptr{ptrname}, __stock_idx, __ctx->stock_count, {total_str}, {start_str}}};"
         toplevel.scope.append(_CppSingleLine(toplevel, code))
-
+    if not aligned:
+        toplevel.scope.append(_CppSingleLine(toplevel, f'''auto todo_count = __ctx->stock_count - __stock_idx  * {simd_lanes};'''))
+        toplevel.scope.append(_CppSingleLine(toplevel, f'''auto mask = kun_simd::vec<{elem_type}, {simd_lanes}>::make_mask(todo_count > {simd_lanes} ? {simd_lanes} : todo_count);'''))
     for idx, (outp, is_tmp) in enumerate(outputs):
         name = outp.attrs["name"]
         layout = outp.attrs["layout"]
@@ -131,10 +133,18 @@ def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tup
         scope = loop_to_cpp_loop[op.get_parent()]
         if isinstance(op, Input):
             name = op.attrs["name"]
-            scope.scope.append(_CppSingleLine(scope, f"auto v{idx} = buf_{name}.step(i);"))
+            if not aligned and op.attrs["layout"] == "TS":
+                mask_str = ", mask"
+            else:
+                mask_str = ""
+            scope.scope.append(_CppSingleLine(scope, f"auto v{idx} = buf_{name}.step(i{mask_str});"))
         elif isinstance(op, Output):
             name = op.attrs["name"]
-            scope.scope.append(_CppSingleLine(scope, f"buf_{name}.store(i, v{inp[0]});"))
+            if not aligned and op.attrs["layout"] == "TS":
+                mask_str = ", mask"
+            else:
+                mask_str = ""
+            scope.scope.append(_CppSingleLine(scope, f"buf_{name}.store(i, v{inp[0]}{mask_str});"))
             scope.scope.append(_CppSingleLine(scope, f"auto v{idx} = v{inp[0]};"))
         elif isinstance(op, WindowedTempOutput):
             scope.scope.append(_CppSingleLine(scope, f"temp_{idx}.store(i, v{inp[0]});"))
@@ -226,5 +236,5 @@ def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tup
         elif isinstance(op, Select):
             scope.scope.append(_CppSingleLine(scope, f"auto v{idx} = Select(v{inp[0]}, v{inp[1]}, v{inp[2]});"))
         else:
-            raise RuntimeError("Cannot generate " + str(op))
+            raise RuntimeError(f"Cannot generate {op} of function {f}")
     return header + str(toplevel)
diff --git a/Readme.md b/Readme.md
@@ -52,7 +52,7 @@ g++=11.4.0
 * x86-64 CPU with at least AVX2-FMA instruction set
 * Optionally requires AVX512 on CPU for better performance
 
-**Important node**: Currently KunQuant only supports a multiple of `{blocking_len}` as the number of stocks as inputs. For single-precision float type and AVX2 instruction set, `blocking_len=8`. That is, you can only input 8, 16, 24, ..., etc. stocks in a batch, if your code is compiled with AVX2 (without AVX512) and `float` datatype.
+**Important node**: For better performance compared with Pandas, KunQuant suggests to use a multiple of `{blocking_len}` as the number of stocks in inputs. For single-precision float type and AVX2 instruction set, `blocking_len=8`. That is, you are suggested to input 8, 16, 24, ..., etc. stocks in a batch, if your code is compiled with AVX2 (without AVX512) and `float` datatype. Other numbers of stocks **are supported**, with lower execution performance.
 
 ## Compiling and running Alpha101
 

diff --git a/cpp/Kun/Module.hpp b/cpp/Kun/Module.hpp
@@ -22,6 +22,7 @@ struct Module {
     MemoryLayout output_layout;
     size_t blocking_len;
     Datatype dtype;
+    size_t aligned;
 };
 
 struct Library {

diff --git a/cpp/Kun/Ops.hpp b/cpp/Kun/Ops.hpp
@@ -67,6 +67,9 @@ struct InputTS : DataSource<true> {
     }
 
     simd_t step(size_t index) { return simd_t::load(getPtr(index)); }
+    simd_t step(size_t index, const typename simd_t::Masktype &mask) {
+        return simd_t::masked_load(getPtr(index), mask);
+    }
 
     simd_t getWindow(size_t index, size_t offset) {
         if (index < offset) {
@@ -97,7 +100,6 @@ struct OutputSTs : DataSource<true> {
     void store(size_t index, const simd_t &v) {
         simd_t::store(v, &buf[index * stride]);
     }
-
     simd_t getWindow(size_t index, size_t offset) {
         if (index < offset) {
             return NAN;
@@ -133,6 +135,10 @@ struct OutputTS : DataSource<true> {
     void store(size_t index, const simd_t &v) {
         simd_t::store(v, getPtr(index));
     }
+    void store(size_t index, const simd_t &v,
+               const typename simd_t::Masktype &mask) {
+        simd_t::masked_store(v, getPtr(index), mask);
+    }
 
     simd_t getWindow(size_t index, size_t offset) {
         if (index < offset) {
@@ -256,6 +262,27 @@ struct RequireWindow {
 using kun_simd::sc_isnan;
 using kun_simd::sc_select;
 
+template <typename T, int stride>
+INLINE kun_simd::vec<T, stride>
+kahanAdd(typename kun_simd::vec<T, stride>::Masktype isnan_small,
+         kun_simd::vec<T, stride> sum, kun_simd::vec<T, stride> small,
+         kun_simd::vec<T, stride> &compensation) {
+    auto y = small - compensation;
+    auto t = sum + y;
+    compensation = sc_select(isnan_small, compensation, t - sum - y);
+    return t;
+}
+
+template <typename T, int stride>
+INLINE kun_simd::vec<T, stride>
+kahanAdd(kun_simd::vec<T, stride> sum, kun_simd::vec<T, stride> small,
+         kun_simd::vec<T, stride> &compensation) {
+    auto y = small - compensation;
+    auto t = sum + y;
+    compensation = t - sum - y;
+    return t;
+}
+
 template <typename T, int stride, int window>
 struct FastWindowedSum {
     using simd_t = kun_simd::vec<T, stride>;
@@ -264,6 +291,8 @@ struct FastWindowedSum {
     using int_mask_t = typename simd_int_t::Masktype;
     using float_mask_t = typename simd_t::Masktype;
     simd_t v = 0;
+    simd_t compensationAdd = 0;
+    simd_t compensationSub = 0;
     simd_int_t num_nans = window;
     template <typename TInput>
     simd_t step(TInput &input, simd_t cur, size_t index) {
@@ -272,9 +301,11 @@ struct FastWindowedSum {
         auto old_is_nan = sc_isnan(old);
         auto new_is_nan = sc_isnan(cur);
         // v = old_is_nan? v : (v-old)
-        v = sc_select(old_is_nan, v, v - old);
+        v = sc_select(old_is_nan, v,
+                      kahanAdd(old_is_nan, v, 0 - old, compensationSub));
         // v = new_is_nan? v : (v+cur)
-        v = sc_select(new_is_nan, v, v + cur);
+        v = sc_select(new_is_nan, v,
+                      kahanAdd(new_is_nan, v, cur, compensationAdd));
         num_nans =
             num_nans - sc_select(kun_simd::bitcast<int_mask_t>(old_is_nan),
                                  simd_int_t{1}, simd_int_t{0});
@@ -384,7 +415,10 @@ template <typename T, int stride>
 struct ReduceAdd {
     using simd_t = kun_simd::vec<T, stride>;
     simd_t v = 0;
-    void step(simd_t input, size_t index) { v = v + input; }
+    simd_t compensation = 0;
+    void step(simd_t input, size_t index) {
+        v = kahanAdd(v, input, compensation);
+    }
     operator simd_t() { return v; }
 };
 

diff --git a/cpp/Kun/Rank.hpp b/cpp/Kun/Rank.hpp
@@ -48,7 +48,9 @@ void KUN_TEMPLATE_EXPORT RankStocks(RuntimeStage *stage, size_t time_idx,
                 auto pos = std::equal_range(data.begin(), data.end(), in);
                 auto start = pos.first - data.begin();
                 auto end = pos.second - data.begin();
-                out = ((start + end - 1) / 2.0f + 1.0f) / data.size();
+                auto sum = (start + end + 1) * (end - start) / 2;
+                out = T(sum) / T(end - start) / T(data.size());
+                // out = ((start + end - 1) / T{2.0} + T{1.0}) / data.size();
             } else {
                 out = NAN;
             }

diff --git a/cpp/Kun/Runtime.cpp b/cpp/Kun/Runtime.cpp
@@ -71,7 +71,7 @@ void checkedDealloc(void *ptr, size_t sz) {
 #endif
 
 namespace kun {
-static const uint64_t VERSION = 0x64100002;
+static const uint64_t VERSION = 0x64100003;
 
 void Buffer::alloc(size_t count, size_t use_count, size_t elem_size) {
     if (!ptr) {
@@ -195,7 +195,7 @@ void runGraph(std::shared_ptr<Executor> exec, const Module *m,
     Context ctx{std::move(rtlbuffers),
                 {},
                 exec,
-                num_stocks * length,
+                divideAndCeil(num_stocks, m->blocking_len) * m->blocking_len * length,
                 num_stocks,
                 total_time,
                 cur_time,

diff --git a/cpp/KunSIMD/cpu/cast.hpp b/cpp/KunSIMD/cpu/cast.hpp
@@ -1,10 +1,10 @@
+#include "f32x16.hpp"
 #include "f32x8.hpp"
 #include "f64x4.hpp"
+#include "f64x8.hpp"
+#include "s32x16.hpp"
 #include "s32x8.hpp"
 #include "s64x4.hpp"
-#include "s32x16.hpp"
-#include "f32x16.hpp"
-#include "f64x8.hpp"
 #include "s64x8.hpp"
 
 namespace kun_simd {
@@ -64,7 +64,6 @@ INLINE vec_s64x8 bitcast(vec_f64x8 v) {
 }
 #endif
 
-
 template <typename T1, typename T2>
 inline T1 bitcast(T2 v) {
     static_assert(sizeof(T1) == sizeof(T2), "unmatched bitcast");
@@ -128,7 +127,6 @@ INLINE vec_s32x16 fast_cast(vec_f32x16 v) {
 
 /////// end of f32 <==> s32
 
-
 /////// start of f64x4 <==> s64x4
 #if !defined(__AVX512DQ__) || !defined(__AVX512VL__)
 
@@ -192,7 +190,6 @@ INLINE vec_f64x4 fast_cast(vec_s64x4 v) {
 
 /////// end of f64x4 <==> s64x4
 
-
 /////// end of f64x8 <==> s64x8 with AVX512DQ
 #if defined(__AVX512F__) && defined(__AVX512DQ__)
 template <>
@@ -250,4 +247,14 @@ INLINE vec_s64x8 cast(vec_f64x8 v) {
     return _mm512_add_epi64(_mm512_slli_epi64(v_hi.v, 32), v_lo);
 }
 #endif
+
+INLINE vec_f32x8::Masktype vec_f32x8::make_mask(int N) {
+    return bitcast<vec_f32x8::Masktype>(vec_s32x8{N} >
+                                        vec_s32x8{0, 1, 2, 3, 4, 5, 6, 7});
+}
+
+INLINE vec_f64x4::Masktype vec_f64x4::make_mask(int N) {
+    return bitcast<vec_f64x4::Masktype>(vec_s64x4{N} > vec_s64x4{0, 1, 2, 3});
+}
+
 } // namespace kun_simd
diff --git a/cpp/KunSIMD/cpu/f32x16.hpp b/cpp/KunSIMD/cpu/f32x16.hpp
@@ -49,7 +49,16 @@ struct alignas(64) vec<float, 16> {
     static INLINE void store_aligned(vec v, float *p) {
         _mm512_store_ps(p, v.v);
     }
+    static INLINE vec masked_load(const float *p, Masktype mask) {
+        return _mm512_mask_loadu_ps(vec{0}, mask, p);
+    }
+    static INLINE void masked_store(vec v, float *p, Masktype mask) {
+        _mm512_mask_storeu_ps(p, mask, v.v);
+    }
 
+    static INLINE Masktype make_mask(int N) {
+        return (Masktype(1) << Masktype(N)) - Masktype(1);
+    }
     operator __m512() const { return v; }
 };
 

diff --git a/cpp/KunSIMD/cpu/f32x8.hpp b/cpp/KunSIMD/cpu/f32x8.hpp
@@ -51,6 +51,14 @@ struct alignas(32) vec<float, 8> {
     static INLINE void store_aligned(vec v, float *p) {
         _mm256_store_ps(p, v.v);
     }
+    static INLINE vec masked_load(const float *p, Masktype mask) {
+        return _mm256_maskload_ps(p, _mm256_castps_si256(mask));
+    }
+    static INLINE void masked_store(vec v, float *p, Masktype mask) {
+        _mm256_maskstore_ps(p, _mm256_castps_si256(mask), v.v);
+    }
+
+    static Masktype make_mask(int N);
 
     operator __m256() const { return v; }
 };
@@ -149,11 +157,11 @@ INLINE vec_f32x8 sc_abs(vec_f32x8 const &a) {
     return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a.v);
 }
 
-inline vec_f32x8 sc_isnan(vec_f32x8 v1, vec_f32x8 v2) {
+INLINE vec_f32x8 sc_isnan(vec_f32x8 v1, vec_f32x8 v2) {
     return _mm256_cmp_ps(v1.v, v2.v, _CMP_UNORD_Q);
 }
 
-inline vec_f32x8 sc_isnan(vec_f32x8 v1) {
+INLINE vec_f32x8 sc_isnan(vec_f32x8 v1) {
     return _mm256_cmp_ps(v1.v, v1.v, _CMP_UNORD_Q);
 }
 

diff --git a/cpp/KunSIMD/cpu/f64x4.hpp b/cpp/KunSIMD/cpu/f64x4.hpp
@@ -50,7 +50,14 @@ struct alignas(32) vec<double, 4> {
     static INLINE void store_aligned(vec v, double *p) {
         _mm256_store_pd(p, v.v);
     }
+    static INLINE vec masked_load(const double *p, Masktype mask) {
+        return _mm256_maskload_pd(p, _mm256_castpd_si256(mask));
+    }
+    static INLINE void masked_store(vec v, double *p, Masktype mask) {
+        _mm256_maskstore_pd(p, _mm256_castpd_si256(mask), v.v);
+    }
 
+    static Masktype make_mask(int N);
     operator __m256d() const { return v; }
 };
 

diff --git a/cpp/KunSIMD/cpu/f64x8.hpp b/cpp/KunSIMD/cpu/f64x8.hpp
@@ -33,7 +33,16 @@ struct alignas(64) vec<double, 8> {
     static INLINE void store_aligned(vec v, double *p) {
         _mm512_store_pd(p, v.v);
     }
+    static INLINE vec masked_load(const double *p, Masktype mask) {
+        return _mm512_mask_loadu_pd(vec{0}, mask, p);
+    }
+    static INLINE void masked_store(vec v, double *p, Masktype mask) {
+        _mm512_mask_storeu_pd(p, mask, v.v);
+    }
 
+    static INLINE Masktype make_mask(int N) {
+        return (Masktype(1) << Masktype(N)) - Masktype(1);
+    }
     operator __m512d() const { return v; }
 };