Skip to content

Commit

Permalink
Allow unaligned stocks (Menooker#25)
Browse files Browse the repository at this point in the history
  • Loading branch information
Menooker authored Aug 27, 2024
1 parent a73c1b6 commit 87d4e08
Show file tree
Hide file tree
Showing 21 changed files with 335 additions and 94 deletions.
11 changes: 10 additions & 1 deletion Customize.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,4 +147,13 @@ Note that `MyLib` corresponds to the directory name in `projects/`, and `"my_lib

You can check the script in `projects/` for more examples using KunQuant to convert expressions to C++ source code file.

More reading on operators provided by KunQuant: See [Operators.md](./Operators.md)
More reading on operators provided by KunQuant: See [Operators.md](./Operators.md)

## Performance tuning

There are some configurable options of function `compileit(...)` above that may improve the performance (and maybe at the cost of accuracy).

* Input and output memory layout: `compileit(input_layout=?, output_layout=?)`. This affects how data are arranged in memory. Usually `STs` layout is faster than `TS` but may require some additional memory movement when you call the factor library.
* Partition factor: `compileit(partition_factor=some_int)`. A larger Partition factor will put more computations in a single generated function in C++. Enlarging Partition factor may reduce the overhead of thread-scheduling and eliminate some of the temp buffers. However, if the factor is too high, the generated C++ code will suffer from register-spilling.
* Blocking len: `compileit(blocking_len=some_int)`. It selects AVX2 or AVX512 instruction sets. Using AVX512 might have some slight performance gain over AVX2.
* Unaligned stock number: `compileit(allow_unaligned=some_bool)`. By default `True`. When `allow_unaligned` is set to false, the generated C++ code will assume the number of stocks to be aligned with the SIMD length (e.g., 8 float32 on AVX2). This will slightly improve the performance.
18 changes: 13 additions & 5 deletions KunQuant/Driver.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from KunQuant.passes import *
from KunQuant.Stage import Function
from KunQuant.Op import Input, Output, OpBase, CrossSectionalOp
from typing import Dict, List
from typing import Dict, List, Union
import typing
from collections import OrderedDict
from dataclasses import dataclass
from KunQuant.passes import Util as PassUtil

required_version = "0x64100002"
required_version = "0x64100003"

def optimize(f: Function, options: dict)->Dict[str, int]:
if PassUtil.debug_mode:
Expand Down Expand Up @@ -65,7 +65,7 @@ def _deprecation_check(name: str, argname: str) -> str:
return "STs"
return name

def compileit(f: Function, module_name: str, partition_factor = 3, dtype = "float", blocking_len = None, input_layout = "STs", output_layout = "STs", options = {}):
def compileit(f: Function, module_name: str, partition_factor = 3, dtype = "float", blocking_len = None, input_layout = "STs", output_layout = "STs", allow_unaligned: Union[bool, None] = None, options = {}):
input_layout = _deprecation_check(input_layout, "input_layout")
output_layout = _deprecation_check(output_layout, "input_layout")
if dtype not in ["float", "double"]:
Expand All @@ -84,6 +84,13 @@ def compileit(f: Function, module_name: str, partition_factor = 3, dtype = "floa

if stream_mode and options.get("opt_reduce", False):
raise RuntimeError("Currently opt_reduce in stream mode is not supported.")
if stream_mode and allow_unaligned is None:
allow_unaligned = False
elif allow_unaligned is None:
allow_unaligned = True
if allow_unaligned and stream_mode:
raise RuntimeError("Currently allow_unaligned in stream mode is not supported.")

input_name_to_idx: Dict[str, int] = dict()
buffer_names: List[_Buffer] = []
partitions: typing.OrderedDict[str, _Partition] = OrderedDict()
Expand Down Expand Up @@ -152,7 +159,7 @@ def set_buffer_layout(op: OpBase, buf: _Buffer):
def query_temp_buf_id(tempname: str, window: int) -> int:
input_windows[tempname] = window
return insert_name_str(tempname, "TEMP").idx
src = codegen_cpp(func, input_name_to_idx, ins, outs, options, stream_mode, query_temp_buf_id, input_windows, dtype, blocking_len)
src = codegen_cpp(func, input_name_to_idx, ins, outs, options, stream_mode, query_temp_buf_id, input_windows, dtype, blocking_len, not allow_unaligned)
impl_src.append(src)
newparti = _Partition(func.name, len(partitions), pins, pouts)
if len(func.ops) == 3 and isinstance(func.ops[1], CrossSectionalOp):
Expand Down Expand Up @@ -212,6 +219,7 @@ def query_temp_buf_id(tempname: str, window: int) -> int:
MemoryLayout::{input_layout},
MemoryLayout::{output_layout},
{blocking_len},
Datatype::{dty}
Datatype::{dty},
{"0" if allow_unaligned else "1"}
}};''')
return "\n\n".join(impl_src)
20 changes: 15 additions & 5 deletions KunQuant/passes/CodegenCpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def _value_to_float(op: OpBase, dtype: str) -> str:

vector_len = 8

def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tuple[Input, bool]], outputs: List[Tuple[Output, bool]], options: dict, stream_mode: bool, query_temp_buffer_id, stream_window_size: Dict[str, int], elem_type: str, simd_lanes: int) -> str:
def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tuple[Input, bool]], outputs: List[Tuple[Output, bool]], options: dict, stream_mode: bool, query_temp_buffer_id, stream_window_size: Dict[str, int], elem_type: str, simd_lanes: int, aligned: bool) -> str:
if len(f.ops) == 3 and isinstance(f.ops[1], CrossSectionalOp):
return f'''static auto stage_{f.name} = {f.ops[1].__class__.__name__}Stocks<Mapper{f.ops[0].attrs["layout"]}<{elem_type}, {simd_lanes}>, Mapper{f.ops[2].attrs["layout"]}<{elem_type}, {simd_lanes}>>;'''
header = f'''static void stage_{f.name}(Context* __ctx, size_t __stock_idx, size_t __total_time, size_t __start, size_t __length) '''
Expand All @@ -92,7 +92,9 @@ def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tup
buffer_type[inp] = f"Input{layout}<{elem_type}, {simd_lanes}>"
code = f"Input{layout}<{elem_type}, {simd_lanes}> buf_{name}{{__ctx->buffers[{idx_in_ctx}].ptr{ptrname}, __stock_idx, __ctx->stock_count, {total_str}, {start_str}}};"
toplevel.scope.append(_CppSingleLine(toplevel, code))

if not aligned:
toplevel.scope.append(_CppSingleLine(toplevel, f'''auto todo_count = __ctx->stock_count - __stock_idx * {simd_lanes};'''))
toplevel.scope.append(_CppSingleLine(toplevel, f'''auto mask = kun_simd::vec<{elem_type}, {simd_lanes}>::make_mask(todo_count > {simd_lanes} ? {simd_lanes} : todo_count);'''))
for idx, (outp, is_tmp) in enumerate(outputs):
name = outp.attrs["name"]
layout = outp.attrs["layout"]
Expand Down Expand Up @@ -131,10 +133,18 @@ def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tup
scope = loop_to_cpp_loop[op.get_parent()]
if isinstance(op, Input):
name = op.attrs["name"]
scope.scope.append(_CppSingleLine(scope, f"auto v{idx} = buf_{name}.step(i);"))
if not aligned and op.attrs["layout"] == "TS":
mask_str = ", mask"
else:
mask_str = ""
scope.scope.append(_CppSingleLine(scope, f"auto v{idx} = buf_{name}.step(i{mask_str});"))
elif isinstance(op, Output):
name = op.attrs["name"]
scope.scope.append(_CppSingleLine(scope, f"buf_{name}.store(i, v{inp[0]});"))
if not aligned and op.attrs["layout"] == "TS":
mask_str = ", mask"
else:
mask_str = ""
scope.scope.append(_CppSingleLine(scope, f"buf_{name}.store(i, v{inp[0]}{mask_str});"))
scope.scope.append(_CppSingleLine(scope, f"auto v{idx} = v{inp[0]};"))
elif isinstance(op, WindowedTempOutput):
scope.scope.append(_CppSingleLine(scope, f"temp_{idx}.store(i, v{inp[0]});"))
Expand Down Expand Up @@ -226,5 +236,5 @@ def codegen_cpp(f: Function, input_name_to_idx: Dict[str, int], inputs: List[Tup
elif isinstance(op, Select):
scope.scope.append(_CppSingleLine(scope, f"auto v{idx} = Select(v{inp[0]}, v{inp[1]}, v{inp[2]});"))
else:
raise RuntimeError("Cannot generate " + str(op))
raise RuntimeError(f"Cannot generate {op} of function {f}")
return header + str(toplevel)
2 changes: 1 addition & 1 deletion Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ g++=11.4.0
* x86-64 CPU with at least AVX2-FMA instruction set
* Optionally requires AVX512 on CPU for better performance

**Important node**: Currently KunQuant only supports a multiple of `{blocking_len}` as the number of stocks as inputs. For single-precision float type and AVX2 instruction set, `blocking_len=8`. That is, you can only input 8, 16, 24, ..., etc. stocks in a batch, if your code is compiled with AVX2 (without AVX512) and `float` datatype.
**Important node**: For better performance compared with Pandas, KunQuant suggests to use a multiple of `{blocking_len}` as the number of stocks in inputs. For single-precision float type and AVX2 instruction set, `blocking_len=8`. That is, you are suggested to input 8, 16, 24, ..., etc. stocks in a batch, if your code is compiled with AVX2 (without AVX512) and `float` datatype. Other numbers of stocks **are supported**, with lower execution performance.

## Compiling and running Alpha101

Expand Down
1 change: 1 addition & 0 deletions cpp/Kun/Module.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ struct Module {
MemoryLayout output_layout;
size_t blocking_len;
Datatype dtype;
size_t aligned;
};

struct Library {
Expand Down
42 changes: 38 additions & 4 deletions cpp/Kun/Ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ struct InputTS : DataSource<true> {
}

simd_t step(size_t index) { return simd_t::load(getPtr(index)); }
simd_t step(size_t index, const typename simd_t::Masktype &mask) {
return simd_t::masked_load(getPtr(index), mask);
}

simd_t getWindow(size_t index, size_t offset) {
if (index < offset) {
Expand Down Expand Up @@ -97,7 +100,6 @@ struct OutputSTs : DataSource<true> {
void store(size_t index, const simd_t &v) {
simd_t::store(v, &buf[index * stride]);
}

simd_t getWindow(size_t index, size_t offset) {
if (index < offset) {
return NAN;
Expand Down Expand Up @@ -133,6 +135,10 @@ struct OutputTS : DataSource<true> {
void store(size_t index, const simd_t &v) {
simd_t::store(v, getPtr(index));
}
void store(size_t index, const simd_t &v,
const typename simd_t::Masktype &mask) {
simd_t::masked_store(v, getPtr(index), mask);
}

simd_t getWindow(size_t index, size_t offset) {
if (index < offset) {
Expand Down Expand Up @@ -256,6 +262,27 @@ struct RequireWindow {
using kun_simd::sc_isnan;
using kun_simd::sc_select;

template <typename T, int stride>
INLINE kun_simd::vec<T, stride>
kahanAdd(typename kun_simd::vec<T, stride>::Masktype isnan_small,
kun_simd::vec<T, stride> sum, kun_simd::vec<T, stride> small,
kun_simd::vec<T, stride> &compensation) {
auto y = small - compensation;
auto t = sum + y;
compensation = sc_select(isnan_small, compensation, t - sum - y);
return t;
}

template <typename T, int stride>
INLINE kun_simd::vec<T, stride>
kahanAdd(kun_simd::vec<T, stride> sum, kun_simd::vec<T, stride> small,
kun_simd::vec<T, stride> &compensation) {
auto y = small - compensation;
auto t = sum + y;
compensation = t - sum - y;
return t;
}

template <typename T, int stride, int window>
struct FastWindowedSum {
using simd_t = kun_simd::vec<T, stride>;
Expand All @@ -264,6 +291,8 @@ struct FastWindowedSum {
using int_mask_t = typename simd_int_t::Masktype;
using float_mask_t = typename simd_t::Masktype;
simd_t v = 0;
simd_t compensationAdd = 0;
simd_t compensationSub = 0;
simd_int_t num_nans = window;
template <typename TInput>
simd_t step(TInput &input, simd_t cur, size_t index) {
Expand All @@ -272,9 +301,11 @@ struct FastWindowedSum {
auto old_is_nan = sc_isnan(old);
auto new_is_nan = sc_isnan(cur);
// v = old_is_nan? v : (v-old)
v = sc_select(old_is_nan, v, v - old);
v = sc_select(old_is_nan, v,
kahanAdd(old_is_nan, v, 0 - old, compensationSub));
// v = new_is_nan? v : (v+cur)
v = sc_select(new_is_nan, v, v + cur);
v = sc_select(new_is_nan, v,
kahanAdd(new_is_nan, v, cur, compensationAdd));
num_nans =
num_nans - sc_select(kun_simd::bitcast<int_mask_t>(old_is_nan),
simd_int_t{1}, simd_int_t{0});
Expand Down Expand Up @@ -384,7 +415,10 @@ template <typename T, int stride>
struct ReduceAdd {
using simd_t = kun_simd::vec<T, stride>;
simd_t v = 0;
void step(simd_t input, size_t index) { v = v + input; }
simd_t compensation = 0;
void step(simd_t input, size_t index) {
v = kahanAdd(v, input, compensation);
}
operator simd_t() { return v; }
};

Expand Down
4 changes: 3 additions & 1 deletion cpp/Kun/Rank.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ void KUN_TEMPLATE_EXPORT RankStocks(RuntimeStage *stage, size_t time_idx,
auto pos = std::equal_range(data.begin(), data.end(), in);
auto start = pos.first - data.begin();
auto end = pos.second - data.begin();
out = ((start + end - 1) / 2.0f + 1.0f) / data.size();
auto sum = (start + end + 1) * (end - start) / 2;
out = T(sum) / T(end - start) / T(data.size());
// out = ((start + end - 1) / T{2.0} + T{1.0}) / data.size();
} else {
out = NAN;
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/Kun/Runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ void checkedDealloc(void *ptr, size_t sz) {
#endif

namespace kun {
static const uint64_t VERSION = 0x64100002;
static const uint64_t VERSION = 0x64100003;

void Buffer::alloc(size_t count, size_t use_count, size_t elem_size) {
if (!ptr) {
Expand Down Expand Up @@ -195,7 +195,7 @@ void runGraph(std::shared_ptr<Executor> exec, const Module *m,
Context ctx{std::move(rtlbuffers),
{},
exec,
num_stocks * length,
divideAndCeil(num_stocks, m->blocking_len) * m->blocking_len * length,
num_stocks,
total_time,
cur_time,
Expand Down
19 changes: 13 additions & 6 deletions cpp/KunSIMD/cpu/cast.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#include "f32x16.hpp"
#include "f32x8.hpp"
#include "f64x4.hpp"
#include "f64x8.hpp"
#include "s32x16.hpp"
#include "s32x8.hpp"
#include "s64x4.hpp"
#include "s32x16.hpp"
#include "f32x16.hpp"
#include "f64x8.hpp"
#include "s64x8.hpp"

namespace kun_simd {
Expand Down Expand Up @@ -64,7 +64,6 @@ INLINE vec_s64x8 bitcast(vec_f64x8 v) {
}
#endif


template <typename T1, typename T2>
inline T1 bitcast(T2 v) {
static_assert(sizeof(T1) == sizeof(T2), "unmatched bitcast");
Expand Down Expand Up @@ -128,7 +127,6 @@ INLINE vec_s32x16 fast_cast(vec_f32x16 v) {

/////// end of f32 <==> s32


/////// start of f64x4 <==> s64x4
#if !defined(__AVX512DQ__) || !defined(__AVX512VL__)

Expand Down Expand Up @@ -192,7 +190,6 @@ INLINE vec_f64x4 fast_cast(vec_s64x4 v) {

/////// end of f64x4 <==> s64x4


/////// end of f64x8 <==> s64x8 with AVX512DQ
#if defined(__AVX512F__) && defined(__AVX512DQ__)
template <>
Expand Down Expand Up @@ -250,4 +247,14 @@ INLINE vec_s64x8 cast(vec_f64x8 v) {
return _mm512_add_epi64(_mm512_slli_epi64(v_hi.v, 32), v_lo);
}
#endif

INLINE vec_f32x8::Masktype vec_f32x8::make_mask(int N) {
return bitcast<vec_f32x8::Masktype>(vec_s32x8{N} >
vec_s32x8{0, 1, 2, 3, 4, 5, 6, 7});
}

INLINE vec_f64x4::Masktype vec_f64x4::make_mask(int N) {
return bitcast<vec_f64x4::Masktype>(vec_s64x4{N} > vec_s64x4{0, 1, 2, 3});
}

} // namespace kun_simd
9 changes: 9 additions & 0 deletions cpp/KunSIMD/cpu/f32x16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,16 @@ struct alignas(64) vec<float, 16> {
static INLINE void store_aligned(vec v, float *p) {
_mm512_store_ps(p, v.v);
}
static INLINE vec masked_load(const float *p, Masktype mask) {
return _mm512_mask_loadu_ps(vec{0}, mask, p);
}
static INLINE void masked_store(vec v, float *p, Masktype mask) {
_mm512_mask_storeu_ps(p, mask, v.v);
}

static INLINE Masktype make_mask(int N) {
return (Masktype(1) << Masktype(N)) - Masktype(1);
}
operator __m512() const { return v; }
};

Expand Down
12 changes: 10 additions & 2 deletions cpp/KunSIMD/cpu/f32x8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ struct alignas(32) vec<float, 8> {
static INLINE void store_aligned(vec v, float *p) {
_mm256_store_ps(p, v.v);
}
static INLINE vec masked_load(const float *p, Masktype mask) {
return _mm256_maskload_ps(p, _mm256_castps_si256(mask));
}
static INLINE void masked_store(vec v, float *p, Masktype mask) {
_mm256_maskstore_ps(p, _mm256_castps_si256(mask), v.v);
}

static Masktype make_mask(int N);

operator __m256() const { return v; }
};
Expand Down Expand Up @@ -149,11 +157,11 @@ INLINE vec_f32x8 sc_abs(vec_f32x8 const &a) {
return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a.v);
}

inline vec_f32x8 sc_isnan(vec_f32x8 v1, vec_f32x8 v2) {
INLINE vec_f32x8 sc_isnan(vec_f32x8 v1, vec_f32x8 v2) {
return _mm256_cmp_ps(v1.v, v2.v, _CMP_UNORD_Q);
}

inline vec_f32x8 sc_isnan(vec_f32x8 v1) {
INLINE vec_f32x8 sc_isnan(vec_f32x8 v1) {
return _mm256_cmp_ps(v1.v, v1.v, _CMP_UNORD_Q);
}

Expand Down
7 changes: 7 additions & 0 deletions cpp/KunSIMD/cpu/f64x4.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,14 @@ struct alignas(32) vec<double, 4> {
static INLINE void store_aligned(vec v, double *p) {
_mm256_store_pd(p, v.v);
}
static INLINE vec masked_load(const double *p, Masktype mask) {
return _mm256_maskload_pd(p, _mm256_castpd_si256(mask));
}
static INLINE void masked_store(vec v, double *p, Masktype mask) {
_mm256_maskstore_pd(p, _mm256_castpd_si256(mask), v.v);
}

static Masktype make_mask(int N);
operator __m256d() const { return v; }
};

Expand Down
9 changes: 9 additions & 0 deletions cpp/KunSIMD/cpu/f64x8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,16 @@ struct alignas(64) vec<double, 8> {
static INLINE void store_aligned(vec v, double *p) {
_mm512_store_pd(p, v.v);
}
static INLINE vec masked_load(const double *p, Masktype mask) {
return _mm512_mask_loadu_pd(vec{0}, mask, p);
}
static INLINE void masked_store(vec v, double *p, Masktype mask) {
_mm512_mask_storeu_pd(p, mask, v.v);
}

static INLINE Masktype make_mask(int N) {
return (Masktype(1) << Masktype(N)) - Masktype(1);
}
operator __m512d() const { return v; }
};

Expand Down
Loading

0 comments on commit 87d4e08

Please sign in to comment.