Fix fp64 test, add benchmark result (#24)
Menooker authored Aug 23, 2024
1 parent c35f0f2 commit a73c1b6
Showing 4 changed files with 79 additions and 31 deletions.
9 changes: 6 additions & 3 deletions Readme.md
@@ -15,9 +15,10 @@ A typical workload of designing and running financial factors with KunQuant will

Experiments show that KunQuant-generated code can be more than 170x faster than a naive Pandas-based implementation. We ran Alpha001~Alpha101 with [Pandas-based code](https://github.com/yli188/WorldQuant_alpha101_code/blob/master/101Alpha_code_1.py) and our optimized code. See the results below:

| Pandas-based | KunQuant 1-thread | KunQuant 4-threads |
|---|---|---|
| 6.138s | 0.115s | 0.035s |
| Datatype | Pandas-based | KunQuant 1-thread | KunQuant 4-threads |
|---|---|---|---|
| Single precision (STs layout) | 6.138s | 0.083s | 0.027s |
| Double precision (TS layout) | 6.332s | 0.120s | 0.031s |

The data was collected on a 4-core Intel i7-7700HQ CPU, running synthetic data of 64 stocks with 260 rows per stock, using the data types listed in the table above. Environment:
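
For reference, a rough sketch of how such a timing run can be put together through the Python binding is shown below. It is an illustration only, not the project's benchmark script: it assumes a factor module `modu` has already been compiled and loaded with the KunQuant toolchain, that `my_input` is a dict mapping input names to numpy buffers in the layout the module expects, and that the `KunRunner` extension is importable under that name (the real import path may differ by installation).

```python
import time
import KunRunner as kr  # assumed import path; adjust to your installation

def time_module(modu, my_input, num_time, executor, repeat=20):
    """Warm up once, then report the average wall time of `repeat` runs."""
    kr.runGraph(executor, modu, my_input, 0, num_time)      # warm-up run
    start = time.time()
    for _ in range(repeat):
        kr.runGraph(executor, modu, my_input, 0, num_time)
    return (time.time() - start) / repeat

# Compare single-threaded and 4-thread executors on the same inputs, e.g.:
# print(time_module(modu, my_input, 260, kr.createSingleThreadExecutor()))
# print(time_module(modu, my_input, 260, kr.createMultiThreadExecutor(4)))
```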

@@ -259,6 +260,8 @@ There are some other CPU instruction sets that are optional for KunQuant. You can

To see whether your CPU supports AVX512 (and `AVX512DQ` and `AVX512VL`), you can run the `lscpu` command on Linux and check its output.

Enabling AVX512 slightly improves performance if it is supported by the CPU. Experiments show only a ~1% performance gain from AVX512 with 16 threads on Ice Lake, tested on double-precision Alpha101 with 128 stocks and a time length of 12000. A single thread running the same task shows a 5% performance gain with AVX512.
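
The same flags can also be checked programmatically. Below is a small sketch (Linux only, assuming the usual `/proc/cpuinfo` format) that looks for the lower-case flag names reported by the kernel; it is equivalent to searching the `lscpu` output by hand.

```python
# Check for AVX512 support on Linux by scanning /proc/cpuinfo.
# The kernel reports lower-case flag names: avx512f, avx512dq, avx512vl.
def has_avx512(path="/proc/cpuinfo"):
    wanted = {"avx512f", "avx512dq", "avx512vl"}
    with open(path) as f:
        for line in f:
            if line.startswith("flags"):
                return wanted <= set(line.split(":", 1)[1].split())
    return False

print(has_avx512())
```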

## Operator definitions

See [Operators.md](./Operators.md)
41 changes: 25 additions & 16 deletions cpp/Python/PyBinding.cpp
@@ -91,7 +91,7 @@ PYBIND11_MODULE(KunRunner, m) {
"runGraph",
[](std::shared_ptr<kun::Executor> exec, const kun::Module *mod,
const py::dict inputs, size_t cur_time, size_t length,
const py::object outputs) {
const py::object outputs, bool skip_check) {
std::unordered_map<std::string, float *> bufs;
py::ssize_t known_S = 0;
py::ssize_t known_T = 0;
@@ -100,6 +100,23 @@
auto name = py::cast<std::string>(kv.first);
auto buf_obj = py::cast<py::buffer>(kv.second);
auto info = buf_obj.request();
bufs[name] = (float *)info.ptr;
if (skip_check) {
if (known_S == 0) {
if (mod->input_layout == kun::MemoryLayout::STs) {
auto S = info.shape[0];
auto T = info.shape[1];
known_S = S;
known_T = T;
} else if (mod->input_layout == kun::MemoryLayout::TS) {
auto S = info.shape[1];
auto T = info.shape[0];
known_S = S / simd_len;
known_T = T;
}
}
continue;
}
if (mod->dtype == kun::Datatype::Float) {
if (info.format != py::format_descriptor<float>::format())
throw std::runtime_error("Expecting float buffer at " +
@@ -139,7 +156,6 @@
} else {
throw std::runtime_error("Unknown layout at " + name);
}
bufs[name] = (float *)info.ptr;
}
if ((py::ssize_t)length > known_T) {
throw std::runtime_error("Bad parameter: length");
@@ -157,20 +173,12 @@
py::array outbuffer;
if (!outputs.is_none() && outputs.contains(buf.name)) {
py::array v;
if (mod->dtype == kun::Datatype::Float) {
outbuffer =
outputs[buf.name]
.cast<py::array_t<float,
py::array::c_style>>();
} else {
outbuffer =
outputs[buf.name]
.cast<py::array_t<double,
py::array::c_style>>();
outbuffer = outputs[buf.name].cast<py::buffer>();
auto info = outbuffer.request(true);
if (!skip_check) {
expectContiguousShape(mod->dtype, info, buf.name,
*expected_out_shape);
}
auto info = outbuffer.request();
expectContiguousShape(mod->dtype, info, buf.name,
*expected_out_shape);
bufs[buf.name] = (float *)info.ptr;
} else {
if (mod->dtype == kun::Datatype::Float) {
@@ -190,7 +198,8 @@
return ret;
},
py::arg("exec"), py::arg("mod"), py::arg("inputs"), py::arg("cur_time"),
py::arg("length"), py::arg("outputs") = py::dict());
py::arg("length"), py::arg("outputs") = py::dict(),
py::arg("skip_check") = false);

py::class_<kun::StreamContext>(m, "StreamContext")
.def(py::init<std::shared_ptr<kun::Executor>, const kun::Module *,
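The net effect on the Python side: `runGraph` gains an optional `skip_check` argument, defaulting to `False`, which bypasses the per-buffer dtype/layout/shape validation and instead infers the stock and time dimensions from the first input buffer. A hedged usage sketch, with variable names borrowed from the tests in this commit, looks like:

```python
# First call with full validation (skip_check defaults to False).
out = kr.runGraph(executor, modu, my_input, start_time, num_time - start_time, outbuffers)

# Later calls on the same, already-validated buffers can skip the checks.
# For the STs layout the first two dimensions of an input are read as
# [stock blocks, time]; for the TS layout as [time, stocks] (stocks / simd_len blocks).
out = kr.runGraph(executor, modu, my_input, start_time, num_time - start_time, outbuffers,
                  skip_check=True)
```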
19 changes: 19 additions & 0 deletions tests/KunTestUtil/gen_data.py
@@ -1,5 +1,24 @@
import numpy as np

def aligned_copy(v: np.ndarray) -> np.ndarray:
    # Return a copy of v whose data pointer is aligned to a 64-byte boundary.
    alloc_size = v.nbytes
    pointer = v.__array_interface__['data'][0]
    rem = pointer % 64
    if rem == 0:
        # Already aligned; no copy needed.
        return v
    # Over-allocate a raw byte buffer so an aligned window of alloc_size bytes fits.
    newbase = np.empty((alloc_size + 64, ), dtype=np.uint8)
    pointer = newbase.__array_interface__['data'][0]
    rem = pointer % 64
    if rem == 0:
        offset = 0
    else:
        offset = 64 - rem
    # Reinterpret the aligned slice with the original dtype/shape and copy the data in.
    ret = newbase[offset:offset+alloc_size].view(v.dtype).reshape(v.shape)
    ret[:] = v
    return ret



def gen_stock_data2(low, high, stocks, num_time, stddev, dtype):
    xopen = np.random.uniform(low, high, size = (stocks, 1)).astype(dtype)
    # xvol = np.random.uniform(5, 5.2, size = (stocks, 1)).astype(dtype)
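`aligned_copy` above appears intended to hand 64-byte-aligned buffers to the runtime. A quick, illustrative sanity check of that guarantee (not part of the test suite; the import path is a guess) could be:

```python
import numpy as np
from gen_data import aligned_copy  # import path depends on how the tests are run

x = np.random.rand(64, 260)
y = aligned_copy(x)
assert y.__array_interface__['data'][0] % 64 == 0   # data pointer is 64-byte aligned
assert y.shape == x.shape and np.array_equal(x, y)  # shape and contents preserved
```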
41 changes: 29 additions & 12 deletions tests/test_alpha101.py
@@ -234,10 +234,19 @@ def test(modu, executor, start_window, num_stock, num_time, my_input, ref, ische
    # print(ref.alpha001())
    # blocked = TS_STs(inp)

    start = time.time()
    out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
    end = time.time()
    print(f"Exec takes: {end-start:.6f} seconds")
    if not ischeck:
        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
        start = time.time()
        for _ in range(20):
            out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, skip_check = True)
        end = time.time()
        tdiff = (end-start)/20
    else:
        start = time.time()
        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
        end = time.time()
        tdiff = end-start
    print(f"Exec takes: {tdiff:.6f} seconds")
    if not ischeck:
        return True
    # print(out)
@@ -323,11 +332,19 @@ def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, isc
        outbuffers[name] = sharedbuf[idx]
    # print(ref.alpha001())
    # blocked = TS_STs(inp)

    start = time.time()
    out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
    end = time.time()
    print(f"Exec takes: {end-start:.6f} seconds")
    if not ischeck:
        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
        start = time.time()
        for _ in range(20):
            out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, skip_check = True)
        end = time.time()
        tdiff = (end-start)/20
    else:
        start = time.time()
        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
        end = time.time()
        tdiff = end-start
    print(f"Exec takes: {tdiff:.6f} seconds")
    if not ischeck:
        return True
    # print(out)
@@ -348,10 +365,10 @@ def main64():
    my_input, pd_ref = make_data_and_ref(num_stock, num_time, is_check, 0, "float64")
    executor = kr.createSingleThreadExecutor()
    done = True
    done = done & test(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
    done = done & test(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
    done = done & test64(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
    done = done & test64(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
    executor = kr.createMultiThreadExecutor(4)
    done = done & test(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
    done = done & test64(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
    print("OK", done)
    if not done:
        exit(1)
