Fix fp64 test, add benchmark result (#24)
Menooker authored Aug 23, 2024
1 parent c35f0f2 commit a73c1b6
Showing 4 changed files with 79 additions and 31 deletions.
9 changes: 6 additions & 3 deletions Readme.md
@@ -15,9 +15,10 @@ A typical workload of designing and running financial factors with KunQuant will

Experiments show that KunQuant-generated code can be more than 170x faster than a naive Pandas-based implementation. We ran Alpha001~Alpha101 with [Pandas-based code](https://github.com/yli188/WorldQuant_alpha101_code/blob/master/101Alpha_code_1.py) and our optimized code. See the results below:

| Pandas-based | KunQuant 1-thread | KunQuant 4-threads |
|---|---|---|
| 6.138s | 0.115s | 0.035s |
| Datatype | Pandas-based | KunQuant 1-thread | KunQuant 4-threads |
|---|---|---|---|
| Single precision (STs layout) | 6.138s | 0.083s | 0.027s |
| Double precision (TS layout) | 6.332s | 0.120s | 0.031s |

The data was collected on a 4-core Intel i7-7700HQ CPU, running synthetic data of 64 stocks with 260 rows per stock, using the data types listed in the table above. Environment:
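
For reference, a rough sketch of how such a timing run can be put together through the Python binding is shown below. It is an illustration only, not the project's benchmark script: it assumes a factor module `modu` has already been compiled and loaded with the KunQuant toolchain, that `my_input` is a dict mapping input names to numpy buffers in the layout the module expects, and that the `KunRunner` extension is importable under that name (the real import path may differ by installation).

```python
import time
import KunRunner as kr  # assumed import path; adjust to your installation

def time_module(modu, my_input, num_time, executor, repeat=20):
    """Warm up once, then report the average wall time of `repeat` runs."""
    kr.runGraph(executor, modu, my_input, 0, num_time)      # warm-up run
    start = time.time()
    for _ in range(repeat):
        kr.runGraph(executor, modu, my_input, 0, num_time)
    return (time.time() - start) / repeat

# Compare single-threaded and 4-thread executors on the same inputs, e.g.:
# print(time_module(modu, my_input, 260, kr.createSingleThreadExecutor()))
# print(time_module(modu, my_input, 260, kr.createMultiThreadExecutor(4)))
```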

@@ -259,6 +260,8 @@ There are some other CPU instruction sets that are optional for KunQuant. You can

To see whether your CPU supports AVX512 (and `AVX512DQ` and `AVX512VL`), you can run the `lscpu` command on Linux and check its output.

Enabling AVX512 slightly improves performance if it is supported by the CPU. Experiments show only a ~1% performance gain from AVX512 with 16 threads on Ice Lake, tested on double-precision Alpha101 with 128 stocks and a time length of 12000. A single thread running the same task shows a 5% performance gain with AVX512.
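
The same flags can also be checked programmatically. Below is a small sketch (Linux only, assuming the usual `/proc/cpuinfo` format) that looks for the lower-case flag names reported by the kernel; it is equivalent to searching the `lscpu` output by hand.

```python
# Check for AVX512 support on Linux by scanning /proc/cpuinfo.
# The kernel reports lower-case flag names: avx512f, avx512dq, avx512vl.
def has_avx512(path="/proc/cpuinfo"):
    wanted = {"avx512f", "avx512dq", "avx512vl"}
    with open(path) as f:
        for line in f:
            if line.startswith("flags"):
                return wanted <= set(line.split(":", 1)[1].split())
    return False

print(has_avx512())
```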

## Operator definitions

See [Operators.md](./Operators.md)
41 changes: 25 additions & 16 deletions cpp/Python/PyBinding.cpp
@@ -91,7 +91,7 @@ PYBIND11_MODULE(KunRunner, m) {
"runGraph",
[](std::shared_ptr<kun::Executor> exec, const kun::Module *mod,
const py::dict inputs, size_t cur_time, size_t length,
const py::object outputs) {
const py::object outputs, bool skip_check) {
std::unordered_map<std::string, float *> bufs;
py::ssize_t known_S = 0;
py::ssize_t known_T = 0;
@@ -100,6 +100,23 @@
auto name = py::cast<std::string>(kv.first);
auto buf_obj = py::cast<py::buffer>(kv.second);
auto info = buf_obj.request();
bufs[name] = (float *)info.ptr;
if (skip_check) {
if (known_S == 0) {
if (mod->input_layout == kun::MemoryLayout::STs) {
auto S = info.shape[0];
auto T = info.shape[1];
known_S = S;
known_T = T;
} else if (mod->input_layout == kun::MemoryLayout::TS) {
auto S = info.shape[1];
auto T = info.shape[0];
known_S = S / simd_len;
known_T = T;
}
}
continue;
}
if (mod->dtype == kun::Datatype::Float) {
if (info.format != py::format_descriptor<float>::format())
throw std::runtime_error("Expecting float buffer at " +
@@ -139,7 +156,6 @@
} else {
throw std::runtime_error("Unknown layout at " + name);
}
bufs[name] = (float *)info.ptr;
}
if ((py::ssize_t)length > known_T) {
throw std::runtime_error("Bad parameter: length");
@@ -157,20 +173,12 @@
py::array outbuffer;
if (!outputs.is_none() && outputs.contains(buf.name)) {
py::array v;
if (mod->dtype == kun::Datatype::Float) {
outbuffer =
outputs[buf.name]
.cast<py::array_t<float,
py::array::c_style>>();
} else {
outbuffer =
outputs[buf.name]
.cast<py::array_t<double,
py::array::c_style>>();
outbuffer = outputs[buf.name].cast<py::buffer>();
auto info = outbuffer.request(true);
if (!skip_check) {
expectContiguousShape(mod->dtype, info, buf.name,
*expected_out_shape);
}
auto info = outbuffer.request();
expectContiguousShape(mod->dtype, info, buf.name,
*expected_out_shape);
bufs[buf.name] = (float *)info.ptr;
} else {
if (mod->dtype == kun::Datatype::Float) {
@@ -190,7 +198,8 @@
return ret;
},
py::arg("exec"), py::arg("mod"), py::arg("inputs"), py::arg("cur_time"),
py::arg("length"), py::arg("outputs") = py::dict());
py::arg("length"), py::arg("outputs") = py::dict(),
py::arg("skip_check") = false);

py::class_<kun::StreamContext>(m, "StreamContext")
.def(py::init<std::shared_ptr<kun::Executor>, const kun::Module *,
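The net effect on the Python side: `runGraph` gains an optional `skip_check` argument, defaulting to `False`, which bypasses the per-buffer dtype/layout/shape validation and instead infers the stock and time dimensions from the first input buffer. A hedged usage sketch, with variable names borrowed from the tests in this commit, looks like:

```python
# First call with full validation (skip_check defaults to False).
out = kr.runGraph(executor, modu, my_input, start_time, num_time - start_time, outbuffers)

# Later calls on the same, already-validated buffers can skip the checks.
# For the STs layout the first two dimensions of an input are read as
# [stock blocks, time]; for the TS layout as [time, stocks] (stocks / simd_len blocks).
out = kr.runGraph(executor, modu, my_input, start_time, num_time - start_time, outbuffers,
                  skip_check=True)
```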
19 changes: 19 additions & 0 deletions tests/KunTestUtil/gen_data.py
@@ -1,5 +1,24 @@
import numpy as np

def aligned_copy(v: np.ndarray) -> np.ndarray:
    # Return a copy of v whose data pointer is aligned to a 64-byte boundary.
    alloc_size = v.nbytes
    pointer = v.__array_interface__['data'][0]
    rem = pointer % 64
    if rem == 0:
        # Already aligned; no copy needed.
        return v
    # Over-allocate a raw byte buffer so an aligned window of alloc_size bytes fits.
    newbase = np.empty((alloc_size + 64, ), dtype=np.uint8)
    pointer = newbase.__array_interface__['data'][0]
    rem = pointer % 64
    if rem == 0:
        offset = 0
    else:
        offset = 64 - rem
    # Reinterpret the aligned slice with the original dtype/shape and copy the data in.
    ret = newbase[offset:offset+alloc_size].view(v.dtype).reshape(v.shape)
    ret[:] = v
    return ret



def gen_stock_data2(low, high, stocks, num_time, stddev, dtype):
    xopen = np.random.uniform(low, high, size = (stocks, 1)).astype(dtype)
    # xvol = np.random.uniform(5, 5.2, size = (stocks, 1)).astype(dtype)
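`aligned_copy` above appears intended to hand 64-byte-aligned buffers to the runtime. A quick, illustrative sanity check of that guarantee (not part of the test suite; the import path is a guess) could be:

```python
import numpy as np
from gen_data import aligned_copy  # import path depends on how the tests are run

x = np.random.rand(64, 260)
y = aligned_copy(x)
assert y.__array_interface__['data'][0] % 64 == 0   # data pointer is 64-byte aligned
assert y.shape == x.shape and np.array_equal(x, y)  # shape and contents preserved
```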
41 changes: 29 additions & 12 deletions tests/test_alpha101.py
@@ -234,10 +234,19 @@ def test(modu, executor, start_window, num_stock, num_time, my_input, ref, ische
    # print(ref.alpha001())
    # blocked = TS_STs(inp)

    start = time.time()
    out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
    end = time.time()
    print(f"Exec takes: {end-start:.6f} seconds")
    if not ischeck:
        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
        start = time.time()
        for _ in range(20):
            out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, skip_check = True)
        end = time.time()
        tdiff = (end-start)/20
    else:
        start = time.time()
        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
        end = time.time()
        tdiff = end-start
    print(f"Exec takes: {tdiff:.6f} seconds")
    if not ischeck:
        return True
    # print(out)
@@ -323,11 +332,19 @@ def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, isc
        outbuffers[name] = sharedbuf[idx]
    # print(ref.alpha001())
    # blocked = TS_STs(inp)

    start = time.time()
    out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
    end = time.time()
    print(f"Exec takes: {end-start:.6f} seconds")
    if not ischeck:
        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
        start = time.time()
        for _ in range(20):
            out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, skip_check = True)
        end = time.time()
        tdiff = (end-start)/20
    else:
        start = time.time()
        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
        end = time.time()
        tdiff = end-start
    print(f"Exec takes: {tdiff:.6f} seconds")
    if not ischeck:
        return True
    # print(out)
@@ -348,10 +365,10 @@ def main64():
    my_input, pd_ref = make_data_and_ref(num_stock, num_time, is_check, 0, "float64")
    executor = kr.createSingleThreadExecutor()
    done = True
    done = done & test(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
    done = done & test(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
    done = done & test64(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
    done = done & test64(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
    executor = kr.createMultiThreadExecutor(4)
    done = done & test(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
    done = done & test64(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
    print("OK", done)
    if not done:
        exit(1)
