[Performance] Dynamic cpu kernel V3 for SpMMSumCsr all Ops (dmlc#2309)

* support AVX512 * env DGL_CPU_INTEL_KERNEL_ENABLED=1 * env DGL_CPU_INTEL_KERNEL_LOG=1 * Add unittest test_spmm.cc Co-authored-by: Izabela Mazur <[email protected]> Co-authored-by: Michal Szarmach <[email protected]> Review patch
feixian15 · Nov 17, 2020 · f8ebcd7 · f8ebcd7
1 parent 62b4bbb
commit f8ebcd7
Show file tree

Hide file tree

Showing 9 changed files with 1,003 additions and 212 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -23,3 +23,6 @@
 [submodule "third_party/thrust"]
 	path = third_party/thrust
 	url = https://github.com/NVIDIA/thrust.git
+[submodule "third_party/xbyak"]
+	path = third_party/xbyak
+	url = https://github.com/herumi/xbyak
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -59,6 +59,7 @@ include_directories("third_party/dmlc-core/include")
 include_directories("third_party/minigun/minigun")
 include_directories("third_party/minigun/third_party/moderngpu/src")
 include_directories("third_party/phmap/")
+include_directories("third_party/xbyak/")
 
 # initial variables
 set(DGL_LINKER_LIBS "")

diff --git a/docs/source/env_var.rst b/docs/source/env_var.rst
@@ -22,3 +22,15 @@ Data Repository
 * ``DGL_DOWNLOAD_DIR``:
     * Values: String (default="${HOME}/.dgl")
     * The local directory to cache the downloaded data.
+
+Intel CPU Performance Options
+---------------
+* ``DGL_CPU_INTEL_KERNEL_ENABLED``:
+    * Values: int (default='0')
+    * Use dynamic cpu kernels.
+    * Suggested values: 1
+
+* ``DGL_CPU_INTEL_KERNEL_LOG``:
+    * Values: int (default='0')
+    * Show diagnostic message (debug mode).
+    * Suggested values: 1
diff --git a/include/intel/cpu_support.h b/include/intel/cpu_support.h
@@ -0,0 +1,332 @@
+/*!
+ *  Copyright (c) 2019 by Contributors
+ * \file intel/cpu_support.h
+ * \brief Intel CPU support
+ * \author Pawel Piotrowicz <[email protected]>
+ */
+#ifndef INTEL_CPU_SUPPORT_H_
+#define INTEL_CPU_SUPPORT_H_
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include "dmlc/logging.h"
+#include "meta_utils.h"
+#include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
+
+namespace dgl {
+
+typedef std::tuple<float, double> supported_types;
+
+#ifndef log_intel
+#define log_intel(x)                   \
+  if (IntelKernel<>::IsLogEnabled()) { \
+    LOG(INFO) << x;                    \
+  }
+#endif
+
+static inline Xbyak::Zmm make_zmm(const Xbyak::Xmm &v) {
+  return Xbyak::Zmm(v.getIdx());
+}
+template <int version = 0>
+struct IntelKernel {
+  static int64_t GetValue() {
+    int64_t v = 0;
+    const char *label = "DGL_CPU_INTEL_KERNEL_ENABLED";
+    const char *ptr = std::getenv(label);
+    if (ptr) {
+      v = atoll(ptr);
+      log_intel(label << "=>" << v);
+    }
+    return v;
+  }
+
+  static int64_t IsEnabled() {
+    static int64_t r = IntelKernel<version>::GetValue();
+    return r;
+  }
+
+  static int IsLogEnabled() {
+    static int r = (std::getenv("DGL_CPU_INTEL_KERNEL_LOG")) ? 1 : 0;
+    return r;
+  }
+};
+
+/*!
+ * \brief Element-wise addition kernel using Intel AVX512 instructions.
+ * \note it uses AVX512.
+ */
+template <class Op>
+class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
+ public:
+  typedef typename Op::type DType;
+  static_assert(
+    std::is_base_of<std::true_type,
+                    utils::has_type<DType, supported_types>>::value,
+    "Use case fail dgl::ElemWiseAddUpdate< Operator<DType> > DType is not "
+    "supported !");
+
+ protected:
+  const Xbyak::Reg64 &r_out_;
+  const Xbyak::Reg64 &r_left_;
+  const Xbyak::Reg64 &r_right;
+  const Xbyak::Reg64 &r_size_;
+
+  /* [functional] Does kernel is applicable on this machine ? */
+  bool applicable_;
+
+ public:
+  static constexpr int UNIT_SIZE_BYTES = sizeof(DType);
+  static constexpr int BITS_IN_BYTES = 8;
+  static constexpr int REG_BIT_SIZE = 512;
+  static constexpr int UNIT_PER_REG =
+    REG_BIT_SIZE / (UNIT_SIZE_BYTES * BITS_IN_BYTES);
+
+  template <class TType, class R1, class R2,
+            utils::CheckCmp<TType, float> = true>
+  void alias_load(R1 r1, R2 r2) {
+    vmovups(r1, r2);
+  }
+  template <class TType, class R1, class R2,
+            utils::CheckCmp<TType, double> = true>
+  void alias_load(R1 r1, R2 r2) {
+    vmovupd(r1, r2);
+  }
+
+  template <class TType, class R1, class R2,
+            utils::CheckCmp<TType, float> = true>
+  void alias_save(R1 r1, R2 r2) {
+    alias_load<TType>(r1, r2);
+  }
+  template <class TType, class R1, class R2,
+            utils::CheckCmp<TType, double> = true>
+  void alias_save(R1 r1, R2 r2) {
+    alias_load<TType>(r1, r2);
+  }
+
+  template <class TType, class R1, class R2, class R3,
+            utils::CheckCmp<TType, float> = true>
+  void alias_ADD(R1 r1, R2 r2, R3 r3) {
+    vaddps(r1, r2, r3);
+  }
+  template <class TType, class R1, class R2, class R3,
+            utils::CheckCmp<TType, double> = true>
+  void alias_ADD(R1 r1, R2 r2, R3 r3) {
+    vaddpd(r1, r2, r3);
+  }
+
+  template <class TType, class R1, class R2, class R3,
+            utils::CheckCmp<TType, float> = true>
+  void alias_SUB(R1 r1, R2 r2, R3 r3) {
+    vsubps(r1, r2, r3);
+  }
+  template <class TType, class R1, class R2, class R3,
+            utils::CheckCmp<TType, double> = true>
+  void alias_SUB(R1 r1, R2 r2, R3 r3) {
+    vsubpd(r1, r2, r3);
+  }
+
+  template <class TType, class R1, class R2, class R3,
+            utils::CheckCmp<TType, float> = true>
+  void alias_DIV(R1 r1, R2 r2, R3 r3) {
+    vdivps(r1, r2, r3);
+  }
+  template <class TType, class R1, class R2, class R3,
+            utils::CheckCmp<TType, double> = true>
+  void alias_DIV(R1 r1, R2 r2, R3 r3) {
+    vdivpd(r1, r2, r3);
+  }
+
+  template <class TType, class R1, class R2, class R3,
+            utils::CheckCmp<TType, float> = true>
+  void alias_MUL(R1 r1, R2 r2, R3 r3) {
+    vmulps(r1, r2, r3);
+  }
+  template <class TType, class R1, class R2, class R3,
+            utils::CheckCmp<TType, double> = true>
+  void alias_MUL(R1 r1, R2 r2, R3 r3) {
+    vmulpd(r1, r2, r3);
+  }
+
+  template <class Operator,
+            utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
+                          supported_types> = true>
+  void full_chunk_loop_operations() {
+    typedef typename Operator::type IType;
+    alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
+    alias_load<IType>(zmm1, ptr[r_left_ + r9 * sizeof(IType)]);
+    alias_ADD<IType>(zmm2, zmm0, zmm1);
+    alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
+  }
+  template <class Operator,
+            utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
+                          supported_types> = true>
+  void full_chunk_loop_operations() {
+    typedef typename Operator::type IType;
+    alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
+    alias_load<IType>(zmm1, ptr[r_right + r9 * sizeof(IType)]);
+    alias_ADD<IType>(zmm2, zmm0, zmm1);
+    alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
+  }
+  template <class T>
+  void loop_pre() {
+    alias_load<T>(zmm0, ptr[r_out_ + r9 * sizeof(T)]);
+    alias_load<T>(zmm1, ptr[r_left_ + r9 * sizeof(T)]);
+    alias_load<T>(zmm2, ptr[r_right + r9 * sizeof(T)]);
+  }
+  template <class T>
+  void loop_post() {
+    alias_ADD<T>(zmm2, zmm0, zmm2);
+    alias_save<T>(ptr[r_out_ + r9 * sizeof(T)], zmm2);
+  }
+  template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
+                                          supported_types> = true>
+  void full_chunk_loop_operations() {
+    typedef typename Operator::type IType;
+    loop_pre<IType>();
+    alias_ADD<IType>(zmm2, zmm1, zmm2);
+    loop_post<IType>();
+  }
+  template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
+                                          supported_types> = true>
+  void full_chunk_loop_operations() {
+    typedef typename Operator::type IType;
+    loop_pre<IType>();
+    alias_SUB<IType>(zmm2, zmm1, zmm2);
+    loop_post<IType>();
+  }
+
+  template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
+                                          supported_types> = true>
+  void full_chunk_loop_operations() {
+    typedef typename Operator::type IType;
+    loop_pre<IType>();
+    alias_DIV<IType>(zmm2, zmm1, zmm2);
+    loop_post<IType>();
+  }
+
+  template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
+                                          supported_types> = true>
+  void full_chunk_loop_operations() {
+    typedef typename Operator::type IType;
+    loop_pre<IType>();
+    alias_MUL<IType>(zmm2, zmm1, zmm2);
+    loop_post<IType>();
+  }
+
+  template <class Operator,
+            utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
+                          supported_types> = true>
+  void remainder_operations(const Xbyak::Opmask mask) {
+    typedef typename Operator::type IType;
+    alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(IType)]);
+  }
+
+  template <class Operator,
+            utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
+                          supported_types> = true>
+  void remainder_operations(const Xbyak::Opmask mask) {
+    typedef typename Operator::type IType;
+    alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_right + r9 * sizeof(IType)]);
+  }
+
+  template <class T>
+  void remainder_fetch_LR(const Xbyak::Opmask mask) {
+    alias_load<T>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(T)]);
+    alias_load<T>(make_zmm(zmm1) | mask, ptr[r_right + r9 * sizeof(T)]);
+  }
+
+  template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
+                                          supported_types> = true>
+  void remainder_operations(const Xbyak::Opmask mask) {
+    typedef typename Operator::type IType;
+    remainder_fetch_LR<IType>(mask);
+    alias_MUL<IType>(zmm2, zmm2, zmm1);
+  }
+
+  template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
+                                          supported_types> = true>
+  void remainder_operations(const Xbyak::Opmask mask) {
+    typedef typename Operator::type IType;
+    remainder_fetch_LR<IType>(mask);
+    alias_ADD<DType>(zmm2, zmm2, zmm1);
+  }
+
+  template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
+                                          supported_types> = true>
+  void remainder_operations(const Xbyak::Opmask mask) {
+    typedef typename Operator::type IType;
+    remainder_fetch_LR<IType>(mask);
+    alias_DIV<DType>(zmm2, zmm2, zmm1);
+  }
+
+  template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
+                                          supported_types> = true>
+  void remainder_operations(const Xbyak::Opmask mask) {
+    typedef typename Operator::type IType;
+    remainder_fetch_LR<IType>(mask);
+    alias_SUB<DType>(zmm2, zmm2, zmm1);
+  }
+
+  ElemWiseAddUpdate()
+      : r_out_(rdi),
+        r_left_(rsi),
+        r_right(rdx),
+        r_size_(rcx),
+        applicable_(false) {
+    static Xbyak::util::Cpu current_cpu;
+
+    /* Default case for all */
+    if (current_cpu.has(Xbyak::util::Cpu::tAVX512F)) {
+      /* prepare REMAINDER */
+      mov(r8, r_size_);
+      and_(r8,
+           UNIT_PER_REG - 1);  // r8_modulo = size/(sizeof(zmm)/sizeof(float))
+      xor_(r9, r9);            // reset r9
+      cmp(r_size_, UNIT_PER_REG);  // if ( size < 16 ) {  }
+      jl("remainder");
+
+      /*  decrease  divident */
+      sub(r_size_, r8);  // prepare alignment chunks
+      cmp(r_size_, 0);   // do we have any full chunks ?
+      jz("remainder");
+
+      L("for_i");
+      full_chunk_loop_operations<Op>();
+      add(r9, UNIT_PER_REG);  // r9+=sizeof(zmm)/sizeof(float)
+      cmp(r_size_, r9);       // more full chunks ?
+      jnz("for_i");
+
+      L("remainder");
+      cmp(r8, 0);  //  do we have a remainder ?
+      jz("done");
+      /* prepare a bitmask for k1 */
+      mov(rax, 1);
+      mov(r_size_, r8);
+      sal(rax, cl);
+      dec(rax);        // k1= (1 << r8 )-1
+      kmovw(k1, eax);  // set bitmask
+      alias_load<DType>(make_zmm(zmm0) | k1,
+                        ptr[r_out_ + r9 * UNIT_SIZE_BYTES]);
+      remainder_operations<Op>(k1);
+      alias_ADD<DType>(zmm3, zmm2, zmm0);
+      alias_save<DType>(ptr[r_out_ + r9 * UNIT_SIZE_BYTES],
+                        make_zmm(zmm3) | k1);
+      L("done");
+      applicable_ = true;
+      log_intel("AVX512F cpu kernel is ready");
+    }
+    ret();
+  }
+
+  bool applicable() const { return applicable_; }
+
+  template <class... P>
+  void run(P... args) {
+    ((void (*)(P...))(this)->getCode())(args...);
+  }
+};
+
+}  // namespace dgl
+
+#endif  // INTEL_CPU_SUPPORT_H_