Skip to content

Commit

Permalink
Include Eigen as BLAS replacement.
Browse files Browse the repository at this point in the history
This adds Eigen as a default matrix/vector library via a submodule.
This has a load of advantages:

* It can be used as a replacement for a cBLAS library when it is
  not available, cannot be found, or is outdated compared to the
  compiler or CPU.
* Because Eigen is header only, it significantly eases the build
  prerequisite requirements.
* The Eigen code paths are much more readable from a mathematical
  perspective.
* Eigen can optimize more heavily for known matrix sizes. The
  current code doesn't yet take advantage of this, though.

The downsides:

* Eigen might be a bit slower than other BLAS libraries. (Nevertheless,
  on my system it is faster than OpenBLAS)
* Binaries built with Eigen are optimized for the CPU they were compiled
  on and don't port as well to other CPUs. So you need separate binaries
  for wider client support.

* Default Eigen in CMake, add tests.

Default the Eigen library in CMake, as it's the fastest for most
contemporary CPUs and configurations, and the easiest to build.

We can optionally use BLAS by adding the USE_BLAS define, and
will try to locate BLAS/OpenBLAS if so. This is useful for
binaries for distribution such as our releases or distros.

Split all tests to cover both Eigen and BLAS.

Update build instructions to remove BLAS as a dependency, use CMake on
all Unixy platforms, and use HTTPS.

Pull request leela-zero#1692.
  • Loading branch information
gcp authored Sep 26, 2018
1 parent 142199c commit 72431e2
Show file tree
Hide file tree
Showing 14 changed files with 176 additions and 64 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Google Test framework, used by the unit-test targets.
[submodule "gtest"]
path = gtest
url = https://github.com/google/googletest.git
# Eigen linear-algebra library, used as the built-in BLAS replacement.
[submodule "src/Eigen"]
path = src/Eigen
url = https://github.com/eigenteam/eigen-git-mirror
11 changes: 10 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,21 @@ jobs:
script:
- docker build -f Dockerfiles/Dockerfile.gpu -t leela-zero:gpu .
- docker run leela-zero:gpu
- script:
- docker build -f Dockerfiles/Dockerfile.gpu-blas -t leela-zero:gpu-blas .
- docker run leela-zero:gpu-blas
- script:
- docker build -f Dockerfiles/Dockerfile.cpu -t leela-zero:cpu .
- docker run leela-zero:cpu
- script:
- docker build -f Dockerfiles/Dockerfile.cpu-blas -t leela-zero:cpu-blas .
- docker run leela-zero:cpu-blas
- script:
- docker build -f Dockerfiles/Dockerfile.tests -t leela-zero:tests .
- docker run leela-zero:tests
- script:
- docker build -f Dockerfiles/Dockerfile.tests-blas -t leela-zero:tests-blas .
- docker run leela-zero:tests-blas
- stage: style
before_install:
script: find . -regex ".*\.\(cpp\|h\|hpp\)" -not -regex ".*moc_.*.cpp" -not -path "./gtest/*" -not -path "./training/*" -not -path "./src/half/*" -not -path "./src/CL/*" | xargs python2 scripts/cpplint.py --filter=-build/c++11,-build/include,-build/include_order,-build/include_what_you_use,-build/namespaces,-readability/braces,-readability/casting,-readability/fn_size,-readability/namespace,-readability/todo,-runtime/explicit,-runtime/indentation_namespace,-runtime/int,-runtime/references,-whitespace/blank_line,-whitespace/braces,-whitespace/comma,-whitespace/comments,-whitespace/empty_loop_body,-whitespace/line_length,-whitespace/semicolon
script: find . -regex ".*\.\(cpp\|h\|hpp\)" -not -regex ".*moc_.*.cpp" -not -path "./gtest/*" -not -path "./training/*" -not -path "./src/half/*" -not -path "./src/CL/*" -not -path "./src/Eigen/*" | xargs python2 scripts/cpplint.py --filter=-build/c++11,-build/include,-build/include_order,-build/include_what_you_use,-build/namespaces,-readability/braces,-readability/casting,-readability/fn_size,-readability/namespace,-readability/todo,-runtime/explicit,-runtime/indentation_namespace,-runtime/int,-runtime/references,-whitespace/blank_line,-whitespace/braces,-whitespace/comma,-whitespace/comments,-whitespace/empty_loop_body,-whitespace/line_length,-whitespace/semicolon
24 changes: 15 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,20 @@ find_package(OpenCL REQUIRED)
if(NOT APPLE)
set(BLA_VENDOR OpenBLAS)
endif()
find_package(BLAS REQUIRED)
find_path(BLAS_INCLUDE_DIRS openblas_config.h
/usr/include
/usr/local/include
/usr/include/openblas
/opt/OpenBLAS/include
/usr/include/x86_64-linux-gnu
$ENV{BLAS_HOME}/include)
if(USE_BLAS)
message(STATUS "Looking for system BLAS/OpenBLAS library.")
find_package(BLAS REQUIRED)
find_path(BLAS_INCLUDE_DIRS openblas_config.h
/usr/include
/usr/local/include
/usr/include/openblas
/opt/OpenBLAS/include
/usr/include/x86_64-linux-gnu
$ENV{BLAS_HOME}/include)
add_definitions(-DUSE_BLAS)
else()
message(STATUS "Using built-in matrix library.")
endif()
find_package(Qt5Core)

set(CMAKE_CXX_STANDARD 14)
Expand Down Expand Up @@ -79,7 +85,7 @@ if(USE_HALF)
add_definitions(-DUSE_HALF)
endif()

set(IncludePath "${CMAKE_CURRENT_SOURCE_DIR}/src")
set(IncludePath "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}/src/Eigen")
set(SrcPath "${CMAKE_CURRENT_SOURCE_DIR}/src")

include_directories(${IncludePath})
Expand Down
6 changes: 6 additions & 0 deletions Dockerfiles/Dockerfile.cpu-blas
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
FROM leela-zero:base

# CPU-only build, linking against the system BLAS/OpenBLAS (-DUSE_BLAS)
# instead of the bundled Eigen library.
RUN CXX=g++ CC=gcc cmake -DUSE_CPU_ONLY=1 -DUSE_BLAS=1 ..

# Build only the leelaz engine target at container start.
CMD cmake --build . --target leelaz --config Release -- -j2
6 changes: 6 additions & 0 deletions Dockerfiles/Dockerfile.gpu-blas
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
FROM leela-zero:base

# GPU build, linking against the system BLAS/OpenBLAS (-DUSE_BLAS)
# instead of the bundled Eigen library.
RUN CXX=g++ CC=gcc cmake -DUSE_BLAS=1 ..

# Build only the leelaz engine target at container start.
CMD cmake --build . --target leelaz --config Release -- -j2
8 changes: 8 additions & 0 deletions Dockerfiles/Dockerfile.tests-blas
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM leela-zero:base

# CPU-only build against the system BLAS/OpenBLAS (-DUSE_BLAS),
# compiling the unit-test suite rather than the engine binary.
RUN CXX=g++ CC=gcc cmake -DUSE_CPU_ONLY=1 -DUSE_BLAS=1 ..
RUN cmake --build . --target tests --config Release -- -j2

# Run the unit tests on container start.
CMD ./tests

68 changes: 32 additions & 36 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ launch autogtp.

# I just want to play right now

Download the best known network weights file from: http://zero.sjeng.org/best-network
Download the best known network weights file from: https://zero.sjeng.org/best-network

And head to the [Usage](#usage) section of this README.

Expand All @@ -70,7 +70,6 @@ If you prefer a more human style, a network trained from human games is availabl

* GCC, Clang or MSVC, any C++14 compiler
* Boost 1.58.x or later, headers and program_options, filesystem and system libraries (libboost-dev, libboost-program-options-dev and libboost-filesystem-dev on Debian/Ubuntu)
* BLAS Library: OpenBLAS (libopenblas-dev) or (optionally) Intel MKL
* zlib library (zlib1g & zlib1g-dev on Debian/Ubuntu)
* Standard OpenCL C headers (opencl-headers on Debian/Ubuntu, or at
https://github.com/KhronosGroup/OpenCL-Headers/tree/master/opencl22/)
Expand All @@ -79,62 +78,61 @@ https://github.com/KhronosGroup/OpenCL-Headers/tree/master/opencl22/)
drivers is strongly recommended (OpenCL 1.1 support is enough).
If you do not have a GPU, modify config.h in the source and remove
the line that says "#define USE_OPENCL".
* Optional: BLAS Library: OpenBLAS (libopenblas-dev) or Intel MKL
* The program has been tested on Windows, Linux and macOS.

## Example of compiling and running - Ubuntu
## Example of compiling and running - Ubuntu & similar

# Test for OpenCL support & compatibility
sudo apt install clinfo && clinfo

# Clone github repo
git clone https://github.com/gcp/leela-zero
cd leela-zero/src
sudo apt install libboost-dev libboost-program-options-dev libboost-filesystem-dev libopenblas-dev opencl-headers ocl-icd-libopencl1 ocl-icd-opencl-dev zlib1g-dev
make
cd ..
wget http://zero.sjeng.org/best-network
src/leelaz --weights best-network
cd leela-zero
git submodule update --init --recursive

# Install build dependencies
sudo apt install libboost-dev libboost-program-options-dev libboost-filesystem-dev opencl-headers ocl-icd-libopencl1 ocl-icd-opencl-dev zlib1g-dev

# Use stand alone directory to keep source dir clean
mkdir build && cd build
cmake ..
cmake --build .
./tests
curl -O https://zero.sjeng.org/best-network
./leelaz --weights best-network

## Example of compiling and running - macOS

# Clone github repo
git clone https://github.com/gcp/leela-zero
cd leela-zero/src
brew install boost
make
cd ..
curl -O http://zero.sjeng.org/best-network
src/leelaz --weights best-network
cd leela-zero
git submodule update --init --recursive

# Install build dependencies
brew install boost cmake

# Use stand alone directory to keep source dir clean
mkdir build && cd build
cmake ..
cmake --build .
./tests
curl -O https://zero.sjeng.org/best-network
./leelaz --weights best-network

## Example of compiling and running - Windows

# Clone github repo
git clone https://github.com/gcp/leela-zero
cd leela-zero
git submodule update --init --recursive
cd msvc
Double-click the leela-zero2015.sln or leela-zero2017.sln corresponding
to the Visual Studio version you have.
# Build from Visual Studio 2015 or 2017
# Download <http://zero.sjeng.org/best-network> to msvc\x64\Release
# Download <https://zero.sjeng.org/best-network> to msvc\x64\Release
msvc\x64\Release\leelaz.exe --weights best-network

## Example of compiling and running - CMake (macOS/Ubuntu)

# Clone github repo
git clone https://github.com/gcp/leela-zero
cd leela-zero
git submodule update --init --recursive

# Use stand alone directory to keep source dir clean
mkdir build && cd build
cmake ..
make leelaz
make tests
./tests
curl -O https://zero.sjeng.org/best-network
./leelaz --weights best-network


# Usage

The engine supports the [GTP protocol, version 2](https://www.lysator.liu.se/~gunnar/gtp/gtp2-spec-draft2/gtp2-spec.html).
Expand Down Expand Up @@ -295,7 +293,6 @@ If interrupted, training can be resumed with:
- [ ] Implement GPU batching.
- [ ] Parameter setting over GTP.
- More backends:
- [ ] Eigen based BLAS backend.
- [ ] MKL-DNN based backend.
- [ ] CUDA specific version using cuDNN.
- [ ] AMD specific version using MIOpen.
Expand All @@ -319,5 +316,4 @@ https://medium.com/applied-data-science/alphago-zero-explained-in-one-diagram-36

# License

The code is released under the GPLv3 or later, except for ThreadPool.h, cl2.hpp,
half.hpp and the clblast_level3 subdirs, which have specific licenses (compatible with GPLv3) mentioned in those files.
The code is released under the GPLv3 or later, except for ThreadPool.h, cl2.hpp, half.hpp and the eigen and clblast_level3 subdirs, which have specific licenses (compatible with GPLv3) mentioned in those files.
3 changes: 3 additions & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ platform: x64
environment:
matrix:
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
features: USE_CPU_ONLY USE_BLAS
run_tests: 1
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
features: USE_CPU_ONLY
run_tests: 1
Expand Down
29 changes: 26 additions & 3 deletions src/CPUPipe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,23 @@
#include <cblas.h>
#endif
#ifndef USE_BLAS
#error "No non-BLAS implementation"
#include <Eigen/Dense>
#endif

#include "CPUPipe.h"
#include "Network.h"
#include "Im2Col.h"

#ifndef USE_BLAS
// Eigen helpers: zero-copy views that reinterpret raw float buffers as
// dynamically-sized matrices (Eigen's default column-major storage), so
// sgemm-style products can be computed in place without BLAS.
template <typename T>
using EigenMatrixMap =
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
// Read-only variant, for input operands.
template <typename T>
using ConstEigenMatrixMap =
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
#endif

// Record the network's input channel count for use by later forward passes.
void CPUPipe::initialize(int channels) {
m_input_channels = channels;
}
Expand Down Expand Up @@ -130,14 +140,20 @@ void CPUPipe::winograd_sgemm(const std::vector<float>& U,
const auto offset_u = b * K * C;
const auto offset_v = b * C * P;
const auto offset_m = b * K * P;

#ifdef USE_BLAS
cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
K, P, C,
1.0f,
&U[offset_u], K,
&V[offset_v], P,
0.0f,
&M[offset_m], P);
#else
auto C_mat = EigenMatrixMap<float>(M.data() + offset_m, P, K);
C_mat.noalias() =
ConstEigenMatrixMap<float>(V.data() + offset_v, P, C)
* ConstEigenMatrixMap<float>(U.data() + offset_u, K, C).transpose();
#endif
}
}

Expand Down Expand Up @@ -253,13 +269,20 @@ void convolve(const size_t outputs,
// passing a matrix A[m][n], the value should be m.
// cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
// ldb, beta, C, N);

#ifdef USE_BLAS
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
// M N K
outputs, num_intersections, filter_dim,
1.0f, &weights[0], filter_dim,
&col[0], num_intersections,
0.0f, &output[0], num_intersections);
#else
auto C_mat = EigenMatrixMap<float>(output.data(),
num_intersections, outputs);
C_mat.noalias() =
ConstEigenMatrixMap<float>(col.data(), num_intersections, filter_dim)
* ConstEigenMatrixMap<float>(weights.data(), filter_dim, outputs);
#endif

for (unsigned int o = 0; o < outputs; o++) {
for (unsigned int b = 0; b < num_intersections; b++) {
Expand Down
1 change: 1 addition & 0 deletions src/Eigen
Submodule Eigen added at a1b9c2
3 changes: 2 additions & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,15 @@ LIBS =

ifeq ($(THE_OS),Linux)
# for Linux with OpenBLAS
CXXFLAGS += -I/usr/include/openblas
CXXFLAGS += -I/usr/include/openblas -I./Eigen
DYNAMIC_LIBS += -lopenblas
DYNAMIC_LIBS += -lOpenCL
endif
ifeq ($(THE_OS),Darwin)
# for macOS (comment out the Linux part)
LIBS += -framework Accelerate
LIBS += -framework OpenCL
CXXFLAGS += -I./Eigen
CXXFLAGS += -I/System/Library/Frameworks/Accelerate.framework/Versions/Current/Headers
endif

Expand Down
32 changes: 28 additions & 4 deletions src/Network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
#include <boost/utility.hpp>
#include <boost/format.hpp>
#include <boost/spirit/home/x3.hpp>
#ifndef USE_BLAS
#include <Eigen/Dense>
#endif

#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
Expand Down Expand Up @@ -62,6 +65,19 @@
namespace x3 = boost::spirit::x3;
using namespace Utils;

#ifndef USE_BLAS
// Eigen helpers: zero-copy views over raw float buffers, used by the
// built-in (non-BLAS) linear-algebra code paths.
template <typename T>
using EigenVectorMap =
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
// Read-only column-vector view, for input operands.
template <typename T>
using ConstEigenVectorMap =
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>;
// Read-only matrix view (Eigen's default column-major storage).
template <typename T>
using ConstEigenMatrixMap =
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
#endif

// Symmetry helper
static std::array<std::array<int, NUM_INTERSECTIONS>,
Network::NUM_SYMMETRIES> symmetry_nn_idx_table;
Expand Down Expand Up @@ -362,6 +378,9 @@ void Network::initialize(int playouts, const std::string & weightsfile) {
myprintf("BLAS core: MKL %s\n", Version.Processor);
#endif
#endif
#else
myprintf("BLAS Core: built-in Eigen %d.%d.%d library.\n",
EIGEN_WORLD_VERSION, EIGEN_MAJOR_VERSION, EIGEN_MINOR_VERSION);
#endif

m_fwd_weights = std::make_shared<ForwardPipeWeights>();
Expand Down Expand Up @@ -519,8 +538,6 @@ void Network::initialize(int playouts, const std::string & weightsfile) {
m_fwd_weights.reset();
}

#ifdef USE_BLAS

template<unsigned int inputs,
unsigned int outputs,
bool ReLU,
Expand All @@ -530,13 +547,21 @@ std::vector<float> innerproduct(const std::vector<float>& input,
const std::array<float, outputs>& biases) {
std::vector<float> output(outputs);

#ifdef USE_BLAS
cblas_sgemv(CblasRowMajor, CblasNoTrans,
// M K
outputs, inputs,
1.0f, &weights[0], inputs,
&input[0], 1,
0.0f, &output[0], 1);

#else
EigenVectorMap<float> y(output.data(), outputs);
y.noalias() =
ConstEigenMatrixMap<float>(weights.data(),
inputs,
outputs).transpose()
* ConstEigenVectorMap<float>(input.data(), inputs);
#endif
const auto lambda_ReLU = [](const auto val) { return (val > 0.0f) ?
val : 0.0f; };
for (unsigned int o = 0; o < outputs; o++) {
Expand Down Expand Up @@ -577,7 +602,6 @@ void batchnorm(const size_t channels,
}
}
}
#endif

#ifdef USE_OPENCL_SELFCHECK
void Network::compare_net_outputs(const Netresult& data,
Expand Down
Loading

0 comments on commit 72431e2

Please sign in to comment.