Skip to content

Commit

Permalink
Include Eigen as BLAS replacement.
Browse files Browse the repository at this point in the history
This adds Eigen as a default matrix/vector library via a submodule.
This has a load of advantages:

* It can be used as a replacement for a cBLAS library when it is
  not available, cannot be found, or is outdated compared to the
  compiler or CPU.
* Because Eigen is header only, it significantly eases the build
  prerequisite requirements.
* The Eigen code paths are much more readable from a mathematical
  perspective.
* Eigen can optimize more heavily for known matrix sizes. The
  current code doesn't yet take advantage of this, though.

The downsides:

* Eigen might be a bit slower than other BLAS libraries. (Nevertheless,
  on my system it is faster than OpenBLAS)
* Binaries built with Eigen are optimized for the CPU they were compiled
  on and don't port as well to other CPUs. So you need separate binaries
  for wider client support.

* Default Eigen in CMake, add tests.

Default the Eigen library in CMake, as it's the fastest for most
contemporary CPUs and configurations, and the easiest to build.

We can optionally use BLAS by adding the USE_BLAS define, and
will try to locate BLAS/OpenBLAS if so. This is useful for
binaries for distribution such as our releases or distros.

Split all tests to cover both Eigen and BLAS.

Update build instructions to remove BLAS as a dependency, use CMake on
all Unixy platforms, and use HTTPS.

Pull request leela-zero#1692.
  • Loading branch information
gcp authored Sep 26, 2018
1 parent 142199c commit 72431e2
Show file tree
Hide file tree
Showing 14 changed files with 176 additions and 64 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Google Test framework, used by the unit-test targets.
[submodule "gtest"]
path = gtest
url = https://github.com/google/googletest.git
# Eigen linear-algebra library, used as the built-in BLAS replacement.
[submodule "src/Eigen"]
path = src/Eigen
url = https://github.com/eigenteam/eigen-git-mirror
11 changes: 10 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,21 @@ jobs:
script:
- docker build -f Dockerfiles/Dockerfile.gpu -t leela-zero:gpu .
- docker run leela-zero:gpu
- script:
- docker build -f Dockerfiles/Dockerfile.gpu-blas -t leela-zero:gpu-blas .
- docker run leela-zero:gpu-blas
- script:
- docker build -f Dockerfiles/Dockerfile.cpu -t leela-zero:cpu .
- docker run leela-zero:cpu
- script:
- docker build -f Dockerfiles/Dockerfile.cpu-blas -t leela-zero:cpu-blas .
- docker run leela-zero:cpu-blas
- script:
- docker build -f Dockerfiles/Dockerfile.tests -t leela-zero:tests .
- docker run leela-zero:tests
- script:
- docker build -f Dockerfiles/Dockerfile.tests-blas -t leela-zero:tests-blas .
- docker run leela-zero:tests-blas
- stage: style
before_install:
script: find . -regex ".*\.\(cpp\|h\|hpp\)" -not -regex ".*moc_.*.cpp" -not -path "./gtest/*" -not -path "./training/*" -not -path "./src/half/*" -not -path "./src/CL/*" | xargs python2 scripts/cpplint.py --filter=-build/c++11,-build/include,-build/include_order,-build/include_what_you_use,-build/namespaces,-readability/braces,-readability/casting,-readability/fn_size,-readability/namespace,-readability/todo,-runtime/explicit,-runtime/indentation_namespace,-runtime/int,-runtime/references,-whitespace/blank_line,-whitespace/braces,-whitespace/comma,-whitespace/comments,-whitespace/empty_loop_body,-whitespace/line_length,-whitespace/semicolon
script: find . -regex ".*\.\(cpp\|h\|hpp\)" -not -regex ".*moc_.*.cpp" -not -path "./gtest/*" -not -path "./training/*" -not -path "./src/half/*" -not -path "./src/CL/*" -not -path "./src/Eigen/*" | xargs python2 scripts/cpplint.py --filter=-build/c++11,-build/include,-build/include_order,-build/include_what_you_use,-build/namespaces,-readability/braces,-readability/casting,-readability/fn_size,-readability/namespace,-readability/todo,-runtime/explicit,-runtime/indentation_namespace,-runtime/int,-runtime/references,-whitespace/blank_line,-whitespace/braces,-whitespace/comma,-whitespace/comments,-whitespace/empty_loop_body,-whitespace/line_length,-whitespace/semicolon
24 changes: 15 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,20 @@ find_package(OpenCL REQUIRED)
if(NOT APPLE)
set(BLA_VENDOR OpenBLAS)
endif()
find_package(BLAS REQUIRED)
find_path(BLAS_INCLUDE_DIRS openblas_config.h
/usr/include
/usr/local/include
/usr/include/openblas
/opt/OpenBLAS/include
/usr/include/x86_64-linux-gnu
$ENV{BLAS_HOME}/include)
if(USE_BLAS)
message(STATUS "Looking for system BLAS/OpenBLAS library.")
find_package(BLAS REQUIRED)
find_path(BLAS_INCLUDE_DIRS openblas_config.h
/usr/include
/usr/local/include
/usr/include/openblas
/opt/OpenBLAS/include
/usr/include/x86_64-linux-gnu
$ENV{BLAS_HOME}/include)
add_definitions(-DUSE_BLAS)
else()
message(STATUS "Using built-in matrix library.")
endif()
find_package(Qt5Core)

set(CMAKE_CXX_STANDARD 14)
Expand Down Expand Up @@ -79,7 +85,7 @@ if(USE_HALF)
add_definitions(-DUSE_HALF)
endif()

set(IncludePath "${CMAKE_CURRENT_SOURCE_DIR}/src")
set(IncludePath "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}/src/Eigen")
set(SrcPath "${CMAKE_CURRENT_SOURCE_DIR}/src")

include_directories(${IncludePath})
Expand Down
6 changes: 6 additions & 0 deletions Dockerfiles/Dockerfile.cpu-blas
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
FROM leela-zero:base

# CPU-only build, linking against the system BLAS/OpenBLAS (-DUSE_BLAS)
# instead of the bundled Eigen library.
RUN CXX=g++ CC=gcc cmake -DUSE_CPU_ONLY=1 -DUSE_BLAS=1 ..

# Build only the leelaz engine target at container start.
CMD cmake --build . --target leelaz --config Release -- -j2
6 changes: 6 additions & 0 deletions Dockerfiles/Dockerfile.gpu-blas
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
FROM leela-zero:base

# GPU build, linking against the system BLAS/OpenBLAS (-DUSE_BLAS)
# instead of the bundled Eigen library.
RUN CXX=g++ CC=gcc cmake -DUSE_BLAS=1 ..

# Build only the leelaz engine target at container start.
CMD cmake --build . --target leelaz --config Release -- -j2
8 changes: 8 additions & 0 deletions Dockerfiles/Dockerfile.tests-blas
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM leela-zero:base

# CPU-only build against the system BLAS/OpenBLAS (-DUSE_BLAS),
# compiling the unit-test suite rather than the engine binary.
RUN CXX=g++ CC=gcc cmake -DUSE_CPU_ONLY=1 -DUSE_BLAS=1 ..
RUN cmake --build . --target tests --config Release -- -j2

# Run the unit tests on container start.
CMD ./tests

68 changes: 32 additions & 36 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ launch autogtp.

# I just want to play right now

Download the best known network weights file from: http://zero.sjeng.org/best-network
Download the best known network weights file from: https://zero.sjeng.org/best-network

And head to the [Usage](#usage) section of this README.

Expand All @@ -70,7 +70,6 @@ If you prefer a more human style, a network trained from human games is availabl

* GCC, Clang or MSVC, any C++14 compiler
* Boost 1.58.x or later, headers and program_options, filesystem and system libraries (libboost-dev, libboost-program-options-dev and libboost-filesystem-dev on Debian/Ubuntu)
* BLAS Library: OpenBLAS (libopenblas-dev) or (optionally) Intel MKL
* zlib library (zlib1g & zlib1g-dev on Debian/Ubuntu)
* Standard OpenCL C headers (opencl-headers on Debian/Ubuntu, or at
https://github.com/KhronosGroup/OpenCL-Headers/tree/master/opencl22/)
Expand All @@ -79,62 +78,61 @@ https://github.com/KhronosGroup/OpenCL-Headers/tree/master/opencl22/)
drivers is strongly recommended (OpenCL 1.1 support is enough).
If you do not have a GPU, modify config.h in the source and remove
the line that says "#define USE_OPENCL".
* Optional: BLAS Library: OpenBLAS (libopenblas-dev) or Intel MKL
* The program has been tested on Windows, Linux and macOS.

## Example of compiling and running - Ubuntu
## Example of compiling and running - Ubuntu & similar

# Test for OpenCL support & compatibility
sudo apt install clinfo && clinfo

# Clone github repo
git clone https://github.com/gcp/leela-zero
cd leela-zero/src
sudo apt install libboost-dev libboost-program-options-dev libboost-filesystem-dev libopenblas-dev opencl-headers ocl-icd-libopencl1 ocl-icd-opencl-dev zlib1g-dev
make
cd ..
wget http://zero.sjeng.org/best-network
src/leelaz --weights best-network
cd leela-zero
git submodule update --init --recursive

# Install build dependencies
sudo apt install libboost-dev libboost-program-options-dev libboost-filesystem-dev opencl-headers ocl-icd-libopencl1 ocl-icd-opencl-dev zlib1g-dev

# Use stand alone directory to keep source dir clean
mkdir build && cd build
cmake ..
cmake --build .
./tests
curl -O https://zero.sjeng.org/best-network
./leelaz --weights best-network

## Example of compiling and running - macOS

# Clone github repo
git clone https://github.com/gcp/leela-zero
cd leela-zero/src
brew install boost
make
cd ..
curl -O http://zero.sjeng.org/best-network
src/leelaz --weights best-network
cd leela-zero
git submodule update --init --recursive

# Install build dependencies
brew install boost cmake

# Use stand alone directory to keep source dir clean
mkdir build && cd build
cmake ..
cmake --build .
./tests
curl -O https://zero.sjeng.org/best-network
./leelaz --weights best-network

## Example of compiling and running - Windows

# Clone github repo
git clone https://github.com/gcp/leela-zero
cd leela-zero
git submodule update --init --recursive
cd msvc
Double-click the leela-zero2015.sln or leela-zero2017.sln corresponding
to the Visual Studio version you have.
# Build from Visual Studio 2015 or 2017
# Download <http://zero.sjeng.org/best-network> to msvc\x64\Release
# Download <https://zero.sjeng.org/best-network> to msvc\x64\Release
msvc\x64\Release\leelaz.exe --weights best-network

## Example of compiling and running - CMake (macOS/Ubuntu)

# Clone github repo
git clone https://github.com/gcp/leela-zero
cd leela-zero
git submodule update --init --recursive

# Use stand alone directory to keep source dir clean
mkdir build && cd build
cmake ..
make leelaz
make tests
./tests
curl -O https://zero.sjeng.org/best-network
./leelaz --weights best-network


# Usage

The engine supports the [GTP protocol, version 2](https://www.lysator.liu.se/~gunnar/gtp/gtp2-spec-draft2/gtp2-spec.html).
Expand Down Expand Up @@ -295,7 +293,6 @@ If interrupted, training can be resumed with:
- [ ] Implement GPU batching.
- [ ] Parameter setting over GTP.
- More backends:
- [ ] Eigen based BLAS backend.
- [ ] MKL-DNN based backend.
- [ ] CUDA specific version using cuDNN.
- [ ] AMD specific version using MIOpen.
Expand All @@ -319,5 +316,4 @@ https://medium.com/applied-data-science/alphago-zero-explained-in-one-diagram-36

# License

The code is released under the GPLv3 or later, except for ThreadPool.h, cl2.hpp,
half.hpp and the clblast_level3 subdirs, which have specific licenses (compatible with GPLv3) mentioned in those files.
The code is released under the GPLv3 or later, except for ThreadPool.h, cl2.hpp, half.hpp and the eigen and clblast_level3 subdirs, which have specific licenses (compatible with GPLv3) mentioned in those files.
3 changes: 3 additions & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ platform: x64
environment:
matrix:
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
features: USE_CPU_ONLY USE_BLAS
run_tests: 1
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
features: USE_CPU_ONLY
run_tests: 1
Expand Down
29 changes: 26 additions & 3 deletions src/CPUPipe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,23 @@
#include <cblas.h>
#endif
#ifndef USE_BLAS
#error "No non-BLAS implementation"
#include <Eigen/Dense>
#endif

#include "CPUPipe.h"
#include "Network.h"
#include "Im2Col.h"

#ifndef USE_BLAS
// Eigen helpers: zero-copy views that reinterpret raw float buffers as
// dynamically-sized matrices (Eigen's default column-major storage), so
// sgemm-style products can be computed in place without BLAS.
template <typename T>
using EigenMatrixMap =
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
// Read-only variant, for input operands.
template <typename T>
using ConstEigenMatrixMap =
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
#endif

// Record the network's input channel count for use by later forward passes.
void CPUPipe::initialize(int channels) {
m_input_channels = channels;
}
Expand Down Expand Up @@ -130,14 +140,20 @@ void CPUPipe::winograd_sgemm(const std::vector<float>& U,
const auto offset_u = b * K * C;
const auto offset_v = b * C * P;
const auto offset_m = b * K * P;

#ifdef USE_BLAS
cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
K, P, C,
1.0f,
&U[offset_u], K,
&V[offset_v], P,
0.0f,
&M[offset_m], P);
#else
auto C_mat = EigenMatrixMap<float>(M.data() + offset_m, P, K);
C_mat.noalias() =
ConstEigenMatrixMap<float>(V.data() + offset_v, P, C)
* ConstEigenMatrixMap<float>(U.data() + offset_u, K, C).transpose();
#endif
}
}

Expand Down Expand Up @@ -253,13 +269,20 @@ void convolve(const size_t outputs,
// passing a matrix A[m][n], the value should be m.
// cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
// ldb, beta, C, N);

#ifdef USE_BLAS
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
// M N K
outputs, num_intersections, filter_dim,
1.0f, &weights[0], filter_dim,
&col[0], num_intersections,
0.0f, &output[0], num_intersections);
#else
auto C_mat = EigenMatrixMap<float>(output.data(),
num_intersections, outputs);
C_mat.noalias() =
ConstEigenMatrixMap<float>(col.data(), num_intersections, filter_dim)
* ConstEigenMatrixMap<float>(weights.data(), filter_dim, outputs);
#endif

for (unsigned int o = 0; o < outputs; o++) {
for (unsigned int b = 0; b < num_intersections; b++) {
Expand Down
1 change: 1 addition & 0 deletions src/Eigen
Submodule Eigen added at a1b9c2
3 changes: 2 additions & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,15 @@ LIBS =

ifeq ($(THE_OS),Linux)
# for Linux with OpenBLAS
CXXFLAGS += -I/usr/include/openblas
CXXFLAGS += -I/usr/include/openblas -I./Eigen
DYNAMIC_LIBS += -lopenblas
DYNAMIC_LIBS += -lOpenCL
endif
ifeq ($(THE_OS),Darwin)
# for macOS (comment out the Linux part)
LIBS += -framework Accelerate
LIBS += -framework OpenCL
CXXFLAGS += -I./Eigen
CXXFLAGS += -I/System/Library/Frameworks/Accelerate.framework/Versions/Current/Headers
endif

Expand Down
32 changes: 28 additions & 4 deletions src/Network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
#include <boost/utility.hpp>
#include <boost/format.hpp>
#include <boost/spirit/home/x3.hpp>
#ifndef USE_BLAS
#include <Eigen/Dense>
#endif

#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
Expand Down Expand Up @@ -62,6 +65,19 @@
namespace x3 = boost::spirit::x3;
using namespace Utils;

#ifndef USE_BLAS
// Eigen helpers: zero-copy views over raw float buffers, used by the
// built-in (non-BLAS) linear-algebra code paths.
template <typename T>
using EigenVectorMap =
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
// Read-only column-vector view, for input operands.
template <typename T>
using ConstEigenVectorMap =
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>;
// Read-only matrix view (Eigen's default column-major storage).
template <typename T>
using ConstEigenMatrixMap =
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
#endif

// Symmetry helper
static std::array<std::array<int, NUM_INTERSECTIONS>,
Network::NUM_SYMMETRIES> symmetry_nn_idx_table;
Expand Down Expand Up @@ -362,6 +378,9 @@ void Network::initialize(int playouts, const std::string & weightsfile) {
myprintf("BLAS core: MKL %s\n", Version.Processor);
#endif
#endif
#else
myprintf("BLAS Core: built-in Eigen %d.%d.%d library.\n",
EIGEN_WORLD_VERSION, EIGEN_MAJOR_VERSION, EIGEN_MINOR_VERSION);
#endif

m_fwd_weights = std::make_shared<ForwardPipeWeights>();
Expand Down Expand Up @@ -519,8 +538,6 @@ void Network::initialize(int playouts, const std::string & weightsfile) {
m_fwd_weights.reset();
}

#ifdef USE_BLAS

template<unsigned int inputs,
unsigned int outputs,
bool ReLU,
Expand All @@ -530,13 +547,21 @@ std::vector<float> innerproduct(const std::vector<float>& input,
const std::array<float, outputs>& biases) {
std::vector<float> output(outputs);

#ifdef USE_BLAS
cblas_sgemv(CblasRowMajor, CblasNoTrans,
// M K
outputs, inputs,
1.0f, &weights[0], inputs,
&input[0], 1,
0.0f, &output[0], 1);

#else
EigenVectorMap<float> y(output.data(), outputs);
y.noalias() =
ConstEigenMatrixMap<float>(weights.data(),
inputs,
outputs).transpose()
* ConstEigenVectorMap<float>(input.data(), inputs);
#endif
const auto lambda_ReLU = [](const auto val) { return (val > 0.0f) ?
val : 0.0f; };
for (unsigned int o = 0; o < outputs; o++) {
Expand Down Expand Up @@ -577,7 +602,6 @@ void batchnorm(const size_t channels,
}
}
}
#endif

#ifdef USE_OPENCL_SELFCHECK
void Network::compare_net_outputs(const Netresult& data,
Expand Down
Loading

0 comments on commit 72431e2

Please sign in to comment.