Skip to content

Commit

Permalink
Merge bitcoin#13191: Specialized double-SHA256 with 64 byte inputs wi…
Browse files Browse the repository at this point in the history
…th SSE4.1 and AVX2

4defdfa [MOVEONLY] Move unused Merkle branch code to tests (Pieter Wuille)
4437d6e 8-way AVX2 implementation for double SHA256 on 64-byte inputs (Pieter Wuille)
230294b 4-way SSE4.1 implementation for double SHA256 on 64-byte inputs (Pieter Wuille)
1f0e7ca Use SHA256D64 in Merkle root computation (Pieter Wuille)
d0c9632 Specialized double sha256 for 64 byte inputs (Pieter Wuille)
57f3463 Refactor SHA256 code (Pieter Wuille)
0df0178 Benchmark Merkle root computation (Pieter Wuille)

Pull request description:

  This introduces a framework for specialized double-SHA256 with 64 byte inputs. 4 different implementations are provided:
  * Generic C++ (reusing the normal SHA256 code)
  * Specialized C++ for 64-byte inputs, but no special instructions
  * 4-way using SSE4.1 intrinsics
  * 8-way using AVX2 intrinsics

  On my own system (AVX2 capable), I get these benchmarks for computing the Merkle root of 9001 leaves (supported lengths / special instructions / parallellism):
  * 7.2 ms with varsize/naive/1way (master, non-SSE4 hardware)
  * 5.8 ms with size64/naive/1way (this PR, non-SSE4 capable systems)
  * 4.8 ms with varsize/SSE4/1way (master, SSE4 hardware)
  * 2.9 ms with size64/SSE4/4way (this PR, SSE4 hardware)
  * 1.1 ms with size64/AVX2/8way (this PR, AVX2 hardware)

Tree-SHA512: efa32d48b32820d9ce788ead4eb583949265be8c2e5f538c94bc914e92d131a57f8c1ee26c6f998e81fb0e30675d4e2eddc3360bcf632676249036018cff343e
  • Loading branch information
laanwj committed Jun 4, 2018
2 parents 2722a1f + 4defdfa commit 0de7cc8
Show file tree
Hide file tree
Showing 16 changed files with 1,347 additions and 204 deletions.
44 changes: 44 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ fi
# be compiled with them, rather that specific objects/libs may use them after checking for runtime
# compatibility.
AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]])

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SSE42_CXXFLAGS"
Expand All @@ -335,6 +337,44 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
)
CXXFLAGS="$TEMP_CXXFLAGS"

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SSE41_CXXFLAGS"
AC_MSG_CHECKING(for SSE4.1 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
]],[[
__m128i l = _mm_set1_epi32(0);
return _mm_extract_epi32(l, 3);
]])],
[ AC_MSG_RESULT(yes); enable_sse41=yes; AC_DEFINE(ENABLE_SSE41, 1, [Define this symbol to build code that uses SSE4.1 intrinsics]) ],
[ AC_MSG_RESULT(no)]
)
CXXFLAGS="$TEMP_CXXFLAGS"

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $AVX2_CXXFLAGS"
AC_MSG_CHECKING(for AVX2 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__) && defined(__AVX2__)
#include <x86intrin.h>
#endif
]],[[
__m256i l = _mm256_set1_epi32(0);
return _mm256_extract_epi32(l, 7);
]])],
[ AC_MSG_RESULT(yes); enable_avx2=yes; AC_DEFINE(ENABLE_AVX2, 1, [Define this symbol to build code that uses AVX2 intrinsics]) ],
[ AC_MSG_RESULT(no)]
)
CXXFLAGS="$TEMP_CXXFLAGS"

CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS"

AC_ARG_WITH([utils],
Expand Down Expand Up @@ -1253,6 +1293,8 @@ AM_CONDITIONAL([USE_LCOV],[test x$use_lcov = xyes])
AM_CONDITIONAL([GLIBC_BACK_COMPAT],[test x$use_glibc_compat = xyes])
AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])
AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes])
AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes])
AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes])

AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version])
Expand Down Expand Up @@ -1295,6 +1337,8 @@ AC_SUBST(PIE_FLAGS)
AC_SUBST(SANITIZER_CXXFLAGS)
AC_SUBST(SANITIZER_LDFLAGS)
AC_SUBST(SSE42_CXXFLAGS)
AC_SUBST(SSE41_CXXFLAGS)
AC_SUBST(AVX2_CXXFLAGS)
AC_SUBST(LIBTOOL_APP_LDFLAGS)
AC_SUBST(USE_UPNP)
AC_SUBST(USE_QRCODE)
Expand Down
28 changes: 27 additions & 1 deletion src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ LIBBITCOIN_CONSENSUS=libbitcoin_consensus.a
LIBBITCOIN_CLI=libbitcoin_cli.a
LIBBITCOIN_UTIL=libbitcoin_util.a
LIBBITCOIN_CRYPTO=crypto/libbitcoin_crypto.a
LIBBITCOIN_CRYPTO_SSE41=crypto/libbitcoin_crypto_sse41.a
LIBBITCOIN_CRYPTO_AVX2=crypto/libbitcoin_crypto_avx2.a
LIBBITCOINQT=qt/libbitcoinqt.a
LIBSECP256K1=secp256k1/libsecp256k1.la

Expand All @@ -50,6 +52,8 @@ $(LIBSECP256K1): $(wildcard secp256k1/src/*) $(wildcard secp256k1/include/*)
# But to build the less dependent modules first, we manually select their order here:
EXTRA_LIBRARIES += \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_COMMON) \
$(LIBBITCOIN_CONSENSUS) \
Expand Down Expand Up @@ -289,6 +293,22 @@ if USE_ASM
crypto_libbitcoin_crypto_a_SOURCES += crypto/sha256_sse4.cpp
endif

crypto_libbitcoin_crypto_sse41_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
crypto_libbitcoin_crypto_sse41_a_CPPFLAGS = $(AM_CPPFLAGS)
if ENABLE_SSE41
crypto_libbitcoin_crypto_sse41_a_CXXFLAGS += $(SSE41_CXXFLAGS)
crypto_libbitcoin_crypto_sse41_a_CPPFLAGS += -DENABLE_SSE41
endif
crypto_libbitcoin_crypto_sse41_a_SOURCES = crypto/sha256_sse41.cpp

crypto_libbitcoin_crypto_avx2_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS = $(AM_CPPFLAGS)
if ENABLE_AVX2
crypto_libbitcoin_crypto_avx2_a_CXXFLAGS += $(AVX2_CXXFLAGS)
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS += -DENABLE_AVX2
endif
crypto_libbitcoin_crypto_avx2_a_SOURCES = crypto/sha256_avx2.cpp

# consensus: shared between all executables that validate any consensus rules.
libbitcoin_consensus_a_CPPFLAGS = $(AM_CPPFLAGS) $(BITCOIN_INCLUDES)
libbitcoin_consensus_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
Expand Down Expand Up @@ -411,6 +431,8 @@ bitcoind_LDADD = \
$(LIBBITCOIN_ZMQ) \
$(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) \
$(LIBMEMENV) \
Expand All @@ -432,7 +454,9 @@ bitcoin_cli_LDADD = \
$(LIBBITCOIN_CLI) \
$(LIBUNIVALUE) \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_CRYPTO)
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2)

bitcoin_cli_LDADD += $(BOOST_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(EVENT_LIBS)
#
Expand All @@ -453,6 +477,8 @@ bitcoin_tx_LDADD = \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBSECP256K1)

bitcoin_tx_LDADD += $(BOOST_LIBS) $(CRYPTO_LIBS)
Expand Down
3 changes: 3 additions & 0 deletions src/Makefile.bench.include
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ bench_bench_bitcoin_SOURCES = \
bench/rollingbloom.cpp \
bench/crypto_hash.cpp \
bench/ccoins_caching.cpp \
bench/merkle_root.cpp \
bench/mempool_eviction.cpp \
bench/verify_script.cpp \
bench/base58.cpp \
Expand All @@ -38,6 +39,8 @@ bench_bench_bitcoin_LDADD = \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) \
$(LIBMEMENV) \
Expand Down
2 changes: 1 addition & 1 deletion src/Makefile.qt.include
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ endif
if ENABLE_ZMQ
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
endif
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
$(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
qt_bitcoin_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)
Expand Down
2 changes: 1 addition & 1 deletion src/Makefile.qttest.include
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ endif
if ENABLE_ZMQ
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
endif
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) $(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
$(QR_LIBS) $(PROTOBUF_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
Expand Down
5 changes: 4 additions & 1 deletion src/Makefile.test.include
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ test_test_bitcoin_LDADD =
if ENABLE_WALLET
test_test_bitcoin_LDADD += $(LIBBITCOIN_WALLET)
endif
test_test_bitcoin_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \

test_test_bitcoin_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) \
$(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
test_test_bitcoin_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)

Expand All @@ -134,6 +135,8 @@ test_test_bitcoin_fuzzy_LDADD = \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBSECP256K1)

test_test_bitcoin_fuzzy_LDADD += $(BOOST_LIBS) $(CRYPTO_LIBS)
Expand Down
9 changes: 9 additions & 0 deletions src/bench/crypto_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ static void SHA256_32b(benchmark::State& state)
}
}

static void SHA256D64_1024(benchmark::State& state)
{
std::vector<uint8_t> in(64 * 1024, 0);
while (state.KeepRunning()) {
SHA256D64(in.data(), in.data(), 1024);
}
}

static void SHA512(benchmark::State& state)
{
uint8_t hash[CSHA512::OUTPUT_SIZE];
Expand Down Expand Up @@ -94,5 +102,6 @@ BENCHMARK(SHA512, 330);

BENCHMARK(SHA256_32b, 4700 * 1000);
BENCHMARK(SipHash_32b, 40 * 1000 * 1000);
BENCHMARK(SHA256D64_1024, 7400);
BENCHMARK(FastRandom_32bit, 110 * 1000 * 1000);
BENCHMARK(FastRandom_1bit, 440 * 1000 * 1000);
26 changes: 26 additions & 0 deletions src/bench/merkle_root.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Copyright (c) 2016 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.

#include "bench.h"

#include "uint256.h"
#include "random.h"
#include "consensus/merkle.h"

static void MerkleRoot(benchmark::State& state)
{
FastRandomContext rng(true);
std::vector<uint256> leaves;
leaves.resize(9001);
for (auto& item : leaves) {
item = rng.rand256();
}
while (state.KeepRunning()) {
bool mutation = false;
uint256 hash = ComputeMerkleRoot(std::vector<uint256>(leaves), &mutation);
leaves[mutation] = hash;
}
}

BENCHMARK(MerkleRoot, 800);
Loading

0 comments on commit 0de7cc8

Please sign in to comment.