Skip to content

Commit

Permalink
SHA256 implementations based on Intel SHA Extensions
Browse files Browse the repository at this point in the history
Summary:
 * [Refactor] CPU feature detection logic for SHA256

 * Add SHA256 implementation using using Intel SHA intrinsics

 * Use immintrin.h everywhere for intrinsics

 * Use __cpuid_count for gnu C to avoid gitian build fail.

This is a backport of Core PR13386 and PR13611

Depends on D1850

Test Plan:
  make check

Reviewers: #bitcoin_abc, schancel

Reviewed By: #bitcoin_abc, schancel

Subscribers: teamcity

Differential Revision: https://reviews.bitcoinabc.org/D1851
  • Loading branch information
sipa authored and deadalnix committed Oct 9, 2018
1 parent 05a2afd commit b575d39
Show file tree
Hide file tree
Showing 7 changed files with 472 additions and 41 deletions.
28 changes: 20 additions & 8 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ fi
AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-msse4 -msha],[[SHANI_CXXFLAGS="-msse4 -msha"]],,[[$CXXFLAG_WERROR]])

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SSE42_CXXFLAGS"
Expand Down Expand Up @@ -357,11 +358,7 @@ CXXFLAGS="$CXXFLAGS $SSE41_CXXFLAGS"
AC_MSG_CHECKING(for SSE4.1 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
]],[[
__m128i l = _mm_set1_epi32(0);
return _mm_extract_epi32(l, 3);
Expand All @@ -376,11 +373,7 @@ CXXFLAGS="$CXXFLAGS $AVX2_CXXFLAGS"
AC_MSG_CHECKING(for AVX2 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__) && defined(__AVX2__)
#include <x86intrin.h>
#endif
]],[[
__m256i l = _mm256_set1_epi32(0);
return _mm256_extract_epi32(l, 7);
Expand All @@ -390,6 +383,23 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
)
CXXFLAGS="$TEMP_CXXFLAGS"

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SHANI_CXXFLAGS"
AC_MSG_CHECKING(for SHA-NI intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#include <immintrin.h>
]],[[
__m128i i = _mm_set1_epi32(0);
__m128i j = _mm_set1_epi32(1);
__m128i k = _mm_set1_epi32(2);
return _mm_extract_epi32(_mm_sha256rnds2_epu32(i, i, k), 0);
]])],
[ AC_MSG_RESULT(yes); enable_shani=yes; AC_DEFINE(ENABLE_SHANI, 1, [Define this symbol to build code that uses SHA-NI intrinsics]) ],
[ AC_MSG_RESULT(no)]
)
CXXFLAGS="$TEMP_CXXFLAGS"

CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS"

AC_ARG_WITH([utils],
Expand Down Expand Up @@ -1266,6 +1276,7 @@ AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])
AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes])
AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes])
AM_CONDITIONAL([ENABLE_SHANI],[test x$enable_shani = xyes])
AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes])

AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version])
Expand Down Expand Up @@ -1304,6 +1315,7 @@ AC_SUBST(PIE_FLAGS)
AC_SUBST(SSE42_CXXFLAGS)
AC_SUBST(SSE41_CXXFLAGS)
AC_SUBST(AVX2_CXXFLAGS)
AC_SUBST(SHANI_CXXFLAGS)
AC_SUBST(LIBTOOL_APP_LDFLAGS)
AC_SUBST(USE_UPNP)
AC_SUBST(USE_QRCODE)
Expand Down
2 changes: 1 addition & 1 deletion doc/release-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ Bitcoin ABC version 0.18.3 is now available from:
This release includes the following features and fixes:
- Remove support for Qt4
- Upgrade reproducible build to us Qt 5.9.6
- Improve SHA256 performance using SSE4.1 or AVX2 if available.
- Improve SHA256 performance using SSE4.1, AVX2 and/or SHA if available.

10 changes: 10 additions & 0 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ if ENABLE_AVX2
LIBBITCOIN_CRYPTO_AVX2 = crypto/libbitcoin_crypto_avx2.a
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_AVX2)
endif
if ENABLE_SHANI
LIBBITCOIN_CRYPTO_SHANI = crypto/libbitcoin_crypto_shani.a
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_SHANI)
endif

$(LIBSECP256K1): $(wildcard secp256k1/src/*) $(wildcard secp256k1/include/*)
$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C $(@D) $(@F)
Expand Down Expand Up @@ -326,6 +330,12 @@ crypto_libbitcoin_crypto_avx2_a_CXXFLAGS += $(AVX2_CXXFLAGS)
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS += -DENABLE_AVX2
crypto_libbitcoin_crypto_avx2_a_SOURCES = crypto/sha256_avx2.cpp

crypto_libbitcoin_crypto_shani_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
crypto_libbitcoin_crypto_shani_a_CPPFLAGS = $(AM_CPPFLAGS)
crypto_libbitcoin_crypto_shani_a_CXXFLAGS += $(SHANI_CXXFLAGS)
crypto_libbitcoin_crypto_shani_a_CPPFLAGS += -DENABLE_SHANI
crypto_libbitcoin_crypto_shani_a_SOURCES = crypto/sha256_shani.cpp

# consensus: shared between all executables that validate any consensus rules.
libbitcoin_consensus_a_CPPFLAGS = $(AM_CPPFLAGS) $(BITCOIN_INCLUDES)
libbitcoin_consensus_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
Expand Down
95 changes: 77 additions & 18 deletions src/crypto/sha256.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <cassert>
#include <cstring>

#if defined(__x86_64__) || defined(__amd64__)
#if defined(__x86_64__) || defined(__amd64__) || defined(__i386__)
#if defined(USE_ASM)
#include <cpuid.h>
namespace sha256_sse4 {
Expand All @@ -26,6 +26,14 @@ namespace sha256d64_avx2 {
void Transform_8way(uint8_t *out, const uint8_t *in);
}

namespace sha256d64_shani {
void Transform_2way(unsigned char *out, const unsigned char *in);
}

namespace sha256_shani {
void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks);
}

// Internal implementation code.
namespace {
/// Internal SHA-256 implementation.
Expand Down Expand Up @@ -607,6 +615,7 @@ void TransformD64Wrapper(uint8_t *out, const uint8_t *in) {

TransformType Transform = sha256::Transform;
TransformD64Type TransformD64 = sha256::TransformD64;
TransformD64Type TransformD64_2way = nullptr;
TransformD64Type TransformD64_4way = nullptr;
TransformD64Type TransformD64_8way = nullptr;

Expand Down Expand Up @@ -689,6 +698,13 @@ bool SelfTest() {
if (!std::equal(out, out + 32, result_d64)) return false;
}

// Test TransformD64_2way, if available.
if (TransformD64_2way) {
unsigned char out[64];
TransformD64_2way(out, data + 1);
if (!std::equal(out, out + 64, result_d64)) return false;
}

// Test TransformD64_4way, if available.
if (TransformD64_4way) {
uint8_t out[128];
Expand All @@ -709,11 +725,15 @@ bool SelfTest() {
#if defined(USE_ASM) && \
(defined(__x86_64__) || defined(__amd64__) || defined(__i386__))
// We can't use cpuid.h's __get_cpuid as it does not support subleafs.
void inline cpuid(uint32_t leaf, uint32_t subleaf, uint32_t &a, uint32_t &b,
inline void cpuid(uint32_t leaf, uint32_t subleaf, uint32_t &a, uint32_t &b,
uint32_t &c, uint32_t &d) {
#ifdef __GNUC__
__cpuid_count(leaf, subleaf, a, b, c, d);
#else
__asm__("cpuid"
: "=a"(a), "=b"(b), "=c"(c), "=d"(d)
: "0"(leaf), "2"(subleaf));
#endif
}

/** Check whether the OS has enabled AVX registers. */
Expand All @@ -729,33 +749,64 @@ std::string SHA256AutoDetect() {
std::string ret = "standard";
#if defined(USE_ASM) && \
(defined(__x86_64__) || defined(__amd64__) || defined(__i386__))
// Silence unused warning (in case ENABLE_AVX2 is not defined)
bool have_sse4 = false;
bool have_xsave = false;
bool have_avx = false;
bool have_avx2 = false;
bool have_shani = false;
bool enabled_avx = false;

(void)AVXEnabled;
(void)have_sse4;
(void)have_avx;
(void)have_xsave;
(void)have_avx2;
(void)have_shani;
(void)enabled_avx;

uint32_t eax, ebx, ecx, edx;
cpuid(1, 0, eax, ebx, ecx, edx);
if ((ecx >> 19) & 1) {
have_sse4 = (ecx >> 19) & 1;
have_xsave = (ecx >> 27) & 1;
have_avx = (ecx >> 28) & 1;
if (have_xsave && have_avx) {
enabled_avx = AVXEnabled();
}
if (have_sse4) {
cpuid(7, 0, eax, ebx, ecx, edx);
have_avx2 = (ebx >> 5) & 1;
have_shani = (ebx >> 29) & 1;
}

#if defined(ENABLE_SHANI) && !defined(BUILD_BITCOIN_INTERNAL)
if (have_shani) {
Transform = sha256_shani::Transform;
TransformD64 = TransformD64Wrapper<sha256_shani::Transform>;
TransformD64_2way = sha256d64_shani::Transform_2way;
ret = "shani(1way,2way)";
have_sse4 = false; // Disable SSE4/AVX2;
have_avx2 = false;
}
#endif

if (have_sse4) {
#if defined(__x86_64__) || defined(__amd64__)
Transform = sha256_sse4::Transform;
TransformD64 = TransformD64Wrapper<sha256_sse4::Transform>;
ret = "sse4(1way)";
#endif
#if defined(ENABLE_SSE41) && !defined(BUILD_BITCOIN_INTERNAL)
TransformD64_4way = sha256d64_sse41::Transform_4way;
ret = "sse4(1way+4way)";
#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
if (((ecx >> 27) & 1) && ((ecx >> 28) & 1)) { // XSAVE and AVX
cpuid(7, 0, eax, ebx, ecx, edx);
if ((ebx >> 5) & 1) { // AVX2 flag
if (AVXEnabled()) { // OS has enabled AVX registers
TransformD64_8way = sha256d64_avx2::Transform_8way;
ret += ",avx2(8way)";
}
}
}
#endif
#else
ret = "sse4";
ret += ",sse41(4way)";
#endif
}

#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
if (have_avx2 && have_avx && enabled_avx) {
TransformD64_8way = sha256d64_avx2::Transform_8way;
ret += ",avx2(8way)";
}
#endif
#endif

assert(SelfTest());
Expand Down Expand Up @@ -832,6 +883,14 @@ void SHA256D64(uint8_t *out, const uint8_t *in, size_t blocks) {
blocks -= 4;
}
}
if (TransformD64_2way) {
while (blocks >= 2) {
TransformD64_2way(out, in);
out += 64;
in += 128;
blocks -= 2;
}
}
while (blocks) {
TransformD64(out, in);
out += 32;
Expand Down
10 changes: 3 additions & 7 deletions src/crypto/sha256_avx2.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
#ifdef ENABLE_AVX2

#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
#include <stdint.h>

#include "crypto/common.h"
#include "crypto/sha256.h"
Expand Down Expand Up @@ -70,7 +66,7 @@ namespace {
}

/** One round of SHA-256. */
void inline __attribute__((always_inline))
inline void __attribute__((always_inline))
Round(__m256i a, __m256i b, __m256i c, __m256i &d, __m256i e, __m256i f,
__m256i g, __m256i &h, __m256i k) {
__m256i t1 = Add(h, Sigma1(e), Ch(e, f, g), k);
Expand All @@ -91,7 +87,7 @@ namespace {
0x04050607UL, 0x00010203UL));
}

void inline Write8(uint8_t *out, int offset, __m256i v) {
inline void Write8(uint8_t *out, int offset, __m256i v) {
v = _mm256_shuffle_epi8(
v, _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL,
0x00010203UL, 0x0C0D0E0FUL, 0x08090A0BUL,
Expand Down
Loading

0 comments on commit b575d39

Please sign in to comment.