Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
herumi committed Jul 22, 2024
2 parents cddb4e3 + c5430da commit 94fd641
Show file tree
Hide file tree
Showing 10 changed files with 1,213 additions and 570 deletions.
17 changes: 14 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,13 @@ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") # Win64
target_link_libraries(mcl_st PUBLIC ${BINT_X64_OBJ})
add_dependencies(mcl_st gen_bint-x64-win.obj)
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT APPLE)
target_sources(mcl PRIVATE src/asm/bint-x64-amd64.S)
target_sources(mcl_st PRIVATE src/asm/bint-x64-amd64.S)
if (CMAKE_SYSTEM_NAME STREQUAL "MSYS")
target_sources(mcl PRIVATE src/asm/bint-x64-mingw.S)
target_sources(mcl_st PRIVATE src/asm/bint-x64-mingw.S)
else()
target_sources(mcl PRIVATE src/asm/bint-x64-amd64.S)
target_sources(mcl_st PRIVATE src/asm/bint-x64-amd64.S)
endif()
else()
if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
message(FATAL_ERROR "requiring clang++. cmake -DCMAKE_CXX_COMPILER=clang++ ..")
Expand All @@ -185,9 +190,15 @@ endif()

# use generated asm or compile base${BIT}.ll by clang

if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "MSYS")
set(X86_64_LINUX TRUE)
else()
set(X86_64_LINUX FALSE)
endif()

if(CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") # Win64
# skip
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT APPLE)
elseif(X86_64_LINUX)
target_compile_definitions(mcl PUBLIC MCL_USE_LLVM=1)
target_compile_definitions(mcl_st PUBLIC MCL_USE_LLVM=1)
target_sources(mcl PRIVATE src/asm/x86-64.S)
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ src/fp.cpp: include/mcl/bint_proto.hpp
LIB_OBJ+=$(BINT_OBJ)
ifeq ($(CPU)-$(MCL_BINT_ASM_X64),x86-64-1)
ifeq ($(OS),mingw64)
BINT_ASM_X64_BASENAME=bint-x64
BINT_ASM_X64_BASENAME=bint-x64-mingw
$(BINT_OBJ): src/asm/$(BINT_ASM_X64_BASENAME).S
$(PRE)$(CXX) $(CFLAGS) -c $< -o $@

Expand Down
2 changes: 1 addition & 1 deletion include/mcl/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ struct RoundUpT {
const size_t maxUnitSize = (MCL_MAX_BIT_SIZE + UnitBitSize - 1) / UnitBitSize;
#define MCL_MAX_UNIT_SIZE MCL_ROUNDUP(MCL_MAX_BIT_SIZE, MCL_UNIT_BIT_SIZE)

#ifdef _WIN32
#ifdef _MSC_VER
#ifdef MCL_DLL_EXPORT
#define MCL_DLL_API __declspec(dllexport)
#else
Expand Down
12 changes: 6 additions & 6 deletions include/mcl/ec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2188,12 +2188,12 @@ class EcT : public fp::Serializable<EcT<_Fp, _Fr> > {
// xVec[i] *= yVec[i]
static void mulEach(EcT *xVec, const EcT::Fr *yVec, size_t n)
{
if (mulEachOpti && n >= 8) {
size_t n8 = n & ~size_t(7);
mulEachOpti((Unit*)xVec, yVec[0].getUnit(), n8);
xVec += n8;
yVec += n8;
n -= n8;
if (mulEachOpti && n >= 16) {
size_t n16 = n & ~size_t(16-1);
mulEachOpti((Unit*)xVec, yVec[0].getUnit(), n16);
xVec += n16;
yVec += n16;
n -= n16;
}
for (size_t i = 0; i < n; i++) {
xVec[i] *= yVec[i];
Expand Down
2 changes: 1 addition & 1 deletion include/mcl/op.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

namespace mcl {

static const int version = 0x194; /* 0xABC = A.BC */
static const int version = 0x195; /* 0xABC = A.BC */

/*
specifies available string format mode for X::setIoMode()
Expand Down
5 changes: 3 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ mcl is a library for pairing-based cryptography,
which supports the optimal Ate pairing over BN curves and BLS12-381 curves.

# News
- mulEach with AVX-512 IFMA is 2.5 times faster than G1::mul on BLS12-381
- mulEach with AVX-512 IFMA has been slightly improved and is now 2.8 times faster than G1::mul on BLS12-381.
- mulVec (multi scalar multiplication) with AVX-512 IFMA is 1.4 times faster on Xeon w9-3495X
- a little performance improvement of G1::mulVec of BLS12-381
- improve performance of Fr::inv on M1 mac
Expand Down Expand Up @@ -104,13 +104,14 @@ cmake ..
make
```

For the other platform, clang++ is required.
For other platforms (including MinGW), clang++ is required.
```
mkdir build
cd build
cmake .. -DCMAKE_CXX_COMPILER=clang++
make
```
Use `clang++` instead of gcc on MinGW.

For Visual Studio, (REMARK : It is not maintained; use the vcxproj file.)
```
Expand Down
267 changes: 266 additions & 1 deletion src/avx512.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,29 @@

typedef __m512i Vec;
typedef __mmask8 Vmask;
// Number of Vec lanes packed into one VecA; the wrappers below assume this value.
static const size_t vN = 2; // fixed
// Aggregate of vN AVX-512 vectors, processed lane-by-lane by the VecA overloads below.
// NOTE(review): member `v` is accessed directly throughout this header — do not rename.
struct VecA {
Vec v[vN];
};

inline Vec vzero()
// Aggregate of vN AVX-512 8-bit masks, one per Vec lane of a VecA.
struct VmaskA {
Vmask v[vN];
};

// Returns an all-zero vector. V selects the width: Vec (default) or VecA
// (see the specialization below).
template<class V=Vec>
inline V vzero()
{
return _mm512_setzero_epi32();
}

// Specialization: zero every Vec lane of a VecA via the scalar vzero().
template<>
inline VecA vzero()
{
VecA r;
for (size_t i = 0; i < vN; i++) r.v[i] = vzero();
return r;
}

inline Vec vone()
{
return _mm512_set1_epi32(1);
Expand Down Expand Up @@ -148,6 +165,254 @@ inline Vec vselect(const Vmask& c, const Vec& a, const Vec& b)
return vpandq(c, a, a, b);
}

/////

// Lane-wise low-half IFMA multiply-add: out.v[j] = vmulL(a.v[j], b.v[j], c.v[j]).
inline VecA vmulL(const VecA& a, const VecA& b, const VecA& c)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vmulL(a.v[j], b.v[j], c.v[j]);
	}
	return out;
}

// Lane-wise low-half multiply-add with a single Vec addend broadcast to both lanes.
inline VecA vmulL(const VecA& a, const VecA& b, const Vec& c = vzero())
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vmulL(a.v[j], b.v[j], c);
	}
	return out;
}

// Lane-wise low-half multiply-add with the first operand broadcast to both lanes.
inline VecA vmulL(const Vec& a, const VecA& b, const VecA& c)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vmulL(a, b.v[j], c.v[j]);
	}
	return out;
}

// Lane-wise low-half multiply-add with the second operand broadcast to both lanes.
inline VecA vmulL(const VecA& a, const Vec& b, const VecA& c)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vmulL(a.v[j], b, c.v[j]);
	}
	return out;
}

// Lane-wise low-half multiply with the second operand broadcast to both lanes.
inline VecA vmulL(const VecA& a, const Vec& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vmulL(a.v[j], b);
	}
	return out;
}

// Lane-wise high-half IFMA multiply-add: out.v[j] = vmulH(a.v[j], b.v[j], c.v[j]).
inline VecA vmulH(const VecA& a, const VecA& b, const VecA& c)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vmulH(a.v[j], b.v[j], c.v[j]);
	}
	return out;
}

// Lane-wise high-half multiply-add with a single Vec addend broadcast to both lanes.
inline VecA vmulH(const VecA& a, const VecA& b, const Vec& c = vzero())
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vmulH(a.v[j], b.v[j], c);
	}
	return out;
}

// Lane-wise high-half multiply with the first operand broadcast to both lanes.
inline VecA vmulH(const Vec& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vmulH(a, b.v[j]);
	}
	return out;
}

// Lane-wise 64-bit addition of two VecA values.
inline VecA vpaddq(const VecA& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpaddq(a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise 64-bit addition with the second operand broadcast to both lanes.
inline VecA vpaddq(const VecA& a, const Vec& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpaddq(a.v[j], b);
	}
	return out;
}

// Lane-wise masked 64-bit addition (per-lane mask selects which elements are added).
inline VecA vpaddq(const VmaskA& v, const VecA& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpaddq(v.v[j], a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise masked 64-bit addition with the second operand broadcast to both lanes.
inline VecA vpaddq(const VmaskA& v, const VecA& a, const Vec& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpaddq(v.v[j], a.v[j], b);
	}
	return out;
}

// Lane-wise 64-bit subtraction of two VecA values.
inline VecA vpsubq(const VecA& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpsubq(a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise 64-bit subtraction with the subtrahend broadcast to both lanes.
inline VecA vpsubq(const VecA& a, const Vec& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpsubq(a.v[j], b);
	}
	return out;
}

// Lane-wise 64-bit subtraction with the minuend broadcast to both lanes.
inline VecA vpsubq(const Vec& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpsubq(a, b.v[j]);
	}
	return out;
}

// Lane-wise masked 64-bit subtraction (per-lane mask selects which elements are subtracted).
inline VecA vpsubq(const VmaskA& v, const VecA& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpsubq(v.v[j], a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise logical right shift of 64-bit elements by b bits.
inline VecA vpsrlq(const VecA& a, size_t b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpsrlq(a.v[j], b);
	}
	return out;
}

// Lane-wise logical left shift of 64-bit elements by b bits.
inline VecA vpsllq(const VecA& a, size_t b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpsllq(a.v[j], b);
	}
	return out;
}

// Lane-wise bitwise AND of two VecA values.
inline VecA vpandq(const VecA& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpandq(a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise bitwise AND with the second operand broadcast to both lanes.
inline VecA vpandq(const VecA& a, const Vec& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpandq(a.v[j], b);
	}
	return out;
}

// Lane-wise bitwise OR of two VecA values.
inline VecA vporq(const VecA& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vporq(a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise bitwise XOR of two VecA values.
inline VecA vpxorq(const VecA& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpxorq(a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise 64-bit gather from base using 64-bit indices.
// The scale template parameter of the scalar version is intentionally not
// exposed here (see the commented-out declaration in the original).
//template<int scale=8>
inline VecA vpgatherqq(const VecA& idx, const void *base)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpgatherqq(idx.v[j], base);
	}
	return out;
}

// Lane-wise 64-bit scatter to base using 64-bit indices.
inline void vpscatterqq(void *base, const VecA& idx, const VecA& v)
{
	for (size_t j = 0; j < vN; j++) {
		vpscatterqq(base, idx.v[j], v.v[j]);
	}
}

// Lane-wise two-table permute: returns [H:L][idx] per lane.
inline VecA vperm2tq(const VecA& L, const VecA& idx, const VecA& H)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vperm2tq(L.v[j], idx.v[j], H.v[j]);
	}
	return out;
}

// Lane-wise 64-bit equality comparison; yields one mask per lane.
inline VmaskA vpcmpeqq(const VecA& a, const VecA& b)
{
	VmaskA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpcmpeqq(a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise 64-bit equality against a single Vec broadcast to both lanes.
inline VmaskA vpcmpeqq(const VecA& a, const Vec& b)
{
	VmaskA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpcmpeqq(a.v[j], b);
	}
	return out;
}

// Lane-wise 64-bit inequality comparison; yields one mask per lane.
inline VmaskA vpcmpneqq(const VecA& a, const VecA& b)
{
	VmaskA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpcmpneqq(a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise 64-bit inequality against a single Vec broadcast to both lanes.
inline VmaskA vpcmpneqq(const VecA& a, const Vec& b)
{
	VmaskA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpcmpneqq(a.v[j], b);
	}
	return out;
}

// Lane-wise 64-bit greater-than comparison; yields one mask per lane.
inline VmaskA vpcmpgtq(const VecA& a, const VecA& b)
{
	VmaskA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpcmpgtq(a.v[j], b.v[j]);
	}
	return out;
}

// Lane-wise 64-bit greater-than against a single Vec broadcast to both lanes.
inline VmaskA vpcmpgtq(const VecA& a, const Vec& b)
{
	VmaskA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpcmpgtq(a.v[j], b);
	}
	return out;
}
// Lane-wise AND of two VmaskA mask aggregates.
inline VmaskA kandb(const VmaskA& a, const VmaskA& b)
{
	VmaskA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = kandb(a.v[j], b.v[j]);
	}
	return out;
}

/*
inline VecA vpbroadcastq(int64_t a)
{
return _mm512_set1_epi64(a);
}
*/

// Lane-wise masked AND-with-fallback: per element, c ? (a & b) : d.
inline VecA vpandq(const VmaskA& c, const VecA& a, const VecA& b, const VecA& d)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vpandq(c.v[j], a.v[j], b.v[j], d.v[j]);
	}
	return out;
}

// Lane-wise select: per element, c ? a : b.
inline VecA vselect(const VmaskA& c, const VecA& a, const VecA& b)
{
	VecA out;
	for (size_t j = 0; j < vN; j++) {
		out.v[j] = vselect(c.v[j], a.v[j], b.v[j]);
	}
	return out;
}

#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
Expand Down
Loading

0 comments on commit 94fd641

Please sign in to comment.