Skip to content

Commit

Permalink
With cleaner lazymod127
Browse files Browse the repository at this point in the history
  • Loading branch information
lemire committed Jan 19, 2017
1 parent 7b0d46b commit 5e6abcf
Showing 1 changed file with 22 additions and 32 deletions.
54 changes: 22 additions & 32 deletions src/clhash.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,24 @@
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
#endif



// Shift the full 128-bit value a left by one bit.
// SSE has no whole-register bit shift, so we shift each 64-bit lane,
// then recover the bit that falls off the low lane's top and OR it
// into the high lane (moved up by one 64-bit lane, i.e. 8 bytes).
static inline __m128i leftshift1(__m128i a) {
    __m128i carry = _mm_slli_si128(_mm_srli_epi64(a, 63), sizeof(uint64_t));
    return _mm_or_si128(_mm_slli_epi64(a, 1), carry);
}

// Shift the full 128-bit value a left by two bits.
// Same trick as leftshift1: per-lane shift, plus the two bits that
// fall off the low lane relocated into the high lane (8 bytes up).
static inline __m128i leftshift2(__m128i a) {
    __m128i carry = _mm_slli_si128(_mm_srli_epi64(a, 62), sizeof(uint64_t));
    return _mm_or_si128(_mm_slli_epi64(a, 2), carry);
}

//////////////////
// compute the "lazy" modulo with 2^127 + 2 + 1, actually we compute the
// modulo with (2^128 + 4 + 2) = 2 * (2^127 + 2 + 1) ,
Expand All @@ -31,38 +49,10 @@ static inline __m128i lazymod127(__m128i Alow, __m128i Ahigh) {
// This is correct because the two highest bits of Ahigh are
// assumed to be zero.
///////////////////////////////////////////////////
// We want to take Ahigh and compute ( Ahigh <<1 ) XOR ( Ahigh <<2 )
// Except that there is no way to shift an entire XMM register by 1 or 2 bits using a single instruction.
// So how do you compute Ahigh <<1 using as few instructions as possible?
//
// First you do _mm_slli_epi64(Ahigh,1). This is *almost* correct... except that
// the 64th bit is not shifted in 65th position.
// Well, ok, but we can compute Ahigh >> 8, this given by _mm_srli_si128(Ahigh,1)
// _mm_srli_si128(Ahigh,1) has the same bits as Ahigh (except that we lose the lowest 8)
// but at different positions.
// So let us shift left the result again...
// _mm_slli_epi64(_mm_srli_si128(Ahigh,1),1)
// If you keep track, this is "almost" equivalent to A >> 7, except that the 72th bit
// from A is lost.
// From something that is almost A >>7, we can get back something that is almost A << 1
// by shifting left by 8 bits...
// _mm_slli_si128(_mm_slli_epi64(_mm_srli_si128(Ahigh,1),1),1)
// So this is a shift left by 1, except that the 72th bit is lost along with the lowest 8 bits.
// We have that _mm_slli_epi64(Ahigh,1) is a shift let by 1 except that the 64th bit
// is lost. We can combine the two to get the desired result (just OR them).
// The algorithm below is just an optimized version of this where we do both shifts (by 1 and 2)
// at the same time and XOR the result.
//
__m128i shifteddownAhigh = _mm_srli_si128(Ahigh,1);
__m128i s1 = _mm_slli_epi64(Ahigh,1);
__m128i s2 = _mm_slli_epi64(Ahigh,2);
__m128i sd1 = _mm_slli_si128(_mm_slli_epi64(shifteddownAhigh,1),1);
__m128i sd2 = _mm_slli_si128(_mm_slli_epi64(shifteddownAhigh,2),1);
s1 = _mm_or_si128(s1,sd1);
s2 = _mm_or_si128(s2,sd2);
__m128i reduced = _mm_xor_si128(s1,s2);
// combining results
__m128i final = _mm_xor_si128(Alow,reduced);
// credit for simplified implementation : Jan Wassenberg
__m128i shift1 = leftshift1(Ahigh);
__m128i shift2 = leftshift2(Ahigh);
__m128i final = _mm_xor_si128(_mm_xor_si128(Alow, shift1),shift2);
return final;
}

Expand Down

0 comments on commit 5e6abcf

Please sign in to comment.