From c82203076da6d5841fc5d71bd088442a917f43cc Mon Sep 17 00:00:00 2001
From: Zach Wegner
Date: Wed, 8 Jan 2020 18:35:35 -0600
Subject: [PATCH] Initial commit of zp7

---
 README.md |  44 ++++++++++++
 test.c    | 106 ++++++++++++++++++++++++++++
 zp7.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 356 insertions(+)
 create mode 100644 README.md
 create mode 100644 test.c
 create mode 100644 zp7.c

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..482003a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+# ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
+
+This is a fast branchless replacement for the PEXT/PDEP instructions.
+If you don't know what those are, this probably won't make much sense.
+
+These instructions are very fast on Intel chips that support them
+(3 cycles latency), but have a much slower implementation on AMD.
+This code will be much slower than the native instructions on Intel
+chips, but faster on AMD chips, and generally faster than a naive
+loop (for all but the most trivial cases).
+
+A detailed description of the algorithm used is in `zp7.c`.
+
+# Usage
+This is distributed as a single C file, `zp7.c`.
+These two functions are drop-in replacements for `_pext_u64` and `_pdep_u64`:
+```c
+uint64_t zp7_pext_64(uint64_t a, uint64_t mask);
+uint64_t zp7_pdep_64(uint64_t a, uint64_t mask);
+```
+
+There are also variants for precomputed masks, in case the same mask is used
+across multiple calls (whether for PEXT or PDEP--the masks are the same for both).
+In this case, a `zp7_masks_64_t` struct is created from the input mask using the
+`zp7_ppp_64` function, and passed to the `zp7_*_pre_64` variants:
+```c
+zp7_masks_64_t zp7_ppp_64(uint64_t mask);
+uint64_t zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks);
+uint64_t zp7_pdep_pre_64(uint64_t a, const zp7_masks_64_t *masks);
+```
+
+Two #defines can change the instructions used, depending on the target CPU:
+* `HAS_CLMUL`: whether the processor has the
+[CLMUL instruction set](https://en.wikipedia.org/wiki/CLMUL_instruction_set), which
+is on most x86 CPUs since ~2010. Using CLMUL gives a fairly significant
+speedup and code size reduction.
+* `HAS_BZHI`: whether the processor has BZHI, which was introduced in the same [BMI2
+instructions](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets) as
+PEXT/PDEP. This is only used once for PDEP, so it matters much less than CLMUL.
+
+This code is hardcoded to operate on 64 bits. It could easily be adapted
+for 32 bits by changing `N_BITS` to 5 and replacing `uint64_t` with `uint32_t`.
+This will be slightly faster and will save some memory for pre-calculated
+masks.
diff --git a/test.c b/test.c
new file mode 100644
index 0000000..2173791
--- /dev/null
+++ b/test.c
@@ -0,0 +1,106 @@
+// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
+//
+// Copyright (c) 2020 Zach Wegner
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <immintrin.h>
+
+#define HAS_CLMUL
+#define HAS_BZHI
+
+#include "zp7.c"
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+#define N_TESTS (1 << 20)
+
+// PRNG modified from the public domain RKISS by Bob Jenkins. See:
+// http://www.burtleburtle.net/bob/rand/smallprng.html
+
+typedef struct {
+    uint64_t a, b, c, d;
+} rand_ctx_t;
+
+uint64_t rotate_left(uint64_t x, uint64_t k) {
+    return (x << k) | (x >> (64 - k));
+}
+
+uint64_t rand_next(rand_ctx_t *x) {
+    uint64_t e = x->a - rotate_left(x->b, 7);
+    x->a = x->b ^ rotate_left(x->c, 13);
+    x->b = x->c + rotate_left(x->d, 37);
+    x->c = x->d + e;
+    x->d = e + x->a;
+    return x->d;
+}
+
+void rand_init(rand_ctx_t *x) {
+    x->a = 0x89ABCDEF01234567ULL, x->b = x->c = x->d = 0xFEDCBA9876543210ULL;
+    for (int i = 0; i < 1000; i++)
+        (void)rand_next(x);
+}
+
+int main() {
+    rand_ctx_t r[1];
+    rand_init(r);
+    uint64_t tests = 0;
+
+    for (int test = 0; test < N_TESTS; test++) {
+        // Create four masks with low/medium/high sparsity
+        uint64_t mask = rand_next(r);
+        uint64_t mask_2 = mask | rand_next(r) | rand_next(r);
+        uint64_t masks[] = { mask, ~mask, mask_2, ~mask_2 };
+
+        // For each input mask, test 32 random input values
+        for (int i = 0; i < ARRAY_SIZE(masks); i++) {
+            uint64_t m = masks[i];
+            for (int j = 0; j < 32; j++) {
+                uint64_t input = rand_next(r);
+
+                // Test PEXT
+                uint64_t e_1 = _pext_u64(input, m);
+                uint64_t e_2 = zp7_pext_64(input, m);
+                if (e_1 != e_2) {
+                    printf("FAIL PEXT!\n");
+                    printf("%016llx %016llx %016llx %016llx\n",
+                            m, input, e_1, e_2);
+                    exit(1);
+                }
+                tests++;
+
+                // Test PDEP
+                uint64_t d_1 = _pdep_u64(input, m);
+                uint64_t d_2 = zp7_pdep_64(input, m);
+                if (d_1 != d_2) {
+                    printf("FAIL PDEP!\n");
+                    printf("%016llx %016llx %016llx %016llx\n",
+                            m, input, d_1, d_2);
+                    exit(1);
+                }
+                tests++;
+            }
+        }
+    }
+    printf("Passed %llu tests.\n", tests);
+    return 0;
+}
diff --git a/zp7.c b/zp7.c
new file mode 100644
index 0000000..d3e1975
--- /dev/null
+++ b/zp7.c
@@ -0,0 +1,206 @@
+// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
+//
+// Copyright (c) 2020 Zach Wegner
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <stdint.h>
+
+#if defined(HAS_CLMUL) || defined(HAS_BZHI)
+#   include <immintrin.h>
+#endif
+
+// ZP7: branchless PEXT/PDEP replacement code for non-Intel processors
+//
+// The PEXT/PDEP instructions are pretty cool, with various (usually arcane)
+// uses, behaving like bitwise gather/scatter instructions. They were introduced
+// by Intel with the BMI2 instructions on Haswell.
+//
+// AMD processors implement these instructions, but very slowly. PEXT/PDEP can
+// take from 18 to ~300 cycles, depending on the input mask. See this table:
+// https://mobile.twitter.com/InstLatX64/status/1209095219087585281
+// Other processors don't have PEXT/PDEP at all. This code is a polyfill for
+// these processors. It's much slower than the raw instructions on Intel chips
+// (which are 3L1T), but should be faster than AMD's implementation.
+//
+// Description of the algorithm
+// ====
+//
+// This code uses a "parallel prefix popcount" technique (hereafter PPP for
+// brevity). What this means is that we determine, for every bit in the input
+// mask, how many bits below it are set. Or rather, aren't set--we need to get
+// a count of how many bits each input bit should be shifted to get to its final
+// position, that is, the difference between the bit-index of its destination
+// and its original bit-index. This is the same as the count of unset bits in
+// the mask below each input bit.
+//
+// The dumb way to get this PPP would be to create a 64-element array in a loop,
+// but we want to do this in a bit-parallel fashion. So we store the counts
+// "vertically" across six 64-bit values: one 64-bit value holds bit 0 of each
+// of the 64 counts, another holds bit 1, etc. We can compute these counts
+// fairly easily using a parallel prefix XOR (XOR is equivalent to a 1-bit
+// adder that wraps around and ignores carrying). Using parallel prefix XOR as
+// a 1-bit adder, we can build an n-bit adder by shifting the result left by
+// one and ANDing with the input bits: this computes the carry by seeing where
+// an input bit causes the 1-bit sum to overflow from 1 to 0. The shift left
+// is needed anyways, because we want the PPP values to represent population
+// counts *below* each bit, not including the bit itself.
+//
+// For processors with the CLMUL instructions (most x86 CPUs since ~2010), we
+// can do the parallel prefix XOR and left shift in one instruction, by
+// doing a carry-less multiply by -2. This is enabled with the HAS_CLMUL define.
+//
+// Anyways, once we have these six 64-bit values of the PPP, we can use each
+// PPP bit to shift input bits by a power of two. That is, input bits that are
+// in the bit-0 PPP mask are shifted by 2**0==1, bits in the bit-1 mask get
+// shifted by 2, and so on, for shifts by 4, 8, 16, and 32 bits. Out of these
+// six shifts, any shift value between 0 and 63 can be composed.
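+//
+// As a small worked example of the PPP idea: for an 8-bit mask 0b11001010,
+// the set bits at positions 1, 3, 6, and 7 have 1, 2, 4, and 4 zero bits
+// below them respectively, so PEXT has to shift them right by exactly those
+// amounts to pack them into bits 0-3. Writing the counts in binary (001, 010,
+// 100, 100) shows which power-of-two shift steps each bit takes part in: the
+// bit-0 PPP mask marks the input bit that moves by 1, the bit-1 mask the bit
+// that moves by 2, and the bit-2 mask the two bits that move by 4.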
+//
+// For PEXT, we have to perform each shift in increasing order (1, 2, ...32) so
+// that input bits don't overlap in the intermediate results. PDEP is the
+// opposite: the 32-bit shift needs to happen first to make room for the smaller
+// shifts. There's also a small complication for PDEP in that the PPP values
+// line up where the input bits *end*, rather than where the input bits start
+// like for PEXT. This means each bit mask needs to be shifted backwards before
+// ANDing with the input bits.
+//
+// For both PEXT/PDEP the input bits need to be pre-masked so that only the
+// relevant bits are being shifted around. For PEXT, this is a simple AND
+// (input &= mask), but for PDEP we have to mask out everything but the low N
+// bits, where N is the population count of the mask.
+
+#define N_BITS (6)
+
+typedef struct {
+    uint64_t mask;
+    uint64_t ppp_bit[N_BITS];
+} zp7_masks_64_t;
+
+#ifndef HAS_CLMUL
+// If we don't have access to the CLMUL instruction, emulate it with
+// shifts and XORs
+static inline uint64_t prefix_sum(uint64_t x) {
+    for (int i = 0; i < N_BITS; i++)
+        x ^= x << (1 << i);
+    return x;
+}
+#endif
+
+// Parallel-prefix-popcount. This is used by both the PEXT/PDEP polyfills.
+// It can also be called separately and cached, if the mask values will be used
+// more than once (these can be shared across PEXT and PDEP calls if they use
+// the same masks).
+zp7_masks_64_t zp7_ppp_64(uint64_t mask) {
+    zp7_masks_64_t r;
+    r.mask = mask;
+
+    // Count *unset* bits
+    mask = ~mask;
+
+#ifdef HAS_CLMUL
+    // Move the mask and -2 to XMM registers for CLMUL
+    __m128i m = _mm_cvtsi64_si128(mask);
+    __m128i neg_2 = _mm_cvtsi64_si128(-2LL);
+    for (int i = 0; i < N_BITS; i++) {
+        // Do a 1-bit parallel prefix popcount, shifted left by 1,
+        // in one carry-less multiply by -2.
+        __m128i bit = _mm_clmulepi64_si128(m, neg_2, 0);
+        r.ppp_bit[i] = _mm_cvtsi128_si64(bit);
+
+        // Get the carry bit of the 1-bit parallel prefix popcount. On
+        // the next iteration, we will sum this bit to get the next mask
+        m = _mm_and_si128(m, bit);
+    }
+#else
+    for (int i = 0; i < N_BITS; i++) {
+        // Do a 1-bit parallel prefix popcount, shifted left by 1
+        uint64_t bit = prefix_sum(mask) << 1;
+        r.ppp_bit[i] = bit;
+
+        // Get the carry bit of the 1-bit parallel prefix popcount. On
+        // the next iteration, we will sum this bit to get the next mask
+        mask &= bit;
+    }
+#endif
+
+    return r;
+}
+
+// PEXT
+
+uint64_t zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks) {
+    // Mask only the bits that are set in the input mask. Otherwise they collide
+    // with input bits and screw everything up
+    a &= masks->mask;
+
+    // For each bit in the PPP, shift right only those bits that are set in
+    // that bit's mask
+    for (int i = 0; i < N_BITS; i++) {
+        uint64_t shift = 1 << i;
+        uint64_t bit = masks->ppp_bit[i];
+        // Shift only the input bits that are set in the mask
+        a = (a & ~bit) | ((a & bit) >> shift);
+    }
+    return a;
+}
+
+uint64_t zp7_pext_64(uint64_t a, uint64_t mask) {
+    zp7_masks_64_t masks = zp7_ppp_64(mask);
+    return zp7_pext_pre_64(a, &masks);
+}
+
+// PDEP
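+//
+// To mirror the PEXT illustration above with the same 8-bit mask 0b11001010:
+// PDEP moves input bits 0-3 left by 1, 2, 4, and 4 so they land on the mask's
+// set bits at positions 1, 3, 6, and 7; the larger shifts are applied first so
+// the intermediate results don't collide.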
+
+uint64_t zp7_pdep_pre_64(uint64_t a, const zp7_masks_64_t *masks) {
+    // Mask just the bits that will end up in the final result--the low P bits,
+    // where P is the popcount of the mask. The other bits would collide.
+    // We need special handling for the mask==-1 case: because 64-bit shifts are
+    // implicitly modulo 64 on x86 (and (uint64_t)1 << 64 is technically
+    // undefined behavior in C), the regular "a &= (1 << pop) - 1" doesn't
+    // work: (1 << popcnt(-1)) - 1 == (1 << 64) - 1 == (1 << 0) - 1 == 0, but
+    // this should be -1. The BZHI instruction (introduced with BMI2, the same
+    // instructions as PEXT/PDEP) handles this properly, but isn't portable.
+    uint64_t popcnt = _popcnt64(masks->mask);
+#ifdef HAS_BZHI
+    a = _bzhi_u64(a, popcnt);
+#else
+    // If we don't have BZHI, use a portable workaround. Since (mask == -1)
+    // is equivalent to popcnt(mask) >> 6, use that to mask out the 1 << 64
+    // case.
+    uint64_t pop_mask = (1ULL << popcnt) & ~(popcnt >> 6);
+    a &= pop_mask - 1;
+#endif
+
+    // For each bit in the PPP, shift left only those bits that are set in
+    // that bit's mask. We do this in the opposite order compared to PEXT
+    for (int i = N_BITS - 1; i >= 0; i--) {
+        uint64_t shift = 1 << i;
+        uint64_t bit = masks->ppp_bit[i] >> shift;
+        // Micro-optimization: the bits that get shifted and those that don't
+        // will always be disjoint, so we can add them instead of ORing them.
+        // The shifts of 1 and 2 can thus merge with the adds to become LEAs.
+        a = (a & ~bit) + ((a & bit) << shift);
+    }
+    return a;
+}
+
+uint64_t zp7_pdep_64(uint64_t a, uint64_t mask) {
+    zp7_masks_64_t masks = zp7_ppp_64(mask);
+    return zp7_pdep_pre_64(a, &masks);
+}
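
For reference, a minimal caller of the API added by this patch might look like the
sketch below. The file name `example.c` is made up, and the `HAS_CLMUL` define
assumes a CPU with carry-less multiply (build with the matching flags, e.g.
`-march=native` or `-mpclmul -mpopcnt`); drop the define to use the pure shift/XOR
fallback path instead.

```c
// example.c -- minimal usage sketch for the zp7 polyfill (illustrative only)
#include <stdint.h>
#include <stdio.h>

#define HAS_CLMUL       // assumes CLMUL support; remove for the generic path
#include "zp7.c"        // same single-file usage as test.c above

int main(void) {
    uint64_t mask = 0x00FF00FF00FF00FFULL;
    uint64_t val  = 0x123456789ABCDEF0ULL;

    // Drop-in replacements for _pext_u64/_pdep_u64
    printf("pext: %016llx\n", (unsigned long long)zp7_pext_64(val, mask));
    printf("pdep: %016llx\n", (unsigned long long)zp7_pdep_64(val, mask));

    // Precompute the per-mask data once when the same mask is reused
    zp7_masks_64_t pre = zp7_ppp_64(mask);
    printf("pext_pre: %016llx\n", (unsigned long long)zp7_pext_pre_64(val, &pre));
    printf("pdep_pre: %016llx\n", (unsigned long long)zp7_pdep_pre_64(val, &pre));
    return 0;
}
```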