From c82203076da6d5841fc5d71bd088442a917f43cc Mon Sep 17 00:00:00 2001
From: Zach Wegner
Date: Wed, 8 Jan 2020 18:35:35 -0600
Subject: [PATCH] Initial commit of zp7

---
 README.md |  44 ++++++++++++
 test.c    | 106 ++++++++++++++++++++++++++++
 zp7.c     | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 356 insertions(+)
 create mode 100644 README.md
 create mode 100644 test.c
 create mode 100644 zp7.c

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..482003a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+# ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
+
+This is a fast branchless replacement for the PEXT/PDEP instructions.
+If you don't know what those are, this probably won't make much sense.
+
+These instructions are very fast on Intel chips that support them
+(3 cycles latency), but have a much slower implementation on AMD.
+This code will be much slower than the native instructions on Intel
+chips, but faster on AMD chips, and generally faster than a naive
+loop (for all but the most trivial cases).
+
+A detailed description of the algorithm used is in `zp7.c`.
+
+# Usage
+This is distributed as a single C file, `zp7.c`.
+These two functions are drop-in replacements for `_pext_u64` and `_pdep_u64`:
+```c
+uint64_t zp7_pext_64(uint64_t a, uint64_t mask);
+uint64_t zp7_pdep_64(uint64_t a, uint64_t mask);
+```
+
+There are also variants for precomputed masks, in case the same mask is used
+across multiple calls (whether for PEXT or PDEP--the masks are the same for both).
+In this case, a `zp7_masks_64_t` struct is created from the input mask using the
+`zp7_ppp_64` function, and passed to the `zp7_*_pre_64` variants:
+```c
+zp7_masks_64_t zp7_ppp_64(uint64_t mask);
+uint64_t zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks);
+uint64_t zp7_pdep_pre_64(uint64_t a, const zp7_masks_64_t *masks);
+```
+
+Two #defines can change the instructions used, depending on the target CPU:
+* `HAS_CLMUL`: whether the processor has the
+[CLMUL instruction set](https://en.wikipedia.org/wiki/CLMUL_instruction_set), which
+is on most x86 CPUs since ~2010. Using CLMUL gives a fairly significant
+speedup and code size reduction.
+* `HAS_BZHI`: whether the processor has BZHI, which was introduced in the same [BMI2
+instructions](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets) as
+PEXT/PDEP. This is only used once for PDEP, so it matters much less than CLMUL.
+
+This code is hardcoded to operate on 64 bits. It could easily be adapted
+for 32 bits by changing `N_BITS` to 5 and replacing `uint64_t` with `uint32_t`.
+This will be slightly faster and will save some memory for pre-calculated
+masks.
diff --git a/test.c b/test.c
new file mode 100644
index 0000000..2173791
--- /dev/null
+++ b/test.c
@@ -0,0 +1,106 @@
+// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
+//
+// Copyright (c) 2020 Zach Wegner
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <immintrin.h>
+
+#define HAS_CLMUL
+#define HAS_BZHI
+
+#include "zp7.c"
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+#define N_TESTS (1 << 20)
+
+// PRNG modified from the public domain RKISS by Bob Jenkins. See:
+// http://www.burtleburtle.net/bob/rand/smallprng.html
+
+typedef struct {
+    uint64_t a, b, c, d;
+} rand_ctx_t;
+
+uint64_t rotate_left(uint64_t x, uint64_t k) {
+    return (x << k) | (x >> (64 - k));
+}
+
+uint64_t rand_next(rand_ctx_t *x) {
+    uint64_t e = x->a - rotate_left(x->b, 7);
+    x->a = x->b ^ rotate_left(x->c, 13);
+    x->b = x->c + rotate_left(x->d, 37);
+    x->c = x->d + e;
+    x->d = e + x->a;
+    return x->d;
+}
+
+void rand_init(rand_ctx_t *x) {
+    x->a = 0x89ABCDEF01234567ULL, x->b = x->c = x->d = 0xFEDCBA9876543210ULL;
+    for (int i = 0; i < 1000; i++)
+        (void)rand_next(x);
+}
+
+int main() {
+    rand_ctx_t r[1];
+    rand_init(r);
+    uint64_t tests = 0;
+
+    for (int test = 0; test < N_TESTS; test++) {
+        // Create four masks with low/medium/high sparsity
+        uint64_t mask = rand_next(r);
+        uint64_t mask_2 = mask | rand_next(r) | rand_next(r);
+        uint64_t masks[] = { mask, ~mask, mask_2, ~mask_2 };
+
+        // For each input mask, test 32 random input values
+        for (int i = 0; i < ARRAY_SIZE(masks); i++) {
+            uint64_t m = masks[i];
+            for (int j = 0; j < 32; j++) {
+                uint64_t input = rand_next(r);
+
+                // Test PEXT
+                uint64_t e_1 = _pext_u64(input, m);
+                uint64_t e_2 = zp7_pext_64(input, m);
+                if (e_1 != e_2) {
+                    printf("FAIL PEXT!\n");
+                    printf("%016llx %016llx %016llx %016llx\n",
+                            m, input, e_1, e_2);
+                    exit(1);
+                }
+                tests++;
+
+                // Test PDEP
+                uint64_t d_1 = _pdep_u64(input, m);
+                uint64_t d_2 = zp7_pdep_64(input, m);
+                if (d_1 != d_2) {
+                    printf("FAIL PDEP!\n");
+                    printf("%016llx %016llx %016llx %016llx\n",
+                            m, input, d_1, d_2);
+                    exit(1);
+                }
+                tests++;
+            }
+        }
+    }
+    printf("Passed %llu tests.\n", tests);
+    return 0;
+}
diff --git a/zp7.c b/zp7.c
new file mode 100644
index 0000000..d3e1975
--- /dev/null
+++ b/zp7.c
@@ -0,0 +1,206 @@
+// ZP7 (Zach's Peppy Parallel-Prefix-Popcountin' PEXT/PDEP Polyfill)
+//
+// Copyright (c) 2020 Zach Wegner
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <stdint.h>
+
+#if defined(HAS_CLMUL) || defined(HAS_BZHI)
+#   include <immintrin.h>
+#endif
+
+// ZP7: branchless PEXT/PDEP replacement code for non-Intel processors
+//
+// The PEXT/PDEP instructions are pretty cool, with various (usually arcane)
+// uses, behaving like bitwise gather/scatter instructions. They were introduced
+// by Intel with the BMI2 instructions on Haswell.
+//
+// AMD processors implement these instructions, but very slowly. PEXT/PDEP can
+// take from 18 to ~300 cycles, depending on the input mask. See this table:
+// https://mobile.twitter.com/InstLatX64/status/1209095219087585281
+// Other processors don't have PEXT/PDEP at all. This code is a polyfill for
+// these processors. It's much slower than the raw instructions on Intel chips
+// (which are 3L1T), but should be faster than AMD's implementation.
+//
+// Description of the algorithm
+// ====
+//
+// This code uses a "parallel prefix popcount" technique (hereafter PPP for
+// brevity). What this means is that we determine, for every bit in the input
+// mask, how many bits below it are set. Or rather, aren't set--we need to get
+// a count of how many bits each input bit should be shifted to get to its final
+// position, that is, the difference between the bit-index of its destination
+// and its original bit-index. This is the same as the count of unset bits in
+// the mask below each input bit.
+//
+// The dumb way to get this PPP would be to create a 64-element array in a loop,
+// but we want to do this in a bit-parallel fashion. So we store the counts
+// "vertically" across six 64-bit values: one 64-bit value holds bit 0 of each
+// of the 64 counts, another holds bit 1, etc. We can compute these counts
+// fairly easily using a parallel prefix XOR (XOR is equivalent to a 1-bit
+// adder that wraps around and ignores carrying). Using parallel prefix XOR as
+// a 1-bit adder, we can build an n-bit adder by shifting the result left by
+// one and ANDing with the input bits: this computes the carry by seeing where
+// an input bit causes the 1-bit sum to overflow from 1 to 0. The shift left
+// is needed anyways, because we want the PPP values to represent population
+// counts *below* each bit, not including the bit itself.
+//
+// For processors with the CLMUL instructions (most x86 CPUs since ~2010), we
+// can do the parallel prefix XOR and left shift in one instruction, by
+// doing a carry-less multiply by -2. This is enabled with the HAS_CLMUL define.
+//
+// Anyways, once we have these six 64-bit values of the PPP, we can use each
+// PPP bit to shift input bits by a power of two. That is, input bits that are
+// in the bit-0 PPP mask are shifted by 2**0==1, bits in the bit-1 mask get
+// shifted by 2, and so on, for shifts by 4, 8, 16, and 32 bits. Out of these
+// six shifts, any shift value between 0 and 63 can be composed.
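+//
+// As a small worked example of the PPP idea: for an 8-bit mask 0b11001010,
+// the set bits at positions 1, 3, 6, and 7 have 1, 2, 4, and 4 zero bits
+// below them respectively, so PEXT has to shift them right by exactly those
+// amounts to pack them into bits 0-3. Writing the counts in binary (001, 010,
+// 100, 100) shows which power-of-two shift steps each bit takes part in: the
+// bit-0 PPP mask marks the input bit that moves by 1, the bit-1 mask the bit
+// that moves by 2, and the bit-2 mask the two bits that move by 4.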
+//
+// For PEXT, we have to perform each shift in increasing order (1, 2, ...32) so
+// that input bits don't overlap in the intermediate results. PDEP is the
+// opposite: the 32-bit shift needs to happen first to make room for the smaller
+// shifts. There's also a small complication for PDEP in that the PPP values
+// line up where the input bits *end*, rather than where the input bits start
+// like for PEXT. This means each bit mask needs to be shifted backwards before
+// ANDing with the input bits.
+//
+// For both PEXT/PDEP the input bits need to be pre-masked so that only the
+// relevant bits are being shifted around. For PEXT, this is a simple AND
+// (input &= mask), but for PDEP we have to mask out everything but the low N
+// bits, where N is the population count of the mask.
+
+#define N_BITS (6)
+
+typedef struct {
+    uint64_t mask;
+    uint64_t ppp_bit[N_BITS];
+} zp7_masks_64_t;
+
+#ifndef HAS_CLMUL
+// If we don't have access to the CLMUL instruction, emulate it with
+// shifts and XORs
+static inline uint64_t prefix_sum(uint64_t x) {
+    for (int i = 0; i < N_BITS; i++)
+        x ^= x << (1 << i);
+    return x;
+}
+#endif
+
+// Parallel-prefix-popcount. This is used by both the PEXT/PDEP polyfills.
+// It can also be called separately and cached, if the mask values will be used
+// more than once (these can be shared across PEXT and PDEP calls if they use
+// the same masks).
+zp7_masks_64_t zp7_ppp_64(uint64_t mask) {
+    zp7_masks_64_t r;
+    r.mask = mask;
+
+    // Count *unset* bits
+    mask = ~mask;
+
+#ifdef HAS_CLMUL
+    // Move the mask and -2 to XMM registers for CLMUL
+    __m128i m = _mm_cvtsi64_si128(mask);
+    __m128i neg_2 = _mm_cvtsi64_si128(-2LL);
+    for (int i = 0; i < N_BITS; i++) {
+        // Do a 1-bit parallel prefix popcount, shifted left by 1,
+        // in one carry-less multiply by -2.
+        __m128i bit = _mm_clmulepi64_si128(m, neg_2, 0);
+        r.ppp_bit[i] = _mm_cvtsi128_si64(bit);
+
+        // Get the carry bit of the 1-bit parallel prefix popcount. On
+        // the next iteration, we will sum this bit to get the next mask
+        m = _mm_and_si128(m, bit);
+    }
+#else
+    for (int i = 0; i < N_BITS; i++) {
+        // Do a 1-bit parallel prefix popcount, shifted left by 1
+        uint64_t bit = prefix_sum(mask) << 1;
+        r.ppp_bit[i] = bit;
+
+        // Get the carry bit of the 1-bit parallel prefix popcount. On
+        // the next iteration, we will sum this bit to get the next mask
+        mask &= bit;
+    }
+#endif
+
+    return r;
+}
+
+// PEXT
+
+uint64_t zp7_pext_pre_64(uint64_t a, const zp7_masks_64_t *masks) {
+    // Mask only the bits that are set in the input mask. Otherwise they collide
+    // with input bits and screw everything up
+    a &= masks->mask;
+
+    // For each bit in the PPP, shift right only those bits that are set in
+    // that bit's mask
+    for (int i = 0; i < N_BITS; i++) {
+        uint64_t shift = 1 << i;
+        uint64_t bit = masks->ppp_bit[i];
+        // Shift only the input bits that are set in the mask
+        a = (a & ~bit) | ((a & bit) >> shift);
+    }
+    return a;
+}
+
+uint64_t zp7_pext_64(uint64_t a, uint64_t mask) {
+    zp7_masks_64_t masks = zp7_ppp_64(mask);
+    return zp7_pext_pre_64(a, &masks);
+}
+
+// PDEP
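+//
+// To mirror the PEXT illustration above with the same 8-bit mask 0b11001010:
+// PDEP moves input bits 0-3 left by 1, 2, 4, and 4 so they land on the mask's
+// set bits at positions 1, 3, 6, and 7; the larger shifts are applied first so
+// the intermediate results don't collide.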
+
+uint64_t zp7_pdep_pre_64(uint64_t a, const zp7_masks_64_t *masks) {
+    // Mask just the bits that will end up in the final result--the low P bits,
+    // where P is the popcount of the mask. The other bits would collide.
+    // We need special handling for the mask==-1 case: because 64-bit shifts are
+    // implicitly modulo 64 on x86 (and (uint64_t)1 << 64 is technically
+    // undefined behavior in C), the regular "a &= (1 << pop) - 1" doesn't
+    // work: (1 << popcnt(-1)) - 1 == (1 << 64) - 1 == (1 << 0) - 1 == 0, but
+    // this should be -1. The BZHI instruction (introduced with BMI2, the same
+    // instructions as PEXT/PDEP) handles this properly, but isn't portable.
+    uint64_t popcnt = _popcnt64(masks->mask);
+#ifdef HAS_BZHI
+    a = _bzhi_u64(a, popcnt);
+#else
+    // If we don't have BZHI, use a portable workaround. Since (mask == -1)
+    // is equivalent to popcnt(mask) >> 6, use that to mask out the 1 << 64
+    // case.
+    uint64_t pop_mask = (1ULL << popcnt) & ~(popcnt >> 6);
+    a &= pop_mask - 1;
+#endif
+
+    // For each bit in the PPP, shift left only those bits that are set in
+    // that bit's mask. We do this in the opposite order compared to PEXT
+    for (int i = N_BITS - 1; i >= 0; i--) {
+        uint64_t shift = 1 << i;
+        uint64_t bit = masks->ppp_bit[i] >> shift;
+        // Micro-optimization: the bits that get shifted and those that don't
+        // will always be disjoint, so we can add them instead of ORing them.
+        // The shifts of 1 and 2 can thus merge with the adds to become LEAs.
+        a = (a & ~bit) + ((a & bit) << shift);
+    }
+    return a;
+}
+
+uint64_t zp7_pdep_64(uint64_t a, uint64_t mask) {
+    zp7_masks_64_t masks = zp7_ppp_64(mask);
+    return zp7_pdep_pre_64(a, &masks);
+}
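
For reference, a minimal caller of the API added by this patch might look like the
sketch below. The file name `example.c` is made up, and the `HAS_CLMUL` define
assumes a CPU with carry-less multiply (build with the matching flags, e.g.
`-march=native` or `-mpclmul -mpopcnt`); drop the define to use the pure shift/XOR
fallback path instead.

```c
// example.c -- minimal usage sketch for the zp7 polyfill (illustrative only)
#include <stdint.h>
#include <stdio.h>

#define HAS_CLMUL       // assumes CLMUL support; remove for the generic path
#include "zp7.c"        // same single-file usage as test.c above

int main(void) {
    uint64_t mask = 0x00FF00FF00FF00FFULL;
    uint64_t val  = 0x123456789ABCDEF0ULL;

    // Drop-in replacements for _pext_u64/_pdep_u64
    printf("pext: %016llx\n", (unsigned long long)zp7_pext_64(val, mask));
    printf("pdep: %016llx\n", (unsigned long long)zp7_pdep_64(val, mask));

    // Precompute the per-mask data once when the same mask is reused
    zp7_masks_64_t pre = zp7_ppp_64(mask);
    printf("pext_pre: %016llx\n", (unsigned long long)zp7_pext_pre_64(val, &pre));
    printf("pdep_pre: %016llx\n", (unsigned long long)zp7_pdep_pre_64(val, &pre));
    return 0;
}
```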