Skip to content

Commit

Permalink
dpif-netdev/dpcls-avx512: Enable 16 block processing.
Browse files Browse the repository at this point in the history
This commit implements larger subtable searches in avx512. A limitation
of the previous implementation was that up to 8 blocks of miniflow
data could be matched on (so a subtable with 8 blocks was handled
in avx, but 9 blocks or more would fall back to scalar/generic).
This limitation is removed in this patch, where up to 16 blocks
of subtable can be matched on.

From an implementation perspective, the key to enabling 16 blocks
over 8 blocks was to do bitmask calculation up front, and then use
the pre-calculated bitmasks for 2x passes of the "blocks gather"
routine. The bitmasks need to be shifted for k-mask usage in the
upper (8-15) block range, but it is relatively trivial. This also
helps in case expanding to 24 blocks is desired in future.

The implementation of the 2nd iteration to handle > 8 blocks is
behind a conditional branch which checks the total number of bits.
This helps the specialized versions of the function that have a
miniflow fingerprint of less-than-or-equal 8 blocks, as the code
can be statically stripped out of those functions. Specialized
functions that do require more than 8 blocks will have the branch
removed and unconditionally execute the 2nd blocks gather routine.

Lastly, the _any() flavour will have the conditional branch, and
the branch predictor may mispredict a bit, but per burst will
likely get most packets correct (particularly towards the middle
and end of a burst).

The code has been run with unit tests under autovalidation and
passes all cases, and unit test coverage has been checked to
ensure the 16 block code paths are executing.

Signed-off-by: Harry van Haaren <[email protected]>
Acked-by: Flavio Leitner <[email protected]>
Signed-off-by: Ian Stokes <[email protected]>
  • Loading branch information
harry-van-haaren authored and istokes committed Jul 9, 2021
1 parent d76a719 commit 47a2a8f
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 57 deletions.
1 change: 1 addition & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Post-v2.15.0
* Add commands to get and set the dpif implementations.
* Add a partial HWOL PMD statistic counting hits similar to existing
EMC/SMC/DPCLS stats.
* Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
- ovs-ctl:
* New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
Expand Down
218 changes: 161 additions & 57 deletions lib/dpif-netdev-lookup-avx512-gather.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,21 @@
* AVX512 code at a time.
*/
#define NUM_U64_IN_ZMM_REG (8)
#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)

/* This implementation of AVX512 gather allows up to 16 blocks of MF data to be
* present in the blocks_cache, hence the multiply by 2 in the blocks count.
*/
#define MF_BLOCKS_PER_PACKET (NUM_U64_IN_ZMM_REG * 2)

/* Blocks cache size is the maximum number of miniflow blocks that this
* implementation of lookup can handle.
*/
#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * MF_BLOCKS_PER_PACKET)

/* The gather instruction can handle a scale for the size of the items to
* gather. For uint64_t data, this scale is 8.
*/
#define GATHER_SCALE_8 (8)


VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
Expand Down Expand Up @@ -69,22 +83,98 @@ netdev_rule_matches_key(const struct dpcls_rule *rule,
{
const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
const uint32_t lane_mask = (1 << mf_bits_total) - 1;
const uint32_t lane_mask = (1ULL << mf_bits_total) - 1;

/* Always load a full cache line from blocks_cache. Other loads must be
* trimmed to the amount of data required for mf_bits_total blocks.
*/
__m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
__m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
__m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
uint32_t res_mask;

__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
/* To avoid a loop, we have two iterations of a block of code here.
* Note the scope brackets { } are used to avoid accidental variable usage
* in the second iteration.
*/
{
__m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
__m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
__m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
}

if (mf_bits_total > 8) {
uint32_t lane_mask_gt8 = lane_mask >> 8;
__m512i v_blocks = _mm512_loadu_si512(&block_cache[8]);
__m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask_gt8, &maskp[8]);
__m512i v_key = _mm512_maskz_loadu_epi64(lane_mask_gt8, &keyp[8]);
__m512i v_data = _mm512_and_si512(v_blocks, v_mask);
uint32_t c = _mm512_mask_cmpeq_epi64_mask(lane_mask_gt8, v_data,
v_key);
res_mask |= (c << 8);
}

/* returns 1 assuming result of SIMD compare is all blocks. */
/* Returns 1 assuming result of SIMD compare is all blocks matching. */
return res_mask == lane_mask;
}

/* Takes u0 and u1 inputs, and gathers the next 8 blocks to be stored
* contiguously into the blocks cache. Note that the pointers and bitmasks
* passed into this function must be incremented for handling next 8 blocks.
*
* Register contents on entry:
* v_u0: register with all u64 lanes filled with u0 bits.
* v_u1: register with all u64 lanes filled with u1 bits.
* pkt_blocks: pointer to packet blocks.
* tbl_blocks: pointer to table blocks.
* tbl_mf_masks: pointer to miniflow bitmasks for this subtable.
* u1_bcast_msk: bitmask of lanes where u1 bits are used.
* pkt_mf_u0_pop: population count of bits in u0 of the packet.
* zero_mask: bitmask of lanes to zero as packet doesn't have mf bits set.
* u64_lanes_mask: bitmask of lanes to process.
*/
static inline ALWAYS_INLINE __m512i
avx512_blocks_gather(__m512i v_u0,
__m512i v_u1,
const uint64_t *pkt_blocks,
const void *tbl_blocks,
const void *tbl_mf_masks,
__mmask64 u1_bcast_msk,
const uint64_t pkt_mf_u0_pop,
__mmask64 zero_mask,
__mmask64 u64_lanes_mask)
{
/* Suggest to compiler to load tbl blocks ahead of gather(). */
__m512i v_tbl_blocks = _mm512_maskz_loadu_epi64(u64_lanes_mask,
tbl_blocks);

/* Blend u0 and u1 bits together for these 8 blocks. */
__m512i v_pkt_bits = _mm512_mask_blend_epi64(u1_bcast_msk, v_u0, v_u1);

/* Load pre-created tbl miniflow bitmasks, bitwise AND with them. */
__m512i v_tbl_masks = _mm512_maskz_loadu_epi64(u64_lanes_mask,
tbl_mf_masks);
__m512i v_masks = _mm512_and_si512(v_pkt_bits, v_tbl_masks);

/* Manual AVX512 popcount for u64 lanes. */
__m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);

/* Add popcounts and offset for u1 bits. */
__m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_msk,
pkt_mf_u0_pop);
__m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);

/* Gather u64 blocks from packet miniflow. */
__m512i v_zeros = _mm512_setzero_si512();
__m512i v_blocks = _mm512_mask_i64gather_epi64(v_zeros, u64_lanes_mask,
v_indexes, pkt_blocks,
GATHER_SCALE_8);

/* Mask pkt blocks with subtable blocks, k-mask to zero lanes. */
__m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask, v_blocks,
v_tbl_blocks);
return v_masked_blocks;
}

static inline uint32_t ALWAYS_INLINE
avx512_lookup_impl(struct dpcls_subtable *subtable,
uint32_t keys_map,
Expand All @@ -94,76 +184,86 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
const uint32_t bit_count_u1)
{
OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];

const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
int i;
uint32_t hashes[NETDEV_MAX_BURST];

const uint32_t n_pkts = __builtin_popcountll(keys_map);
ovs_assert(NETDEV_MAX_BURST >= n_pkts);

const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
const uint64_t bit_count_total_mask = (1ULL << bit_count_total) - 1;

const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];

/* Load subtable blocks for masking later. */
const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);

/* Load pre-created subtable masks for each block in subtable. */
const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
subtable->mf_masks);
const uint64_t *tbl_mf_masks = subtable->mf_masks;

int i;
ULLONG_FOR_EACH_1 (i, keys_map) {
/* Create mask register with packet-specific u0 offset.
* Note that as 16 blocks can be handled in total, the width of the
* mask register must be >=16.
*/
const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);

/* Pre-create register with *PER PACKET* u0 offset. */
const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
pkt_mf_u0_pop);
const __mmask64 u1_bcast_mask = (UINT64_MAX << bit_count_u0);

/* Broadcast u0, u1 bitmasks to 8x u64 lanes. */
__m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
__m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
keys[i]->mf.map.bits[1]);

/* Bitmask by pre-created masks. */
__m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);

/* Manual AVX512 popcount for u64 lanes. */
__m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);

/* Offset popcounts for u1 with pre-created offset register. */
__m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);

/* Gather u64 blocks from packet miniflow. */
const __m512i v_zeros = _mm512_setzero_si512();
const void *pkt_data = miniflow_get_values(&keys[i]->mf);
__m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
bit_count_total_mask, v_indexes,
pkt_data, 8);
__m512i v_u0 = _mm512_set1_epi64(keys[i]->mf.map.bits[0]);
__m512i v_u1 = _mm512_set1_epi64(keys[i]->mf.map.bits[1]);

/* Zero out bits that pkt doesn't have:
* - 2x pext() to extract bits from packet miniflow as needed by TBL
* - Shift u1 over by bit_count of u0, OR to create zero bitmask
*/
uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;

/* Mask blocks using AND with subtable blocks, use k-mask to zero
* where lanes as required for this packet.
*/
__m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
v_all_blocks, v_tbl_blocks);
uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
const uint64_t zero_mask_wip = (u1_to_zero << bit_count_u0) |
u0_to_zero;
const uint64_t zero_mask = zero_mask_wip & bit_count_total_mask;

/* Get ptr to packet data blocks. */
const uint64_t *pkt_blocks = miniflow_get_values(&keys[i]->mf);

/* Store first 8 blocks cache, full cache line aligned. */
__m512i v_blocks = avx512_blocks_gather(v_u0, v_u1,
&pkt_blocks[0],
&tbl_blocks[0],
&tbl_mf_masks[0],
u1_bcast_mask,
pkt_mf_u0_pop,
zero_mask,
bit_count_total_mask);
_mm512_storeu_si512(&block_cache[i * MF_BLOCKS_PER_PACKET], v_blocks);

if (bit_count_total > 8) {
/* Shift masks over by 8.
* Pkt blocks pointer remains 0, it is incremented by popcount.
* Move tbl and mf masks pointers forward.
* Increase offsets by 8.
* Re-run same gather code.
*/
uint64_t zero_mask_gt8 = (zero_mask >> 8);
uint64_t u1_bcast_mask_gt8 = (u1_bcast_mask >> 8);
uint64_t bit_count_gt8_mask = bit_count_total_mask >> 8;

__m512i v_blocks_gt8 = avx512_blocks_gather(v_u0, v_u1,
&pkt_blocks[0],
&tbl_blocks[8],
&tbl_mf_masks[8],
u1_bcast_mask_gt8,
pkt_mf_u0_pop,
zero_mask_gt8,
bit_count_gt8_mask);
_mm512_storeu_si512(&block_cache[(i * MF_BLOCKS_PER_PACKET) + 8],
v_blocks_gt8);
}

/* Store to blocks cache, full cache line aligned. */
_mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
}

/* Hash the now linearized blocks of packet metadata. */
ULLONG_FOR_EACH_1 (i, keys_map) {
uint64_t *block_ptr = &block_cache[i * 8];
uint64_t *block_ptr = &block_cache[i * MF_BLOCKS_PER_PACKET];
uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
hashes[i] = hash_finish(hash, bit_count_total * 8);
}
Expand All @@ -183,7 +283,7 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
struct dpcls_rule *rule;

CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
const uint32_t cidx = i * 8;
const uint32_t cidx = i * MF_BLOCKS_PER_PACKET;
uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
&block_cache[cidx]);
if (OVS_LIKELY(match)) {
Expand Down Expand Up @@ -220,7 +320,7 @@ DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)

/* Check if a specialized function is valid for the required subtable. */
#define CHECK_LOOKUP_FUNCTION(U0, U1) \
ovs_assert((U0 + U1) <= NUM_U64_IN_ZMM_REG); \
ovs_assert((U0 + U1) <= (NUM_U64_IN_ZMM_REG * 2)); \
if (!f && u0_bits == U0 && u1_bits == U1) { \
f = dpcls_avx512_gather_mf_##U0##_##U1; \
}
Expand Down Expand Up @@ -250,7 +350,11 @@ dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
CHECK_LOOKUP_FUNCTION(4, 1);
CHECK_LOOKUP_FUNCTION(4, 0);

if (!f && (u0_bits + u1_bits) < NUM_U64_IN_ZMM_REG) {
/* Check if the _any looping version of the code can perform this miniflow
* lookup. Performance gain may be less pronounced due to non-specialized
* hashing, however there is usually a good performance win overall.
*/
if (!f && (u0_bits + u1_bits) < (NUM_U64_IN_ZMM_REG * 2)) {
f = dpcls_avx512_gather_mf_any;
VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)\n",
u0_bits, u1_bits);
Expand Down

0 comments on commit 47a2a8f

Please sign in to comment.