Skip to content

Commit

Permalink
lib/ovs-atomic-i586: Faster 64-bit atomics on 32-bit builds with SSE.
Browse files Browse the repository at this point in the history
Aligned 64-bit memory accesses in i586 are atomic.  By using an SSE
register we can make such memory accesses in one instruction without
bus-locking.  Need to compile with -msse (or higher) to enable this
feature.

Signed-off-by: Jarno Rajahalme <[email protected]>
Acked-by: Ben Pfaff <[email protected]>
  • Loading branch information
Jarno Rajahalme committed Oct 2, 2014
1 parent 3b67094 commit 55eebc0
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 44 deletions.
101 changes: 70 additions & 31 deletions lib/ovs-atomic-i586.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,11 @@
* significant when the TYPE is a pointer type. In that case we want the
* pointer to be declared volatile, not the data type that is being pointed
* at!
*/
#define ATOMIC(TYPE) TYPE volatile
*
* Attribute aligned is used to tell the compiler to align 64-bit data
* on an 8-byte boundary.  This allows more efficient atomic access, as
* the CPU guarantees such memory accesses to be atomic. */
#define ATOMIC(TYPE) TYPE volatile __attribute__((aligned(sizeof(TYPE))))

/* Memory ordering. Must be passed in as a constant. */
typedef enum {
Expand Down Expand Up @@ -234,49 +237,85 @@ atomic_signal_fence(memory_order order)
src___; \
})

#if defined(__SSE__)
/* SSE registers are 128-bit wide, and moving the lowest 64 bits of an SSE
 * register to properly aligned memory is atomic.  See ATOMIC(TYPE) above.
 * The "x" constraint forces SRC into an SSE register so the store is a
 * single movq instruction, i.e. atomic without a bus lock. */
#define atomic_store_8__(DST, SRC) \
asm volatile("movq %1,%0 ; # atomic_store_8__" \
: "=m" (*DST) /* 0 */ \
: "x" (SRC)) /* 1, SSE */
#else
/* Locked 64-bit exchange is available on all i586 CPUs; the stored-over
 * previous value is simply discarded.  "cc" is clobbered by the exchange. */
#define atomic_store_8__(DST, SRC) \
atomic_exchange_8__(DST, SRC, "cc")
#endif

/* Atomically stores SRC into '*DST' with the memory ordering implied by
 * ORDER.  For orders weaker than seq_cst, a compiler barrier plus a plain
 * store (<= 4 bytes) or a single-instruction 64-bit store via
 * atomic_store_8__ suffices on i586; seq_cst requires a full locked
 * exchange for the store to also act as a two-way fence.
 *
 * DST and SRC are each evaluated exactly once. */
#define atomic_store_explicit(DST, SRC, ORDER)      \
    ({                                              \
        typeof(DST) dst__ = (DST);                  \
        typeof(*(DST)) src__ = (SRC);               \
                                                    \
        if ((ORDER) != memory_order_seq_cst) {      \
            atomic_compiler_barrier(ORDER);         \
            if (sizeof(*(DST)) == 8) {              \
                atomic_store_8__(dst__, src__);     \
            } else {                                \
                *dst__ = src__;                     \
            }                                       \
        } else {                                    \
            atomic_exchange__(dst__, src__, ORDER); \
        }                                           \
        (void) 0;                                   \
    })
#define atomic_store(DST, SRC) \
    atomic_store_explicit(DST, SRC, memory_order_seq_cst)

/* The 8-byte variant compares '*DST' to a random value in bx:cx and
* returns the actual value in ax:dx. The registers bx and cx are
* only read, so they are not clobbered. */
#define atomic_read_explicit(SRC, DST, ORDER) \
({ \
typeof(DST) dst__ = (DST); \
typeof(SRC) src__ = (SRC); \
\
if (sizeof(*(DST)) <= 4) { \
*dst__ = *src__; \
} else { \
typeof(*(DST)) res__; \
\
asm volatile(" movl %%ebx,%%eax ; " \
" movl %%ecx,%%edx ; " \
"lock; cmpxchg8b %1 ; " \
"# atomic_read_explicit " \
: "=&A" (res__), /* 0 */ \
"+m" (*src__) /* 1 */ \
: : "cc"); \
*dst__ = res__; \
} \
atomic_compiler_barrier(ORDER); \
(void) 0; \
#if defined(__SSE__)
/* SSE registers are 128-bit wide, and moving 64 bits from properly aligned
 * memory to an SSE register is atomic.  See ATOMIC(TYPE) above. */
#define atomic_read_8__(SRC, DST) \
({ \
typeof(*(DST)) res__; \
\
asm ("movq %1,%0 ; # atomic_read_8__" \
: "=x" (res__) /* 0, SSE. */ \
: "m" (*SRC)); /* 1 */ \
*(DST) = res__; \
})
#else
/* Must use locked cmpxchg8b (available on all i586 CPUs) if compiled w/o SSE
 * support.  Compares '*SRC' to a random value in bx:cx and returns the actual
 * value in ax:dx; whether the compare hits or misses, ax:dx ends up holding
 * the old contents of '*SRC', which is the read we want.  The registers bx
 * and cx are only read, so they are not clobbered. */
#define atomic_read_8__(SRC, DST) \
({ \
typeof(*(DST)) res__; \
\
asm (" movl %%ebx,%%eax ; " \
" movl %%ecx,%%edx ; " \
"lock; cmpxchg8b %1 ; " \
"# atomic_read_8__ " \
: "=&A" (res__), /* 0 */ \
"+m" (*SRC) /* 1 */ \
: : "cc"); \
*(DST) = res__; \
})
#endif

/* Atomically reads '*SRC' into '*DST' with the memory ordering implied by
 * ORDER.  Aligned loads of 4 bytes or less are naturally atomic on i586;
 * 64-bit loads go through atomic_read_8__ (SSE movq, or a locked cmpxchg8b
 * when compiled without SSE).
 *
 * SRC and DST are each evaluated exactly once: the 8-byte path passes the
 * pre-evaluated copies, not the raw macro arguments, to avoid double
 * expansion (and thus double evaluation) of SRC and DST. */
#define atomic_read_explicit(SRC, DST, ORDER)   \
    ({                                          \
        typeof(DST) dst__ = (DST);              \
        typeof(SRC) src__ = (SRC);              \
                                                \
        if (sizeof(*(DST)) <= 4) {              \
            *dst__ = *src__;                    \
        } else {                                \
            atomic_read_8__(src__, dst__);      \
        }                                       \
        atomic_compiler_barrier(ORDER);         \
        (void) 0;                               \
    })
#define atomic_read(SRC, DST) \
#define atomic_read(SRC, DST) \
atomic_read_explicit(SRC, DST, memory_order_seq_cst)

#if defined(__PIC__)
Expand Down
28 changes: 15 additions & 13 deletions tests/test-atomic.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,10 @@ test_atomic_flag(void)
static uint32_t a;

/* Shared state between the reader and writer test threads.  The scrape of
 * this diff showed both the old 'atomic_uint32_t count;' member and its
 * 64-bit replacement; only the post-commit 64-bit member belongs here. */
struct atomic_aux {
    ATOMIC(uint64_t) count;   /* 64-bit atomic counter (relaxed add/read). */
    uint32_t b;               /* Plain value published via 'data'/'data64'. */
    ATOMIC(uint32_t *) data;  /* Atomic pointer payload. */
    ATOMIC(uint64_t) data64;  /* 64-bit payload for acquire/release tests. */
};

static ATOMIC(struct atomic_aux *) paux = ATOMIC_VAR_INIT(NULL);
Expand All @@ -184,7 +185,7 @@ static void *
atomic_consumer(void * arg1 OVS_UNUSED)
{
struct atomic_aux *old_aux = NULL;
uint32_t count;
uint64_t count;

do {
struct atomic_aux *aux;
Expand Down Expand Up @@ -246,24 +247,24 @@ static void *
atomic_reader(void *aux_)
{
struct atomic_aux *aux = aux_;
uint32_t count;
uint32_t *data;
uint64_t count;
uint64_t data;

do {
/* Non-synchronized add. */
atomic_add_explicit(&aux->count, 1, &count, memory_order_relaxed);

do {
atomic_read_explicit(&aux->data, &data, memory_order_acquire);
atomic_read_explicit(&aux->data64, &data, memory_order_acquire);
} while (!data);

ovs_assert(*data == a && *data == aux->b && a == aux->b);
ovs_assert(data == a && data == aux->b && a == aux->b);

atomic_read_explicit(&aux->count, &count, memory_order_relaxed);

ovs_assert(count == 2 * a && count == 2 * aux->b && count == 2 * *data);
ovs_assert(count == 2 * a && count == 2 * aux->b && count == 2 * data);

atomic_store_explicit(&aux->data, NULL, memory_order_release);
atomic_store_explicit(&aux->data64, UINT64_C(0), memory_order_release);
} while (count < 2 * ATOMIC_ITEM_COUNT);

return NULL;
Expand All @@ -273,21 +274,21 @@ static void *
atomic_writer(void *aux_)
{
struct atomic_aux *aux = aux_;
uint32_t old_count;
uint32_t *data;
uint64_t old_count;
uint64_t data;
size_t i;

for (i = 0; i < ATOMIC_ITEM_COUNT; i++) {
/* Wait for the reader to be done with the data. */
do {
atomic_read_explicit(&aux->data, &data, memory_order_acquire);
atomic_read_explicit(&aux->data64, &data, memory_order_acquire);
} while (data);

a = i + 1;
atomic_add_explicit(&aux->count, 1, &old_count, memory_order_relaxed);
aux->b++;
atomic_store_explicit(&aux->data,
(i & 1) ? &aux->b : &a, memory_order_release);
atomic_store_explicit(&aux->data64,
(i & 1) ? (uint64_t)aux->b : a, memory_order_release);
}

return NULL;
Expand All @@ -304,6 +305,7 @@ test_acq_rel(void)

aux->count = ATOMIC_VAR_INIT(0);
atomic_init(&aux->data, NULL);
aux->data64 = ATOMIC_VAR_INIT(0);

reader = ovs_thread_create("reader", atomic_reader, aux);
writer = ovs_thread_create("writer", atomic_writer, aux);
Expand Down

0 comments on commit 55eebc0

Please sign in to comment.