Are the following 16-byte atomic operations implemented correctly? Are there any better alternatives?
/*
 * 128-bit value operated on by `lock cmpxchg16b`.
 *
 * Each half is exactly 64 bits wide: `low` travels in RAX/RBX and `high` in
 * RDX/RCX. The type is 16-byte aligned because CMPXCHG16B faults on a memory
 * operand that is not aligned to 16 bytes.
 */
typedef struct {
    uint64_t low;   /* bits 0..63 */
    uint64_t high;  /* bits 64..127 */
} __attribute__((aligned(16))) uint128_atomic;
/*
 * Atomically read all 16 bytes of *atomic (no ordering guarantees requested).
 *
 * Implemented as `lock cmpxchg16b` with both the comparand (RDX:RAX) and the
 * replacement (RCX:RBX) set to zero: if the object is zero it is rewritten
 * with zero, otherwise the instruction copies the current contents into
 * RDX:RAX. Either way RDX:RAX ends up holding the object's value and the
 * stored value does not change.
 *
 * NOTE: the instruction is a locked read-modify-write, so the object must be
 * in writable memory even though the parameter is const-qualified.
 *
 * Returns the 128-bit value that was observed.
 */
uint128_atomic load_relaxed(uint128_atomic const *atomic)
{
    uint64_t lo = 0;
    uint64_t hi = 0;

    /*
     * The zeros are passed in through operands instead of being created with
     * `xor` inside the template: RAX/RDX are read-write operands, so the
     * compiler cannot also pick them to hold the address of *atomic.
     * The memory operand is "+m" because the instruction writes to it.
     */
    __asm__ volatile("lock cmpxchg16b %2"
                     : "+a"(lo), "+d"(hi), "+m"(*(uint128_atomic *)atomic)
                     : "b"((uint64_t)0), "c"((uint64_t)0)
                     : "cc");

    uint128_atomic ret;
    ret.low = lo;
    ret.high = hi;
    return ret;
}
/*
 * 16-byte compare-and-exchange built on `lock cmpxchg16b`.
 *
 * atomic   - object to update.
 * expected - in: value the object is expected to hold;
 *            out (on failure only): the value the object actually held.
 * desired  - value to store when the object equals *expected.
 *
 * Returns true if the exchange happened, false otherwise.
 */
bool cmpexch_weak_relaxed(
    uint128_atomic *atomic,
    uint128_atomic *expected,
    uint128_atomic desired)
{
    bool matched;
    uint64_t cur_low = expected->low;
    uint64_t cur_high = expected->high;

    /*
     * RAX/RDX are "+" operands: on a mismatch the instruction overwrites them
     * with the current contents of *atomic, so they must not be declared as
     * plain inputs.
     */
    __asm__ volatile("lock cmpxchg16b %1\n\t"
                     "setz %0"
                     : "=q"(matched), "+m"(*atomic),
                       "+a"(cur_low), "+d"(cur_high)
                     : "b"(desired.low), "c"(desired.high)
                     : "cc");

    /* Report the observed value back so the caller can retry with it. */
    if (!matched) {
        expected->low = cur_low;
        expected->high = cur_high;
    }
    return matched;
}
/*
 * Atomically replace all 16 bytes of *atomic with val.
 *
 * `lock cmpxchg16b` only stores when RDX:RAX matches the current contents,
 * so the exchange is retried until it succeeds. A single attempt would
 * silently drop the store whenever the initial snapshot was stale.
 */
void store_relaxed(uint128_atomic *atomic, uint128_atomic val)
{
    /*
     * Non-atomic snapshot used only as a first guess; a wrong or torn guess
     * just costs one extra iteration.
     */
    uint64_t lo = atomic->low;
    uint64_t hi = atomic->high;
    unsigned char stored;

    do {
        /*
         * On a mismatch the instruction reloads RDX:RAX with the current
         * contents of *atomic, which becomes the comparand of the next try.
         */
        __asm__ volatile("lock cmpxchg16b %1\n\t"
                         "setz %0"
                         : "=q"(stored), "+m"(*atomic), "+a"(lo), "+d"(hi)
                         : "b"(val.low), "c"(val.high)
                         : "cc");
    } while (!stored);
}
For a full working example, check out:
https://godbolt.org/g/CemfSg
Updated implementation can be found here: https://godbolt.org/g/vGNQG5
I came up with the following implementation, after applying all the suggestions from @PeterCordes, @David Wohlferd and @prl. Thanks a lot!
/*
 * 128-bit value for use with `lock cmpxchg16b`: two 64-bit halves, with the
 * whole object aligned to 16 bytes.
 */
typedef struct _uint128_atomic {
    volatile uint64_t low;   /* lower 64 bits */
    volatile uint64_t high;  /* upper 64 bits */
} __attribute__((aligned(16))) uint128_atomic;
/*
 * 16-byte compare-and-exchange via `lock cmpxchg16b`.
 *
 * Compares *atomic with *expected; when they are equal, stores desired.
 * When they differ, *expected is overwritten with the value left in RDX:RAX
 * by the instruction (the contents of *atomic at that moment).
 *
 * Returns true when the exchange took place, false otherwise.
 */
bool
cmpexch_weak_relaxed(
    uint128_atomic *atomic,
    uint128_atomic *expected,
    uint128_atomic desired)
{
    uint128_atomic observed = *expected;
    bool swapped;

    /* ZF is read straight out of the flags register as the result. */
    asm volatile("lock cmpxchg16b %1"
                 : "=@ccz"(swapped), "+m"(*atomic),
                   "+a"(observed.low), "+d"(observed.high)
                 : "b"(desired.low), "c"(desired.high)
                 : "cc");

    if (swapped)
        return true;

    /* Hand the value actually seen back to the caller. */
    *expected = observed;
    return false;
}
/*
 * Atomically read all 16 bytes of *atomic (no ordering guarantees requested).
 *
 * Runs `lock cmpxchg16b` with comparand and replacement both zero, so the
 * stored value never changes and RDX:RAX ends up holding the current
 * contents of *atomic.
 *
 * NOTE: the instruction is a locked read-modify-write, so the object must be
 * in writable memory even though the parameter is const-qualified.
 *
 * Returns the 128-bit value that was observed.
 */
uint128_atomic
load_relaxed(uint128_atomic const *atomic)
{
    uint64_t lo = 0;
    uint64_t hi = 0;

    /*
     * RAX and RDX are bound separately rather than through "A" on the struct,
     * which does not depend on how the compiler maps a 16-byte struct onto a
     * register pair. The zeros for RBX/RCX are 64-bit constants so the whole
     * replacement value is defined. The memory operand is "+m" because the
     * instruction writes to it.
     */
    __asm__ volatile("lock cmpxchg16b %2"
                     : "+a"(lo), "+d"(hi), "+m"(*(uint128_atomic *)atomic)
                     : "b"((uint64_t)0), "c"((uint64_t)0)
                     : "cc");

    uint128_atomic ret = { lo, hi };
    return ret;
}
/*
 * Atomically replace *atomic with val.
 *
 * Starts from a plain snapshot of *atomic as the expected value and keeps
 * calling cmpexch_weak_relaxed() until it reports success.
 */
void
store_relaxed(uint128_atomic *atomic, uint128_atomic val)
{
    uint128_atomic guess = *atomic;

    for (;;) {
        if (cmpexch_weak_relaxed(atomic, &guess, val))
            return;
    }
}
Please keep in mind that this implementation is GCC-specific and will not work on clang. Clang's implementation of GCC-style inline assembly is suboptimal at best, and garbage at worst.
The GCC implementation can also be found on Godbolt's Compiler Explorer here.
A suboptimal, but working, clang implementation can be found here.
Why don't you just use the C11 atomic intrinsics?
#include <stdatomic.h>
/*
 * Relaxed atomic load of a 128-bit object using the C11 atomics API.
 *
 * Declared `static inline`: a plain `inline` definition provides no external
 * definition in C99/C11, so any call the compiler does not inline would fail
 * to link.
 *
 * obj - atomic object to read.
 * Returns the value read from *obj.
 */
static inline __uint128_t load_relaxed(_Atomic __uint128_t *obj)
{
    return atomic_load_explicit(obj, memory_order_relaxed);
}
/*
 * Relaxed weak compare-and-exchange on a 128-bit object using the C11
 * atomics API.
 *
 * Declared `static inline`: a plain `inline` definition provides no external
 * definition in C99/C11, so any call the compiler does not inline would fail
 * to link.
 *
 * obj      - atomic object to update.
 * expected - in: value *obj is expected to hold; on failure it receives the
 *            value actually found in *obj.
 * desired  - value to store when *obj equals *expected.
 *
 * Returns nonzero if the exchange happened. Being the weak form, it may fail
 * spuriously, so callers normally retry in a loop.
 */
static inline _Bool cmpexch_weak_relaxed(_Atomic __uint128_t *obj,
                                         __uint128_t *expected,
                                         __uint128_t desired)
{
    return atomic_compare_exchange_weak_explicit(obj, expected, desired,
                                                 memory_order_relaxed,
                                                 memory_order_relaxed);
}
This compiles to more-or-less the assembly you wrote, using clang 4.0.1 and -march=native. But, unlike what you wrote, the compiler actually understands what's going on, so code generation around these functions will be correct. There is, as far as I know, no way to annotate a GNU-style assembly insert to tell the compiler that it has the semantics of an atomic operation.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With