Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Atomic 16-Byte operations on x86_64

Are the following 16 byte atomic operations correctly implemented? Are there any better alternatives?

/*
 * 128-bit value operated on with `lock cmpxchg16b`.
 *
 * `low` sits at offset 0 and pairs with RAX/RBX; `high` sits at offset 8
 * and pairs with RDX/RCX. The instruction requires its memory operand to
 * be 16-byte aligned (it faults otherwise), so the alignment is part of
 * the type rather than left to each declaration site.
 */
typedef struct {
    uintptr_t low;
    uintptr_t high;
} __attribute__((aligned(16))) uint128_atomic;


/*
 * Atomically read the 16-byte value at `atomic` (no ordering beyond the
 * atomicity of the access itself is requested from the compiler).
 *
 * Implemented as `lock cmpxchg16b` with expected == desired == 0:
 *   - if the memory holds 0, 0 is written back (value unchanged);
 *   - otherwise the current contents are loaded into RDX:RAX.
 * Either way RDX:RAX ends up holding the value that was in memory.
 *
 * The registers are set up through operand constraints instead of manual
 * `xor` instructions, so the compiler can never place the address of
 * `*atomic` in a register that the asm overwrites before using it.
 *
 * NOTE: cmpxchg16b always performs a write to its destination, so the
 * object must live in writable memory even though the pointer is
 * const-qualified; it is therefore declared as a read-write ("+m") operand.
 *
 * Returns the value observed in memory.
 */
uint128_atomic load_relaxed(uint128_atomic const *atomic)
{
    uintptr_t low = 0;
    uintptr_t high = 0;
    asm volatile("lock; cmpxchg16b %2"
                 : "+a"(low), "+d"(high), "+m"(*(uint128_atomic *)atomic)
                 : "b"((uintptr_t)0), "c"((uintptr_t)0)
                 : "cc");
    uint128_atomic ret = { low, high };
    return ret;
}

/*
 * 16-byte compare-and-exchange.
 *
 * If `*atomic` equals `*expected`, `desired` is stored into `*atomic` and
 * true is returned. Otherwise `*atomic` is left unchanged, the value that
 * was actually observed is written to `*expected`, and false is returned,
 * so callers can retry in a loop without reloading.
 *
 * RAX/RDX are read-write operands ("+a"/"+d") because cmpxchg16b
 * overwrites them with the current memory contents when the comparison
 * fails. `setz` captures ZF, which the instruction sets on success.
 */
bool cmpexch_weak_relaxed(
    uint128_atomic *atomic,
    uint128_atomic *expected,
    uint128_atomic desired)
{
    bool matched;
    uintptr_t seen_low = expected->low;
    uintptr_t seen_high = expected->high;
    asm volatile("lock; cmpxchg16b %1\n\t"
                 "setz %0"
                 : "=q"(matched), "+m"(*atomic), "+a"(seen_low), "+d"(seen_high)
                 : "b"(desired.low), "c"(desired.high)
                 : "cc");
    if (!matched) {
        /* Report what was really in memory so the caller can retry. */
        expected->low = seen_low;
        expected->high = seen_high;
    }
    return matched;
}

/*
 * Atomically replace the 16-byte value at `atomic` with `val`.
 *
 * x86-64 has no plain 16-byte store that this code can rely on, so the
 * store is a compare-and-exchange retried until it succeeds. A single
 * attempt is not enough: if the value changes between the initial read
 * and the cmpxchg16b, the exchange fails and the store would be lost.
 *
 * The initial read of `*atomic` is an ordinary (possibly torn) read; it
 * only provides a first guess. On failure cmpxchg16b reloads RDX:RAX with
 * the real current contents, which is why those operands are read-write
 * and why the next iteration needs no explicit reload.
 */
void store_relaxed(uint128_atomic *atomic, uint128_atomic val)
{
    uintptr_t cur_low = atomic->low;
    uintptr_t cur_high = atomic->high;
    unsigned char stored;
    do {
        asm volatile("lock; cmpxchg16b %1\n\t"
                     "setz %0"
                     : "=q"(stored), "+m"(*atomic), "+a"(cur_low), "+d"(cur_high)
                     : "b"(val.low), "c"(val.high)
                     : "cc");
    } while (!stored);
}

For a full working example, check out:

https://godbolt.org/g/CemfSg

Updated implementation can be found here: https://godbolt.org/g/vGNQG5

like image 924
Leandros Avatar asked Jan 29 '23 12:01

Leandros


2 Answers

I came up with the following implementation, after applying all the suggestions from @PeterCordes, @David Wohlferd and @prl. Thanks a lot!

/*
 * 16-byte value for use with `lock cmpxchg16b`.
 * `low` is the first 8 bytes, `high` the second 8 bytes; the whole object
 * is aligned to 16 bytes.
 */
typedef struct _uint128_atomic {
    volatile uint64_t low;   /* bytes 0..7  */
    volatile uint64_t high;  /* bytes 8..15 */
} __attribute__((aligned(16))) uint128_atomic;


/*
 * 16-byte compare-and-exchange built on `lock cmpxchg16b`.
 *
 * Compares `*atomic` with `*expected`. On a match, `desired` is stored in
 * `*atomic` and true is returned. On a mismatch, `*expected` is
 * overwritten with the value found in `*atomic` and false is returned.
 *
 * The result is taken directly from the zero flag via the "=@ccz" flag
 * output operand; RAX/RDX are in/out because the instruction reloads them
 * when the comparison fails.
 */
bool
cmpexch_weak_relaxed(
    uint128_atomic *atomic,
    uint128_atomic *expected,
    uint128_atomic desired)
{
    uint64_t seen_low = expected->low;
    uint64_t seen_high = expected->high;
    bool swapped;

    asm volatile("lock cmpxchg16b %1"
                 : "=@ccz"(swapped), "+m"(*atomic), "+a"(seen_low), "+d"(seen_high)
                 : "b"(desired.low), "c"(desired.high)
                 : "cc");

    if (swapped)
        return true;

    /* Mismatch: hand the observed value back to the caller. */
    expected->low = seen_low;
    expected->high = seen_high;
    return false;
}


/*
 * Atomically read the 16-byte value at `atomic`.
 *
 * Uses `lock cmpxchg16b` with expected == desired == 0: if memory holds 0
 * it is rewritten with 0, otherwise the current contents are loaded into
 * RDX:RAX. In both cases RDX:RAX holds the value that was in memory.
 *
 * The two halves are passed as separate "+a"/"+d" operands instead of one
 * struct operand with the "A" constraint, so the register assignment is
 * explicit and does not depend on how the compiler treats a 16-byte
 * struct in a register pair. The zero inputs are given a 64-bit type so
 * the full RBX/RCX registers are defined.
 *
 * NOTE: cmpxchg16b always writes its destination, so the object must be
 * in writable memory despite the const-qualified pointer; the operand is
 * declared read-write ("+m") accordingly.
 *
 * Returns the value observed in memory.
 */
uint128_atomic
load_relaxed(uint128_atomic const *atomic)
{
    uint64_t low = 0;
    uint64_t high = 0;

    asm volatile("lock cmpxchg16b %2"
                 : "+a"(low), "+d"(high), "+m"(*(uint128_atomic *)atomic)
                 : "b"((uint64_t)0), "c"((uint64_t)0)
                 : "cc");

    uint128_atomic ret = {low, high};
    return ret;
}


/*
 * Atomically replace the value at `atomic` with `val`.
 *
 * Starts from a plain read of `*atomic` as the first guess and retries
 * the compare-and-exchange until it succeeds; each failed attempt
 * refreshes the guess with the value that was actually found.
 */
void
store_relaxed(uint128_atomic *atomic, uint128_atomic val)
{
    uint128_atomic guess = *atomic;

    for (;;) {
        if (cmpexch_weak_relaxed(atomic, &guess, val))
            return;
    }
}

Please keep in mind that the implementation is GCC-specific and will not work on clang. Clang's implementation of GCC's inline assembly is suboptimal at best, and garbage at worst. The GCC implementation can also be found on Godbolt's Compiler Explorer (link not preserved here). A suboptimal, but working, clang implementation is also available (link not preserved here).

like image 180
Leandros Avatar answered Feb 04 '23 03:02

Leandros


Why don't you just use the C11 atomic intrinsics?

#include <stdatomic.h>

/*
 * Relaxed atomic load of a 128-bit object.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so any call the compiler chooses not to inline
 * (e.g. when optimization is off) would fail to link.
 *
 * Returns the value read from `*obj`.
 */
static inline __uint128_t load_relaxed(_Atomic __uint128_t *obj)
{
  return atomic_load_explicit(obj, memory_order_relaxed);
}

/*
 * Relaxed weak compare-and-exchange on a 128-bit atomic object.
 *
 * If `*obj` equals `*expected`, `desired` is stored and a nonzero value
 * is returned; otherwise `*expected` is updated with the current value of
 * `*obj` and 0 is returned. Being the weak form, it may fail spuriously,
 * so it is meant to be used in a retry loop. Both the success and the
 * failure ordering are memory_order_relaxed.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so any call the compiler chooses not to inline
 * would fail to link.
 */
static inline _Bool cmpexch_weak_relaxed(_Atomic __uint128_t *obj,
                                         __uint128_t *expected,
                                         __uint128_t desired)
{
  return atomic_compare_exchange_weak_explicit(obj, expected, desired,
    memory_order_relaxed, memory_order_relaxed);
}

This compiles to more-or-less the assembly you wrote, using clang 4.0.1 and -march=native. But, unlike what you wrote, the compiler actually understands what's going on, so code generation around these functions will be correct. There is, as far as I know, no way to annotate a GNU-style assembly insert to tell the compiler that it has the semantics of an atomic operation.

like image 21
zwol Avatar answered Feb 04 '23 04:02

zwol