I am attempting to use a 64-bit integer as a bitmap, and acquire/release ownership of individual bits, atomically.
To this end, I have written the following lock-less code:
#include <cstdint>
#include <atomic>

static constexpr std::uint64_t NO_INDEX = ~std::uint64_t(0);

class AtomicBitMap {
public:
    static constexpr std::uint64_t occupied() noexcept {
        return ~std::uint64_t(0);
    }

    std::uint64_t acquire() noexcept {
        while (true) {
            auto map = mData.load(std::memory_order_relaxed);
            if (map == occupied()) {
                return NO_INDEX;
            }

            std::uint64_t index = __builtin_ctzl(~map);
            auto previous =
                mData.fetch_or(bit(index), std::memory_order_relaxed);
            if ((previous & bit(index)) == 0) {
                return index;
            }
        }
    }

private:
    static constexpr std::uint64_t bit(std::uint64_t index) noexcept {
        return std::uint64_t(1) << index;
    }

    std::atomic_uint64_t mData{ 0 };
};

int main() {
    AtomicBitMap map;
    return map.acquire();
}
Which, on godbolt, yields the following assembly in isolation:
main:
    mov QWORD PTR [rsp-8], 0
    jmp .L3
.L10:
    not rax
    rep bsf rax, rax
    mov edx, eax
    mov eax, eax
    lock bts QWORD PTR [rsp-8], rax
    jnc .L9
.L3:
    mov rax, QWORD PTR [rsp-8]
    cmp rax, -1
    jne .L10
    ret
.L9:
    movsx rax, edx
    ret
Which is exactly what I expected¹.
@Jester has heroically managed to reduce my 97-line reproducer to a much simpler 44-line reproducer, which I further reduced to 35 lines:
using u64 = unsigned long long;

struct Bucket {
    u64 mLeaves[16] = {};
};

struct BucketMap {
    u64 acquire() noexcept {
        while (true) {
            u64 map = mData;

            u64 index = (map & 1) ? 1 : 0;
            auto mask = u64(1) << index;

            auto previous =
                __atomic_fetch_or(&mData, mask, __ATOMIC_SEQ_CST);
            if ((previous & mask) == 0) {
                return index;
            }
        }
    }

    __attribute__((noinline)) Bucket acquireBucket() noexcept {
        acquire();
        return Bucket();
    }

    volatile u64 mData = 1;
};

int main() {
    BucketMap map;
    map.acquireBucket();
    return 0;
}
Which generates the following assembly:
BucketMap::acquireBucket():
    mov r8, rdi
    mov rdx, rsi
.L2:
    mov rax, QWORD PTR [rsi]
    xor eax, eax
    lock bts QWORD PTR [rdx], rax
    setc al
    jc .L2
    mov rdi, r8
    mov ecx, 16
    rep stosq
    mov rax, r8
    ret
main:
    sub rsp, 152
    lea rsi, [rsp+8]
    lea rdi, [rsp+16]
    mov QWORD PTR [rsp+8], 1
    call BucketMap::acquireBucket()
    xor eax, eax
    add rsp, 152
    ret
The xor eax,eax means that the assembly here always attempts to obtain index 0... resulting in an infinite loop.
I can only see two explanations for this assembly: either the compiler glitched, or my code contains Undefined Behavior. And I have exhausted all my ideas as to what could trigger UB.
Can anyone explain why gcc would generate this xor eax,eax?
Note: tentatively reported to gcc as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86314.
Compiler version used:
$ gcc --version
gcc (GCC) 7.3.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is
NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE.
Compiler flags:
-Wall -Wextra -Werror -Wduplicated-cond -Wnon-virtual-dtor -Wvla
-rdynamic -Wno-deprecated-declarations -Wno-type-limits
-Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unused-value
-Wno-aligned-new -Wno-implicit-fallthrough -Wno-deprecated
-Wno-noexcept-type -Wno-register -ggdb -fno-strict-aliasing
-std=c++17 -Wl,--no-undefined -Wno-sign-compare
-g -O3 -mpopcnt
¹ Actually, it's better than I expected: the compiler understanding that the fetch_or(bit(index)) followed by previous & bit(index) is the equivalent of using bts and checking the CF flag is pure gold.
This is a peephole optimization bug in gcc, see PR 86314 (the report linked in the question), affecting versions 7.1, 7.2, 7.3 and 8.1. The fix is already in, and will be delivered in versions 7.4 and 8.2 respectively.
The short answer is that the particular code sequence (fetch_or + checking the result) generates a setcc (set conditional, i.e. based on the status of the flags) followed by a movzbl (move and zero-extend). In 7.x an optimization was introduced which transforms a setcc followed by a movzbl into a xor followed by a setcc; however, this optimization was missing some checks, resulting in the xor possibly clobbering a register which was still needed (in this case, eax).
The longer answer is that fetch_or can be implemented either as a cmpxchg for full generality, or, if only setting one bit, as bts (bit test and set). As another optimization introduced in 7.x, gcc now generates a bts here (gcc 6.4 still generates a cmpxchg). bts sets the carry flag (CF) to the previous value of the bit.
That is, auto previous = a.fetch_or(bit); auto n = previous & bit; will generate:

- lock bts QWORD PTR [<address of a>], <bit index> to set the bit, and capture its previous value,
- setc <n>l to set the lower 8 bits of r<n>x to the value of the carry flag (CF),
- movzx e<n>x, <n>l to zero out the upper bits of r<n>x.

And then the peephole optimization will apply, which messes things up.
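For reference, the fully general cmpxchg-style expansion (what gcc 6.4 still emits) corresponds roughly to the following compare-exchange loop; this is only an illustrative C++ sketch of the semantics, not the compiler's actual lowering:

#include <atomic>
#include <cstdint>

// Sketch only: what a cmpxchg-based fetch_or amounts to, expressed in C++.
// The compiler performs this lowering itself on the atomic RMW instruction.
std::uint64_t fetch_or_via_cas(std::atomic<std::uint64_t>& a, std::uint64_t mask) {
    std::uint64_t expected = a.load(std::memory_order_relaxed);
    // On failure, compare_exchange_weak reloads `expected` with the current
    // value, so the loop retries until `expected | mask` has been installed.
    while (!a.compare_exchange_weak(expected, expected | mask,
                                    std::memory_order_seq_cst,
                                    std::memory_order_relaxed)) {
    }
    return expected; // the previous value, exactly like fetch_or
}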
gcc trunk now generates proper assembly:
BucketMap::acquireBucket():
    mov rdx, rdi
    mov rcx, rsi
.L2:
    mov rax, QWORD PTR [rsi]
    and eax, 1
    lock bts QWORD PTR [rcx], rax
    setc al
    movzx eax, al
    jc .L2
    mov rdi, rdx
    mov ecx, 16
    rep stosq
    mov rax, rdx
    ret
main:
    sub rsp, 152
    lea rsi, [rsp+8]
    lea rdi, [rsp+16]
    mov QWORD PTR [rsp+8], 1
    call BucketMap::acquireBucket()
    xor eax, eax
    add rsp, 152
    ret
Unfortunately the optimization no longer applies here, so we are left with setc + mov instead of xor + setc... but at least it's correct!
As a side note, you can find the lowest 0 bit with a straightforward bit manipulation:
template<class T>
T find_lowest_0_bit_mask(T value) {
    T t = value + 1;
    return (t ^ value) & t;
}
It returns a bit mask rather than a bit index. Preconditions: T must be unsigned, and value must contain at least one zero bit.
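For example, a quick check of what it produces (a hypothetical snippet, assuming the template above is in scope):

#include <cassert>
#include <cstdint>

int main() {
    // 0b1011 has its lowest zero bit at position 2, so the mask is 0b0100.
    assert(find_lowest_0_bit_mask(std::uint64_t(0b1011)) == 0b0100);
    // The mask can be turned back into an index with count-trailing-zeros.
    assert(__builtin_ctzll(find_lowest_0_bit_mask(std::uint64_t(0xFF))) == 8);
}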
mData.load must synchronise with mData.fetch_or, so it should be mData.load(std::memory_order_acquire) and mData.fetch_or(..., std::memory_order_release).
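Applied to the acquire() loop from the question, that suggestion would look roughly like this (a sketch; only the memory orders change):

// Sketch: the question's acquire() with the suggested acquire/release ordering.
std::uint64_t acquire() noexcept {
    while (true) {
        auto map = mData.load(std::memory_order_acquire); // synchronises with the release below
        if (map == occupied()) {
            return NO_INDEX;
        }

        std::uint64_t index = __builtin_ctzl(~map);
        auto previous =
            mData.fetch_or(bit(index), std::memory_order_release);
        if ((previous & bit(index)) == 0) {
            return index;
        }
    }
}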
And, IMO, there is something about these bit intrinsics that makes clang generate wrong assembly as well; see the .LBB0_5 loop (in the clang output), which is clearly wrong because it keeps trying to set the same bit rather than recalculating another bit to set. A version that generates correct assembly:
#include <cstdint>
#include <atomic>

static constexpr int NO_INDEX = -1;

template<class T>
T find_lowest_0_bit_mask(T value) {
    T t = value + 1;
    return (t ^ value) & t;
}

class AtomicBitMap {
public:
    static constexpr std::uint64_t occupied() noexcept { return ~std::uint64_t(0); }

    int acquire() noexcept {
        auto map = mData.load(std::memory_order_acquire);
        while (map != occupied()) {
            std::uint64_t mask = find_lowest_0_bit_mask(map);
            if (mData.compare_exchange_weak(map, map | mask, std::memory_order_release))
                return __builtin_ffsl(mask) - 1;
        }
        return NO_INDEX;
    }

    void release(int i) noexcept {
        mData.fetch_and(~bit(i), std::memory_order_release);
    }

private:
    static constexpr std::uint64_t bit(int index) noexcept {
        return std::uint64_t(1) << index;
    }

    std::atomic_uint64_t mData{ 0 };
};
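For illustration, a minimal usage sketch of that class (a hypothetical main, not part of the answer):

#include <cstdio>

int main() {
    AtomicBitMap map;
    int a = map.acquire();  // 0: the lowest free bit
    int b = map.acquire();  // 1
    map.release(a);         // bit 0 becomes free again
    int c = map.acquire();  // 0 again
    std::printf("%d %d %d\n", a, b, c);
}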