I'm writing C++ code to find the first byte in memory that is not 0xFF. To exploit bit-scan-forward (BSF), I had written inline assembly code that I like very much. But for "readability", as well as future proofing (i.e. SIMD vectorization), I thought I would give the g++ optimizer a chance. g++ didn't vectorize, but it did arrive at nearly the same non-SIMD solution I did. Yet for some reason its version runs much slower, 260000x slower (i.e. I have to loop my version 260,000x more to reach the same execution time). I expected some difference, but not THAT much! Can someone point out why that might be? I just want to know, so that I avoid the same mistake in future inline assembly code.
The C++ starting point is the following (in terms of counting accuracy, there is a bug in this code, but I've simplified it for this speed test):
uint64_t count3 (const void *data, uint64_t const &nBytes) {
    uint64_t count = 0;
    uint64_t block;
    do {
        block = *(uint64_t*)(data+count);
        if ( block != (uint64_t)-1 ) {
            /* count += __builtin_ctz(~block);   ignore this for the speed test */
            goto done;
        };
        count += sizeof(block);
    } while ( count < nBytes );
done:
    return (count>nBytes ? nBytes : count);
}
The assembly code g++ came up with is:
_Z6count3PKvRKm:
.LFB33:
.cfi_startproc
mov rdx, QWORD PTR [rsi]
xor eax, eax
jmp .L19
.p2align 4,,10
.p2align 3
.L21:
add rax, 8
cmp rax, rdx
jnb .L18
.L19:
cmp QWORD PTR [rdi+rax], -1
je .L21
.L18:
cmp rax, rdx
cmova rax, rdx
ret
.cfi_endproc
My inline assembly is:
_Z6count2PKvRKm:
.LFB32:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
mov rbx, QWORD PTR [rsi]
# count trailing bytes of 0xFF
xor rax, rax
.ctxff_loop_69:
mov r9, QWORD PTR [rdi+rax]
xor r9, -1
jnz .ctxff_final_69
add rax, 8
cmp rax, rbx
jl .ctxff_loop_69
.ctxff_final_69:
cmp rax,rbx
cmova rax,rbx
pop rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
As far as I can see, it is substantially identical, except for the method by which it compares the data word against 0xFF. I cannot believe this alone would cause such a great difference in computation time.
It's conceivable my test method is causing the error, but all I do is change the function name and iteration count in the simple for-loops shown below (where N is 1<<20, and all bytes of 'a' except the last byte are 0xFF):
Test 1:
for (uint64_t i=0; i < ((uint64_t)1<<15); i++) {
    n = count3(a,N);
}

Test 2:
for (uint64_t i=0; i < ((uint64_t)1<<33); i++) {
    n = count2(a,N);
}
EDIT:
Here are my real inline assembly codes: the SSE version count1(), the x86-64 QWORD version count2(), and then the plain-old-C++ versions count0() and count3(). I fell down this rabbit hole hoping that I could get g++ to take my count0() and arrive, on its own, at my count1() or even count2(). But alas it did nothing, absolutely no optimization :( I should add that my platform doesn't have AVX2, which is why I was hoping to get g++ to automatically vectorize, so that the code would automatically update when I update my platform.
In terms of the explicit register use in the inline assembly, if I didn't specify them explicitly, g++ would reuse the same registers for nBytes and count.
In terms of speedup between XMM and QWORD, I found the real benefit is simply the "loop-unroll" effect, which I replicate in count2().
#include <cstdint>
#include <cstdio>

uint32_t count0(const uint8_t *data, uint64_t const &nBytes) {
    for (uint64_t i=0; i<nBytes; i++)
        if (data[i] != 0xFF) return i;
    return nBytes;
}
uint32_t count1(const void *data, uint64_t const &nBytes) {
    uint64_t count;
    __asm__("# count trailing bytes of 0xFF \n"
            " xor %[count], %[count] \n"
            " vpcmpeqb xmm0, xmm0, xmm0 \n"                            // make array of 0xFF
            ".ctxff_next_block_%=: \n"
            " vpcmpeqb xmm1, xmm0, XMMWORD PTR [%[data]+%[count]] \n"
            " vpmovmskb r9, xmm1 \n"
            " xor r9, 0xFFFF \n"                                       // test if all match (bonus negate r9)
            " jnz .ctxff_tzc_%= \n"                                    // if !=0, STOP & tzcnt negated r9
            " add %[count], 16 \n"                                     // else inc
            " cmp %[count], %[nBytes] \n"
            " jl .ctxff_next_block_%= \n"                              // while count < nBytes, loop
            " jmp .ctxff_done_%= \n"                                   // else done + ALL bytes were 0xFF
            ".ctxff_tzc_%=: \n"
            " tzcnt r9, r9 \n"                                         // count bytes up to non-0xFF
            " add %[count], r9 \n"
            ".ctxff_done_%=: \n"                                       // more than 'nBytes' could be tested,
            " cmp %[count],%[nBytes] \n"                               // find minimum
            " cmova %[count],%[nBytes] "
            : [count] "=a" (count)
            : [nBytes] "b" (nBytes), [data] "d" (data)
            : "r9", "xmm0", "xmm1"
            );
    return count;
};
uint64_t count2 (const void *data, uint64_t const &nBytes) {
    uint64_t count;
    __asm__("# count trailing bytes of 0xFF \n"
            " xor %[count], %[count] \n"
            ".ctxff_loop_%=: \n"
            " mov r9, QWORD PTR [%[data]+%[count]] \n"
            " xor r9, -1 \n"
            " jnz .ctxff_final_%= \n"
            " add %[count], 8 \n"
            " mov r9, QWORD PTR [%[data]+%[count]] \n"   // <-- loop unroll
            " xor r9, -1 \n"
            " jnz .ctxff_final_%= \n"
            " add %[count], 8 \n"
            " cmp %[count], %[nBytes] \n"
            " jl .ctxff_loop_%= \n"
            " jmp .ctxff_done_%= \n"
            ".ctxff_final_%=: \n"
            " bsf r9, r9 \n"                             // do tz count on r9 (either of first QWORD bits or XMM bytes)
            " shr r9, 3 \n"                              // scale BSF count accordingly
            " add %[count], r9 \n"
            ".ctxff_done_%=: \n"                         // more than 'nBytes' bytes could have been tested,
            " cmp %[count],%[nBytes] \n"                 // find minimum of count and nBytes
            " cmova %[count],%[nBytes] "
            : [count] "=a" (count)
            : [nBytes] "b" (nBytes), [data] "D" (data)
            : "r9"
            );
    return count;
}
inline static uint32_t tzcount(uint64_t const &qword) {
    uint64_t tzc;
    asm("tzcnt %0, %1" : "=r" (tzc) : "r" (qword) );
    return tzc;
};
uint64_t count3 (const void *data, uint64_t const &nBytes) {
    uint64_t count = 0;
    uint64_t block;
    do {
        block = *(uint64_t*)(data+count);
        if ( block != (uint64_t)-1 ) {
            count += tzcount(~block);
            goto done;
        };
        count += sizeof(block);
    } while ( count < nBytes );
done:
    return (count>nBytes ? nBytes : count);
}
uint32_t N = 1<<20;

int main(int argc, char **argv) {
    unsigned char a[N];
    __builtin_memset(a,0xFF,N);
    uint64_t n = 0, j;
    for (uint64_t i=0; i < ((uint64_t)1<<18); i++) {
        n += count2(a,N);
    }
    printf("\n\n %x %lx %x\n", N, n, 0);
    return n;
}
Now that you've posted the full code: the call to count2(a,N) is hoisted out of the loop in main. The run time still increases very slightly with the loop count (e.g. 1<<18), but all that loop is doing is a single add. The compiler optimizes it to look more like this source:

uint64_t hoisted_count = count2(a,N);
for (uint64_t i=0; i < ((uint64_t)1<<18); i++) {
    n += hoisted_count;   // doesn't optimize to a multiply
}
There is no register conflict: %rax holds the result of the asm statement inlined from count2. It's then used as a source operand in the tiny loop that, in effect, multiplies it by the iteration count through repeated addition into n.
(See the asm on the Godbolt Compiler Explorer, and note all the compiler warnings about arithmetic on void*s; clang refuses to compile your code):
## the for() loop in main, when using count2()
.L23:
addq %rax, %r12
subq $1, %rdx
jne .L23
%rdx is the loop counter here, and %r12 is the accumulator that holds n. IDK why gcc doesn't optimize it to a constant-time multiply.
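(For reference, the strength-reduced form gcc could have emitted corresponds to this one-liner at the source level; just a sketch, reusing the hoisted_count from the snippet above:)

    // constant-time equivalent of the repeated-add loop above
    n += hoisted_count * ((uint64_t)1 << 18);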
Presumably the version that was 260k times slower didn't manage to hoist the whole count2 out of the loop. From gcc's perspective, the inline-asm version is much simpler: the asm statement is treated as a pure function of its inputs, and gcc doesn't even know anything about it touching memory. The C version touches a bunch of memory, and is much harder to prove hoistable.
Using a "memory" clobber in the asm statement did prevent it from being hoisted when I checked on Godbolt. You can tell whether hoisting happened from the presence or absence of a branch target in main before the vector block.
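For example, here's a minimal sketch of the OP's count2 with a blanket "memory" clobber added (simplified: no unroll and no tzcnt of the final byte position, just enough to show where the clobber goes; compile with -masm=intel like the rest of the code here):

#include <cstdint>

// Same search-loop idea as count2(), but with "memory" in the clobber list
// so gcc must assume the asm reads/writes memory and can't hoist it out of
// the caller's loop. Returns the offset (a multiple of 8) of the first
// qword that isn't all-ones, clamped to nBytes.
uint64_t count2_mem(const void *data, uint64_t const &nBytes) {
    uint64_t count;
    __asm__(" xor %[count], %[count]               \n"
            ".Lloop_%=:                            \n"
            " cmp QWORD PTR [%[data]+%[count]], -1 \n"
            " jne .Ldone_%=                        \n"
            " add %[count], 8                      \n"
            " cmp %[count], %[nBytes]              \n"
            " jb .Lloop_%=                         \n"
            ".Ldone_%=:                            \n"
            " cmp %[count], %[nBytes]              \n"
            " cmova %[count], %[nBytes]            "
            : [count] "=&r" (count)                 // early-clobber: written before inputs are read
            : [nBytes] "r" (nBytes), [data] "r" (data)
            : "memory", "cc");                      // "memory" is what blocks the hoisting
    return count;
}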
But anyway, the run time will be something like n + rep_count vs. n * rep_count.
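If the goal is actually to time the call, one standard way to keep it inside the loop (my sketch, the same idea as the DoNotOptimize helpers in benchmark libraries, not something from the question) is to launder an input through an empty asm each iteration so it's no longer loop-invariant:

    // The empty asm makes gcc treat 'p' as possibly changed on every
    // iteration, so the (inlined) count2 can't be CSEd/hoisted.
    const void *p = a;
    uint64_t n = 0;
    for (uint64_t i = 0; i < ((uint64_t)1<<15); i++) {
        asm volatile("" : "+r"(p));   // opaque to the optimizer
        n += count2(p, N);
    }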
The asm statement doesn't use a "memory" clobber or any memory inputs to tell gcc that it reads the memory pointed to by the input pointers. Incorrect optimizations could happen, e.g. the asm being hoisted out of a loop that modified array elements. (See the Clobbers section in the manual for an example of using a dummy anonymous struct memory input instead of a blanket "memory" clobber. Unfortunately I don't think that's usable when the block of memory doesn't have a compile-time-constant size.)
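To illustrate the dummy-operand idiom (my sketch; BLOCK_LEN is an assumed compile-time constant, which is exactly the limitation mentioned above; compile with -masm=intel):

#include <cstdint>

// The "m" input names exactly which bytes the asm reads, instead of a
// blanket "memory" clobber.
constexpr uint64_t BLOCK_LEN = 1 << 20;
struct Block { char x[BLOCK_LEN]; };           // describes the bytes the asm reads

uint64_t count_ff_qwords(const void *data) {
    uint64_t count;
    __asm__(" xor %[count], %[count]               \n"
            ".Lloop_%=:                            \n"
            " cmp QWORD PTR [%[data]+%[count]], -1 \n"
            " jne .Ldone_%=                        \n"
            " add %[count], 8                      \n"
            " cmp %[count], %[len]                 \n"
            " jb .Lloop_%=                         \n"
            ".Ldone_%=:                            "
            : [count] "=&r" (count)
            : [data] "r" (data), [len] "r" (BLOCK_LEN),
              "m" (*(const Block *) data)          // dummy memory input
            : "cc");
    return count;                                  // offset of first non-all-ones qword, or BLOCK_LEN
}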
I think -fno-inline prevents hoisting because the function isn't marked with __attribute__((const)) or the slightly weaker __attribute__((pure)) to indicate no side effects. After inlining, the optimizer can see that for the asm statement.
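For reference, a sketch of what that annotation would look like on the OP's function (my illustration, not code from the question; pure is the right choice here since the asm reads memory):

#include <cstdint>

// Telling gcc the out-of-line function only reads its arguments and memory
// and has no other side effects, so identical calls can be CSEd/hoisted
// even with -fno-inline.
__attribute__((pure))
uint64_t count2(const void *data, uint64_t const &nBytes);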
count0 doesn't get optimized to anything good because gcc and clang can't auto-vectorize loops where the number of iterations isn't known at the start. i.e. they suck at stuff like strlen or memchr, or search loops in general, even if they're told that it's safe to access memory beyond the point where the search loop exits early (e.g. using char buf[static 512] as a function arg).
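One common workaround (my sketch, not code from the question) is to test fixed-size chunks with a known-trip-count inner loop, which the vectorizer can handle, and only locate the exact byte once a chunk fails:

#include <cstdint>

// Chunked rewrite of count0(): the inner AND-reduction has a fixed trip
// count, so gcc/clang at least have a chance to auto-vectorize it; the
// exact byte is only decoded after a chunk fails the all-0xFF test.
// CHUNK = 32 is an arbitrary choice for this illustration.
uint64_t count0_chunked(const uint8_t *data, uint64_t nBytes) {
    constexpr uint64_t CHUNK = 32;
    uint64_t i = 0;
    for (; i + CHUNK <= nBytes; i += CHUNK) {
        uint8_t all_ff = 0xFF;
        for (uint64_t j = 0; j < CHUNK; j++)      // fixed-count reduction
            all_ff &= data[i + j];
        if (all_ff != 0xFF) {                     // some byte here isn't 0xFF
            for (uint64_t j = 0; j < CHUNK; j++)
                if (data[i + j] != 0xFF) return i + j;
        }
    }
    for (; i < nBytes; i++)                       // scalar tail
        if (data[i] != 0xFF) return i;
    return nBytes;
}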
Like I commented on the question, using xor reg, 0xFFFF / jnz is silly compared to cmp reg, 0xFFFF / jnz, because cmp/jcc can macro-fuse into a compare-and-branch uop. cmp reg, mem / jne can also macro-fuse, so the scalar version that does a load/xor/branch is using 3x as many uops per compare. (Of course, Sandybridge can only micro-fuse the load if it doesn't use an indexed addressing mode. Also, SnB can only macro-fuse one pair per decode block, but you'd probably get the first cmp/jcc and the loop branch to macro-fuse.) Anyway, the xor is a bad idea. It's better to only xor right before the tzcnt, since saving uops in the loop is more important than code-size or total uops.
Your scalar loop is 9 fused-domain uops, which is one too many to issue at one iteration per 2 clocks. (SnB is a 4-wide pipeline, and for tiny loops it can actually sustain that.)
The indenting in the code in the first version of the question, with the count += __builtin_ctz at the same level as the if, made me think you were counting mismatched blocks, rather than just finding the first.
Unfortunately the asm code I wrote for the first version of this answer doesn't solve the same problem as the OP's updated and clearer code. See an old version of this answer for SSE2 asm that counts 0xFF bytes using pcmpeqb/paddb, and psadbw for the horizontal sum to avoid wraparound.
Branching on the result of a pcmpeq takes many more uops than branching on a cmp. If our search array is big, we can use a loop that tests multiple vectors at once, and then figure out which byte had our hit after breaking out of the loop.
This optimization applies to AVX2 as well.
Here's my attempt, using GNU C inline asm with -masm=intel syntax. (Intrinsics might give better results, especially when inlining, because the compiler understands intrinsics and so can do constant-propagation through them, and stuff like that. OTOH, you can often beat the compiler with hand-written asm if you understand the trade-offs and the microarchitecture you're targeting, or if you can safely make some assumptions that you can't easily communicate to the compiler.)
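For comparison, an intrinsics version of the same search idea might look roughly like this (my sketch, not equivalent to the asm below; assumes len is a multiple of 16, with a scalar tail left out):

#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// SSE2 intrinsics: compare 16 bytes at a time against all-ones and bail
// out on the first vector that isn't entirely 0xFF.
size_t find_first_non_ff(const uint8_t *buf, size_t len) {
    const __m128i ones = _mm_set1_epi8((char)0xFF);
    for (size_t i = 0; i < len; i += 16) {
        __m128i v = _mm_loadu_si128((const __m128i *)(buf + i));
        unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, ones));
        if (mask != 0xFFFFu)                      // some byte differed
            return i + __builtin_ctz(~mask);      // first non-0xFF byte
    }
    return len;
}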
#include <stdint.h>
#include <immintrin.h>
// compile with -masm=intel
// len must be a multiple of 32 (TODO: cleanup loop)
// buf should be 16B-aligned for best performance
size_t find_first_zero_bit_avx1(const char *bitmap, size_t len) {
// return size_t not uint64_t. This same code works in 32bit mode, and in the x32 ABI where pointers are 32bit
__m128i pattern, vtmp1, vtmp2;
const char *result_pos;
int tmpi;
const char *bitmap_start = bitmap;
asm ( // modifies the bitmap pointer, but we're inside a wrapper function
"vpcmpeqw %[pat], %[pat],%[pat]\n\t" // all-ones
".p2align 4\n\t" // force 16B loop alignment, for the benefit of CPUs without a loop buffer
//IACA_START // See the godbolt link for the macro definition
".Lcount_loop%=:\n\t"
// " movdqu %[v1], [ %[p] ]\n\t"
// " pcmpeqb %[v1], %[pat]\n\t" // for AVX: fold the load into vpcmpeqb, making sure to still use a one-register addressing mode so it can micro-fuse
// " movdqu %[v2], [ %[p] + 16 ]\n\t"
// " pcmpeqb %[v2], %[pat]\n\t"
" vpcmpeqb %[v1], %[pat], [ %[p] ]\n\t" // Actually use AVX, to get a big speedup over the OP's scalar code on his SnB CPU
" vpcmpeqb %[v2], %[pat], [ %[p] + 16 ]\n\t"
" vpand %[v2], %[v2], %[v1]\n\t" // combine the two results from this iteration
" vpmovmskb %k[result], %[v2]\n\t"
" cmp %k[result], 0xFFFF\n\t" // k modifier: eax instead of rax
" jne .Lfound%=\n\t"
" add %[p], 32\n\t"
" cmp %[p], %[endp]\n\t" // this is only 2 uops after the previous cmp/jcc. We could re-arrange the loop and put the branches farther apart if needed. (e.g. start with a vpcmpeqb outside the loop, so each iteration actually sets up for the next)
" jb .Lcount_loop%=\n\t"
//IACA_END
// any necessary code for the not-found case, e.g. bitmap = endp
" mov %[result], %[endp]\n\t"
" jmp .Lend%=\n\t"
".Lfound%=:\n\t" // we have to figure out which vector the first non-match was in, based on v1 and (v2&v1)
// We could just search the bytes over again, but we don't have to.
// we could also check v1 first and branch, instead of checking both and using a branchless check.
" xor %k[result], 0xFFFF\n\t"
" tzcnt %k[result], %k[result]\n\t" // runs as bsf on older CPUs: same result for non-zero inputs, but different flags. Faster than bsf on AMD
" add %k[result], 16\n\t" // result = byte count in case v1 is all-ones. In that case, v2&v1 = v2
" vpmovmskb %k[tmp], %[v1]\n\t"
" xor %k[tmp], 0xFFFF\n\t"
" bsf %k[tmp], %k[tmp]\n\t" // bsf sets ZF if its *input* was zero. tzcnt's flag results are based on its output. For AMD, it would be faster to use more insns (or a branchy strategy) and avoid bsf, but Intel has fast bsf.
" cmovnz %k[result], %k[tmp]\n\t" // if there was a non-match in v1, use it instead of tzcnt(v2)+16
" add %[result], %[p]\n\t" // If we needed to force 64bit, we could use %q[p]. But size_t should be 32bit in the x32 ABI, where pointers are 32bit. This is one advantage to using size_t over uint64_t
".Lend%=:\n\t"
: [result] "=&a" (result_pos), // force compiler to pick eax/rax to save a couple bytes of code-size from the special cmp eax, imm32 and xor eax,imm32 encodings
[p] "+&r" (bitmap),
// throw-away outputs to let the compiler allocate registers. All early-clobbered so they aren't put in the same reg as an input
[tmp] "=&r" (tmpi),
[pat] "=&x" (pattern),
[v1] "=&x" (vtmp1), [v2] "=&x" (vtmp2)
: [endp] "r" (bitmap+len)
// doesn't compile: len isn't a compile-time constant
// , "m" ( ({ struct { char x[len]; } *dummy = (typeof(dummy))bitmap ; *dummy; }) ) // tell the compiler *which* memory is an input.
: "memory" // we read from data pointed to by bitmap, but bitmap[0..len] isn't an input, only the pointer.
);
return result_pos - bitmap_start;
}
This actually compiles and assembles to asm that looks like what I expected, but I didn't test it. Note that it leaves all register allocation to the compiler, so it's more inlining-friendly. Even without inlining, it doesn't force use of a call-preserved register that has to get saved/restored (e.g. your use of a "b" constraint).
Not done: scalar code to handle the last sub-32B chunk of data.
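A possible cleanup sketch (my addition, not part of the answer's asm): run the vector loop over the 32B-aligned portion and finish the last len % 32 bytes with a plain scalar loop:

#include <cstddef>

// Scalar tail for the final len % 32 bytes, starting where the vector
// loop stopped.
static size_t scalar_tail(const unsigned char *buf, size_t start, size_t len) {
    for (size_t i = start; i < len; i++)
        if (buf[i] != 0xFF) return i;   // first non-0xFF byte in the tail
    return len;                         // tail was all 0xFF
}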
Static perf analysis for Intel SnB-family CPUs, based on Agner Fog's guides / tables. See also the x86 tag wiki. I'm assuming we're not bottlenecked on cache throughput, so this analysis only applies when the data is hot in L2 cache, or maybe only L1 cache is fast enough.
This loop can issue out of the front-end at one iteration (two vectors) per 2 clocks, because it's 7 fused-domain uops. (The front-end issues in groups of 4). (It's probably actually 8 uops, if the two cmp/jcc pairs are decoded in the same block. Haswell and later can do two macro-fusions per decode group, but previous CPUs can only macro-fuse the first. We could software-pipeline the loop so the early-out branch is farther from the p < endp branch.)
All of these fused-domain uops include an ALU uop, so the bottleneck will be on ALU execution ports. Haswell added a 4th ALU unit that can handle simple non-vector ops, including branches, so it could run this loop at one iteration per 2 clocks (16B per clock). Your i5-2550k (mentioned in comments) is a SnB CPU.
I used IACA to count uops per port, since it's time-consuming to do it by hand. IACA is dumb and thinks there's some kind of inter-iteration dependency other than the loop counter, so I had to use -no_interiteration:
g++ -masm=intel -Wall -Wextra -O3 -mtune=haswell find-first-zero-bit.cpp -c -DIACA_MARKS
iaca -64 -arch IVB -no_interiteration find-first-zero-bit.o
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - find-first-zero-bit.o
Binary Format - 64Bit
Architecture - SNB
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 2.50 Cycles Throughput Bottleneck: Port1, Port5
Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 |
-------------------------------------------------------------------------
| Cycles | 2.0 0.0 | 2.5 | 1.0 1.0 | 1.0 1.0 | 0.0 | 2.5 |
-------------------------------------------------------------------------
N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected
! - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | |
---------------------------------------------------------------------
| 2^ | | 1.0 | 1.0 1.0 | | | | CP | vpcmpeqb xmm1, xmm0, xmmword ptr [rdx]
| 2^ | | 0.6 | | 1.0 1.0 | | 0.4 | CP | vpcmpeqb xmm2, xmm0, xmmword ptr [rdx+0x10]
| 1 | 0.9 | 0.1 | | | | 0.1 | CP | vpand xmm2, xmm2, xmm1
| 1 | 1.0 | | | | | | | vpmovmskb eax, xmm2
| 1 | | | | | | 1.0 | CP | cmp eax, 0xffff
| 0F | | | | | | | | jnz 0x18
| 1 | 0.1 | 0.9 | | | | | CP | add rdx, 0x20
| 1 | | | | | | 1.0 | CP | cmp rdx, rsi
| 0F | | | | | | | | jb 0xffffffffffffffe1
On SnB: pcmpeqb can run on p1/p5. Fused compare-and-branch can only run on p5. Non-fused cmp can run on p015. Anyway, if one of the branches doesn't macro-fuse, the loop can run at one iteration per 8/3 = 2.666 cycles. With macro-fusion, the best case is 7/3 = 2.333 cycles. (IACA doesn't try to simulate distribution of uops to ports exactly the way the hardware would dynamically make those decisions. However, we can't expect perfect scheduling from the hardware either, so 2 vectors per 2.5 cycles is probably reasonable with both macro-fusions happening. Uops that could have used port 0 will sometimes steal port 1 or port 5, reducing throughput.)
As I said before, Haswell handles this loop better. IACA thinks HSW could run the loop at one iteration per 1.75c, but that's clearly wrong because the taken loop-branch ends the issue group. It will issue in a repeating 4,3 uop pattern. But the execution units can handle more throughput than the frontend for this loop, so it should really be able to keep up with the frontend on Haswell/Broadwell/Skylake and run at one iteration per 2 clocks.
Further unrolling of more vpcmpeqb / vpand is only 2 uops per vector (or 3 without AVX, where we'd load into a scratch register and then use that as the destination for pcmpeqb). So with sufficient unrolling, we should be able to do 2 vector loads per clock. Without AVX, this wouldn't be possible without the PAND trick, since a vector load/compare/movmsk/test-and-branch is 4 uops. Bigger unrolls make more work to decode the final position where we found a match: a scalar cmp-based cleanup loop might be a good idea once we're in the area. You could maybe use the same scalar loop for cleanup of non-multiple-of-32B sizes.
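In intrinsics form, that bigger unroll plus a scalar re-scan of the failing block might look something like this (my sketch, assuming len is a multiple of 64):

#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// Test four 16B vectors per iteration, AND the compare results together,
// and only re-scan the 64B block with a scalar loop once we know it
// contains a non-0xFF byte.
size_t find_first_non_ff_unrolled(const uint8_t *buf, size_t len) {
    const __m128i ones = _mm_set1_epi8((char)0xFF);
    for (size_t i = 0; i < len; i += 64) {
        __m128i c0 = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(buf + i)),      ones);
        __m128i c1 = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(buf + i + 16)), ones);
        __m128i c2 = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(buf + i + 32)), ones);
        __m128i c3 = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(buf + i + 48)), ones);
        __m128i all = _mm_and_si128(_mm_and_si128(c0, c1), _mm_and_si128(c2, c3));
        if (_mm_movemask_epi8(all) != 0xFFFF) {   // some byte in this 64B block differed
            for (size_t j = i; j < i + 64; j++)   // cheap scalar cleanup to find it
                if (buf[j] != 0xFF) return j;
        }
    }
    return len;
}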
If using SSE, with movdqu / pcmpeqb xmm,xmm, we can use an indexed addressing mode without it costing us uops, because a movdqu load is always a single load uop regardless of addressing mode. (It doesn't need to micro-fuse with anything, unlike a store.) This lets us save a uop of loop overhead by using a base pointer pointing to the end of the array and a negative index counting up towards zero, e.g. add %[idx], 32 / js to loop while the index is negative.
With AVX, however, we can save 2 uops by using a single-register addressing mode so vpcmpeqb %[v1], %[pat], [ %[p] + 16 ] can micro-fuse. This means we need the add/cmp/jcc loop structure I used in the example. The same applies to AVX2.
So I think I found the problem. One of the registers used in my inline assembly, despite the clobber list, was conflicting with g++'s use of it, and was corrupting the test iteration. I fed the g++ version of the code back in as inline assembly and got the same 260,000x "acceleration" as my own. Also, in retrospect, the "accelerated" computation time was absurdly short.
Finally, I was so focused on the code embodied as a function that I failed to notice that g++ had, in fact, inlined the function (I was using -O3 optimization) into the test for-loop as well. When I forced g++ to not inline it (i.e. -fno-inline), the 260,000x acceleration disappeared.
I think g++ failed to take into account the inline assembly code's "clobber list" when it inlined the entire function without my permission.
Lesson learned. I need to do a better job with inline assembly constraints, or block inlining of the function with __attribute__ ((noinline)).
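For example, the noinline option is just a declaration attribute (a sketch; getting the constraints right is what actually removes the hazard):

#include <cstdint>

// Keep the benchmarked function out of the caller entirely, so its inline
// asm can't interact with the registers of the test loop.
__attribute__((noinline))
uint64_t count2(const void *data, uint64_t const &nBytes);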
EDIT: Definitely found that g++ is using rax for the main() for-loop counter, in conflict with my use of rax.