I am trying to pack a __m256i variable with 32 chars from an array, selected by indices. Here is my code:
char array[]; // different array every time.
uint16_t offset[32]; // same offset reused many times
__m256i result = _mm256_set_epi8(
    array[offset[0]], array[offset[1]], array[offset[2]], array[offset[3]], array[offset[4]], array[offset[5]], array[offset[6]], array[offset[7]],
    array[offset[8]], array[offset[9]], array[offset[10]], array[offset[11]], array[offset[12]], array[offset[13]], array[offset[14]], array[offset[15]],
    array[offset[16]], array[offset[17]], array[offset[18]], array[offset[19]], array[offset[20]], array[offset[21]], array[offset[22]], array[offset[23]],
    array[offset[24]], array[offset[25]], array[offset[26]], array[offset[27]], array[offset[28]], array[offset[29]], array[offset[30]], array[offset[31]]);
This function will be called many times with the same offsets and different arrays, but according to my test it is not optimal. Is there any way to improve it?
Let's look first at solutions that work for a general offset that varies with every call (which will be a drop-in replacement for the existing function), and then afterwards we'll see if we can take advantage of the same offset array being used for several calls (while array always varies).
Well, one big problem is that gcc (old or new) just generates awful code for the current implementation of your function:
lea r10, [rsp+8]
and rsp, -32
push QWORD PTR [r10-8]
push rbp
mov rbp, rsp
push r15
push r14
push r13
push r12
push r10
push rbx
sub rsp, 40
movzx eax, WORD PTR [rsi+40]
movzx r14d, WORD PTR [rsi+60]
movzx r12d, WORD PTR [rsi+56]
movzx ecx, WORD PTR [rsi+44]
movzx r15d, WORD PTR [rsi+62]
movzx r13d, WORD PTR [rsi+58]
mov QWORD PTR [rbp-56], rax
movzx eax, WORD PTR [rsi+38]
movzx ebx, WORD PTR [rsi+54]
movzx r11d, WORD PTR [rsi+52]
movzx r10d, WORD PTR [rsi+50]
movzx r9d, WORD PTR [rsi+48]
movzx r8d, WORD PTR [rsi+46]
mov QWORD PTR [rbp-64], rax
movzx eax, WORD PTR [rsi+36]
movzx edx, WORD PTR [rsi+42]
mov QWORD PTR [rbp-72], rax
movzx eax, WORD PTR [rsi+34]
mov QWORD PTR [rbp-80], rax
movzx eax, WORD PTR [rsi+32]
mov QWORD PTR [rbp-88], rax
movzx eax, WORD PTR [rsi+30]
movzx r15d, BYTE PTR [rdi+r15]
mov QWORD PTR [rbp-96], rax
movzx eax, WORD PTR [rsi+28]
vmovd xmm2, r15d
vpinsrb xmm2, xmm2, BYTE PTR [rdi+r14], 1
mov QWORD PTR [rbp-104], rax
movzx eax, WORD PTR [rsi+26]
mov QWORD PTR [rbp-112], rax
movzx eax, WORD PTR [rsi+24]
mov QWORD PTR [rbp-120], rax
movzx eax, WORD PTR [rsi+22]
mov QWORD PTR [rbp-128], rax
movzx eax, WORD PTR [rsi+20]
mov QWORD PTR [rbp-136], rax
movzx eax, WORD PTR [rsi+18]
mov QWORD PTR [rbp-144], rax
movzx eax, WORD PTR [rsi+16]
mov QWORD PTR [rbp-152], rax
movzx eax, WORD PTR [rsi+14]
mov QWORD PTR [rbp-160], rax
movzx eax, WORD PTR [rsi+12]
mov QWORD PTR [rbp-168], rax
movzx eax, WORD PTR [rsi+10]
mov QWORD PTR [rbp-176], rax
movzx eax, WORD PTR [rsi+8]
mov QWORD PTR [rbp-184], rax
movzx eax, WORD PTR [rsi+6]
mov QWORD PTR [rbp-192], rax
movzx eax, WORD PTR [rsi+4]
mov QWORD PTR [rbp-200], rax
movzx eax, WORD PTR [rsi+2]
movzx esi, WORD PTR [rsi]
movzx r13d, BYTE PTR [rdi+r13]
movzx r8d, BYTE PTR [rdi+r8]
movzx edx, BYTE PTR [rdi+rdx]
movzx ebx, BYTE PTR [rdi+rbx]
movzx r10d, BYTE PTR [rdi+r10]
vmovd xmm7, r13d
vmovd xmm1, r8d
vpinsrb xmm1, xmm1, BYTE PTR [rdi+rcx], 1
mov rcx, QWORD PTR [rbp-56]
vmovd xmm5, edx
vmovd xmm3, ebx
mov rbx, QWORD PTR [rbp-72]
vmovd xmm6, r10d
vpinsrb xmm7, xmm7, BYTE PTR [rdi+r12], 1
vpinsrb xmm5, xmm5, BYTE PTR [rdi+rcx], 1
mov rcx, QWORD PTR [rbp-64]
vpinsrb xmm6, xmm6, BYTE PTR [rdi+r9], 1
vpinsrb xmm3, xmm3, BYTE PTR [rdi+r11], 1
vpunpcklwd xmm2, xmm2, xmm7
movzx edx, BYTE PTR [rdi+rcx]
mov rcx, QWORD PTR [rbp-80]
vpunpcklwd xmm1, xmm1, xmm5
vpunpcklwd xmm3, xmm3, xmm6
vmovd xmm0, edx
movzx edx, BYTE PTR [rdi+rcx]
mov rcx, QWORD PTR [rbp-96]
vpunpckldq xmm2, xmm2, xmm3
vpinsrb xmm0, xmm0, BYTE PTR [rdi+rbx], 1
mov rbx, QWORD PTR [rbp-88]
vmovd xmm4, edx
movzx edx, BYTE PTR [rdi+rcx]
mov rcx, QWORD PTR [rbp-112]
vpinsrb xmm4, xmm4, BYTE PTR [rdi+rbx], 1
mov rbx, QWORD PTR [rbp-104]
vpunpcklwd xmm0, xmm0, xmm4
vpunpckldq xmm0, xmm1, xmm0
vmovd xmm1, edx
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm1, xmm1, BYTE PTR [rdi+rbx], 1
mov rcx, QWORD PTR [rbp-128]
mov rbx, QWORD PTR [rbp-120]
vpunpcklqdq xmm0, xmm2, xmm0
vmovd xmm8, edx
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm8, xmm8, BYTE PTR [rdi+rbx], 1
mov rcx, QWORD PTR [rbp-144]
mov rbx, QWORD PTR [rbp-136]
vmovd xmm4, edx
vpunpcklwd xmm1, xmm1, xmm8
vpinsrb xmm4, xmm4, BYTE PTR [rdi+rbx], 1
movzx edx, BYTE PTR [rdi+rcx]
mov rbx, QWORD PTR [rbp-152]
mov rcx, QWORD PTR [rbp-160]
vmovd xmm7, edx
movzx eax, BYTE PTR [rdi+rax]
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm7, xmm7, BYTE PTR [rdi+rbx], 1
mov rcx, QWORD PTR [rbp-176]
mov rbx, QWORD PTR [rbp-168]
vmovd xmm5, eax
vmovd xmm2, edx
vpinsrb xmm5, xmm5, BYTE PTR [rdi+rsi], 1
vpunpcklwd xmm4, xmm4, xmm7
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm2, xmm2, BYTE PTR [rdi+rbx], 1
vpunpckldq xmm1, xmm1, xmm4
mov rbx, QWORD PTR [rbp-184]
mov rcx, QWORD PTR [rbp-192]
vmovd xmm6, edx
movzx edx, BYTE PTR [rdi+rcx]
vpinsrb xmm6, xmm6, BYTE PTR [rdi+rbx], 1
mov rbx, QWORD PTR [rbp-200]
vmovd xmm3, edx
vpunpcklwd xmm2, xmm2, xmm6
vpinsrb xmm3, xmm3, BYTE PTR [rdi+rbx], 1
add rsp, 40
vpunpcklwd xmm3, xmm3, xmm5
vpunpckldq xmm2, xmm2, xmm3
pop rbx
pop r10
vpunpcklqdq xmm1, xmm1, xmm2
pop r12
pop r13
vinserti128 ymm0, ymm0, xmm1, 0x1
pop r14
pop r15
pop rbp
lea rsp, [r10-8]
ret
Basically it's trying to do all the reads of offset up front, and it just runs out of registers, so it starts spilling a few and then goes on an orgy of spilling where it reads most of the 16-bit elements of offset and then immediately stores them (as 64-bit zero-extended values) onto the stack. Essentially it's copying most of the offset array (with zero extension to 64 bits) for no purpose: where it later reads the spilled values it could of course just have read from offset.

This same terrible code is evident in the old 4.9.2 version you're using as well as in the very recent 7.2.
Neither icc nor clang has any such issue - they both generate almost identical, quite reasonable code that just reads once from each offset position using movzx and then inserts the byte using vpinsrb with a memory source operand based on the offset just read:
gather256(char*, unsigned short*): # @gather256(char*, unsigned short*)
movzx eax, word ptr [rsi + 30]
movzx eax, byte ptr [rdi + rax]
vmovd xmm0, eax
movzx eax, word ptr [rsi + 28]
vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 1
movzx eax, word ptr [rsi + 26]
vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 2
movzx eax, word ptr [rsi + 24]
...
vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 14
movzx eax, word ptr [rsi]
vpinsrb xmm0, xmm0, byte ptr [rdi + rax], 15
movzx eax, word ptr [rsi + 62]
movzx eax, byte ptr [rdi + rax]
vmovd xmm1, eax
movzx eax, word ptr [rsi + 60]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 1
movzx eax, word ptr [rsi + 58]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 2
movzx eax, word ptr [rsi + 56]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 3
movzx eax, word ptr [rsi + 54]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 4
movzx eax, word ptr [rsi + 52]
...
movzx eax, word ptr [rsi + 32]
vpinsrb xmm1, xmm1, byte ptr [rdi + rax], 15
vinserti128 ymm0, ymm1, xmm0, 1
ret
Very nice. There is a small amount of additional overhead from vinserti128-ing two xmm vectors together, each with half of the result, apparently because vpinsrb can't write to the high 128 bits. It seems that on modern uarchs like the one you are using this would simultaneously bottleneck on the 2 read ports and on port 5 (shuffle) at 1 element per cycle. So this will probably have a throughput of about 1 per 32 cycles, and a latency close to 32 cycles (the main dependence chain is through the work-in-progress xmm register that is receiving the pinsrb, but the listed latency for the memory-source version of this instruction is only 1 cycle¹).
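To make that concrete, here is my own back-of-the-envelope port count (assuming each memory-source vpinsrb is 2 uops, one load plus one port-5 uop, per footnote 1):

    2 loads/element x 32 elements = 64 loads  -> 64 / 2 load ports = 32 cycles
    ~1 port-5 uop/element x 32, plus vmovd/vinserti128 -> ~33 cycles on port 5

so both bottlenecks land at roughly one element per cycle.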
Can we get close to this ~32-cycle performance on gcc? It seems so. Here's one approach:
#include <immintrin.h>
#include <stdint.h>

uint64_t gather64(char *array, uint16_t *offset) {
    uint64_t ret;
    char *p = (char *)&ret;   // gather 8 bytes into a 64-bit temporary on the stack
    p[0] = array[offset[0]];
    p[1] = array[offset[1]];
    p[2] = array[offset[2]];
    p[3] = array[offset[3]];
    p[4] = array[offset[4]];
    p[5] = array[offset[5]];
    p[6] = array[offset[6]];
    p[7] = array[offset[7]];
    return ret;
}

__m256i gather256_gcc(char *array, uint16_t *offset) {
    return _mm256_set_epi64x(
        gather64(array, offset),
        gather64(array + 8, offset + 8),
        gather64(array + 16, offset + 16),
        gather64(array + 24, offset + 24)
    );
}
Here we rely on a temporary array on the stack to gather 8 elements from array at a time, and then we use that as input to _mm256_set_epi64x. Overall this uses 2 loads and 1 store per byte element, plus a couple of extra instructions for every 64-bit element, so it should be close to 1 cycle per element throughput².
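For comparison with the vpinsrb version, here is a rough port-pressure count of my own (using the two-loads-plus-one-store-per-cycle figure from footnote 2):

    2 loads/byte x 32 bytes = 64 loads  -> 64 / 2 load ports = 32 cycles
    1 store/byte x 32 bytes = 32 stores -> 32 / 1 store port = 32 cycles

so again about 1 cycle per gathered byte, before the store-forwarding stalls discussed below.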
It generates the "expected" inlined code in gcc:
gather256_gcc(char*, unsigned short*):
lea r10, [rsp+8]
and rsp, -32
push QWORD PTR [r10-8]
push rbp
mov rbp, rsp
push r10
movzx eax, WORD PTR [rsi+48]
movzx eax, BYTE PTR [rdi+24+rax]
mov BYTE PTR [rbp-24], al
movzx eax, WORD PTR [rsi+50]
movzx eax, BYTE PTR [rdi+24+rax]
mov BYTE PTR [rbp-23], al
movzx eax, WORD PTR [rsi+52]
movzx eax, BYTE PTR [rdi+24+rax]
mov BYTE PTR [rbp-22], al
...
movzx eax, WORD PTR [rsi+62]
movzx eax, BYTE PTR [rdi+24+rax]
mov BYTE PTR [rbp-17], al
movzx eax, WORD PTR [rsi+32]
vmovq xmm0, QWORD PTR [rbp-24]
movzx eax, BYTE PTR [rdi+16+rax]
movzx edx, WORD PTR [rsi+16]
mov BYTE PTR [rbp-24], al
movzx eax, WORD PTR [rsi+34]
movzx edx, BYTE PTR [rdi+8+rdx]
movzx eax, BYTE PTR [rdi+16+rax]
mov BYTE PTR [rbp-23], al
...
movzx eax, WORD PTR [rsi+46]
movzx eax, BYTE PTR [rdi+16+rax]
mov BYTE PTR [rbp-17], al
mov rax, QWORD PTR [rbp-24]
mov BYTE PTR [rbp-24], dl
movzx edx, WORD PTR [rsi+18]
vpinsrq xmm0, xmm0, rax, 1
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-23], dl
movzx edx, WORD PTR [rsi+20]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-22], dl
movzx edx, WORD PTR [rsi+22]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-21], dl
movzx edx, WORD PTR [rsi+24]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-20], dl
movzx edx, WORD PTR [rsi+26]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-19], dl
movzx edx, WORD PTR [rsi+28]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-18], dl
movzx edx, WORD PTR [rsi+30]
movzx edx, BYTE PTR [rdi+8+rdx]
mov BYTE PTR [rbp-17], dl
movzx edx, WORD PTR [rsi]
vmovq xmm1, QWORD PTR [rbp-24]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-24], dl
movzx edx, WORD PTR [rsi+2]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-23], dl
movzx edx, WORD PTR [rsi+4]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-22], dl
...
movzx edx, WORD PTR [rsi+12]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-18], dl
movzx edx, WORD PTR [rsi+14]
movzx edx, BYTE PTR [rdi+rdx]
mov BYTE PTR [rbp-17], dl
vpinsrq xmm1, xmm1, QWORD PTR [rbp-24], 1
vinserti128 ymm0, ymm0, xmm1, 0x1
pop r10
pop rbp
lea rsp, [r10-8]
ret
This approach will suffer 4 (non-dependent) store-forwarding stalls when reading back the stack buffer, which will make the latency somewhat worse than 32 cycles, perhaps in the mid-40s (if you assume it's only the last stall that isn't hidden). You could also just remove the gather64 function and unroll the whole thing into a 32-byte buffer, with a single load at the end. That results in only one stall, and gets rid of the small overhead of loading each 64-bit value into the result one at a time, but the overall effect might be worse, since larger loads sometimes seem to suffer larger forwarding stalls.
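A minimal sketch of that single-buffer variant (untested; the gather256_buf name is mine, and element i of the result comes from array[offset[i]], i.e. the reverse of _mm256_set_epi8's argument order):

#include <immintrin.h>
#include <stdint.h>

__m256i gather256_buf(const char *array, const uint16_t *offset) {
    char buf[32];
    for (int i = 0; i < 32; i++)       // the compiler should fully unroll this
        buf[i] = array[offset[i]];     // scalar gather into the stack buffer
    return _mm256_loadu_si256((const __m256i *)buf);  // one (possibly stalling) 256-bit load
}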
I'm quite sure you can come up with approaches that are better. For example, you could just write out "long hand" in intrinsics the vpinsrb approach that clang and icc use. That's simple enough that gcc should get it right.
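Here's a rough, untested sketch of that idea (the gather128 / gather256_pinsrb names are mine; the insert index of _mm_insert_epi8 must be an immediate, hence the macro, and element i of the result comes from array[offset[i]]):

#include <immintrin.h>
#include <stdint.h>

// each step should compile to vpinsrb with a memory source, like the clang/icc output
#define INS(v, i) _mm_insert_epi8((v), array[offset[(i)]], (i))

static inline __m128i gather128(const char *array, const uint16_t *offset) {
    __m128i v = _mm_cvtsi32_si128((unsigned char)array[offset[0]]);
    v = INS(v, 1);  v = INS(v, 2);  v = INS(v, 3);  v = INS(v, 4);
    v = INS(v, 5);  v = INS(v, 6);  v = INS(v, 7);  v = INS(v, 8);
    v = INS(v, 9);  v = INS(v, 10); v = INS(v, 11); v = INS(v, 12);
    v = INS(v, 13); v = INS(v, 14); v = INS(v, 15);
    return v;
}

__m256i gather256_pinsrb(const char *array, const uint16_t *offset) {
    __m128i lo = gather128(array, offset);        // bytes 0..15
    __m128i hi = gather128(array, offset + 16);   // bytes 16..31
    return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}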
What about if the offset array is used repeatedly for several different array inputs?

We can look at pre-processing the offset array so that our core load loop can be faster.
One viable approach is to use vpgatherdd to efficiently load the elements without bottlenecking on port 5 for the shuffles. We can load each gather's entire index vector in a single 256-bit load as well. Unfortunately, the finest-grained vpgather variant is vpgatherdd, which loads 8 32-bit elements using 32-bit offsets. So we'll need 4 of these gathers to get all 32 byte elements, and then we need to blend the resulting vectors together somehow.

We can actually avoid most of the cost of combining the resulting vectors by interleaving and adjusting the offsets so that the "target" byte in each 32-bit value is already in its correct final position. So you end up with 4 256-bit vectors, each with 8 bytes that you want, in the correct positions, and 24 bytes you don't want. You can vpblendw two pairs of vectors together, and then vpblendvb those results together, for a total of 3 port-5 uops (there's got to be a better way to do this reduction?).
Adding it all together, I get something like:
- movups to load the 4 vpgatherdd index regs (can be hoisted)
- vpgatherdd x 4
- vpblendw x 2 (4 results -> 2)
- movups to load the vpblendvb mask (can be hoisted)
- vpblendvb (2 results -> 1)

Apart from the vpgatherdds it looks like about 9 uops, with 3 of them going to port 5, so 3 cycles bottlenecked on that port, or about 2.25 cycles if there is no bottleneck (because the vpgatherdd might not use port 5). On Broadwell, the vpgather family is much improved over Haswell, but still takes about 0.9 cycles per element for vpgatherdd, so that's about 29 cycles right there. So we are right back to where we started, around 32 cycles.
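In intrinsics, the core of that strategy might look something like the following untested sketch (the gather256_blend name, the precomputed idx[0..3] index vectors, the blend_mask and the particular blend immediates are all mine - the exact constants depend on how you interleave the offsets during pre-processing):

#include <immintrin.h>
#include <stdint.h>

// idx[k] holds 8 dword byte-offsets, pre-adjusted so that the wanted byte of
// each gathered dword already sits in its final byte position. Note each
// gather reads 4 bytes per element, so it can read a few bytes past the
// highest offset used in array.
__m256i gather256_blend(const char *array, const __m256i idx[4], __m256i blend_mask) {
    const int *base = (const int *)array;
    __m256i g0 = _mm256_i32gather_epi32(base, idx[0], 1);  // scale 1 = byte granularity
    __m256i g1 = _mm256_i32gather_epi32(base, idx[1], 1);
    __m256i g2 = _mm256_i32gather_epi32(base, idx[2], 1);
    __m256i g3 = _mm256_i32gather_epi32(base, idx[3], 1);

    __m256i a = _mm256_blend_epi16(g0, g1, 0xAA);  // vpblendw: 4 results -> 2
    __m256i b = _mm256_blend_epi16(g2, g3, 0xAA);
    return _mm256_blendv_epi8(a, b, blend_mask);   // vpblendvb: 2 results -> 1
}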
Still, there is some hope:

- Much of the blend and index-load work uses different resources than the gathers, so it may overlap with the vpgatherdd activity. Perhaps then the blending code is more or less free, and we are around 29 cycles (realistically, the movups will still be competing with the gather, however).

- vpgatherdd got a lot better again in Skylake, to about 0.6 cycles per element, so this strategy will start to help significantly when you upgrade your hardware to Skylake. (And the strategy may pull slightly farther ahead of vpinsrb with AVX512BW, where byte blends with a k-register mask are efficient, and vpgatherdd zmm per-element gather throughput is slightly higher than ymm (InstLatx64).)

- Perhaps many of the values in offset are duplicates, i.e., they select the same bytes of array. In that case, you could potentially reduce the number of gathers. For example, if only half of the elements in offset are unique, you can do only two gathers to collect 16 elements and then pshufb the register to duplicate elements as needed. The "reduction" has to be more general, but it doesn't actually seem more expensive (and could be cheaper) since pshufb is quite general and does most of the work.

Expanding on that last idea: you would dispatch at runtime to a routine that knows how to do 1, 2, 3 or 4 gathers depending on how many elements are needed. That is fairly quantized, but you could always dispatch in a more fine-grained way with scalar loads (or gathers with larger elements, which are faster) between those cutoff points. You'll hit diminishing returns pretty quickly.
You can even extend that to handling nearby elements - after all, you are grabbing 4 bytes to get a byte: so if any of those 3 wasted bytes is actually at another used offset value, then you get it nearly for free. Now, this needs an even more general reduction phase, but it still seems like pshufb will do the heavy lifting and most of the hard work is limited to the pre-processing.
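Since offset is reused across many calls, that pre-processing can be entirely scalar and done once up front. A hypothetical sketch of such a step (the offset_plan / build_plan names are mine): count the distinct offsets to decide how many gathers to dispatch to, and record for each output byte which gathered unique element it should be copied from - that per-byte table is essentially what you would later feed to pshufb (keeping its per-128-bit-lane behaviour in mind).

#include <stdint.h>

typedef struct {
    uint16_t unique[32];    // distinct offsets, in first-seen order
    uint8_t  dup_ctrl[32];  // dup_ctrl[i]: index into unique[] for output byte i
    int      n_unique;      // e.g. <= 8 -> 1 gather, <= 16 -> 2 gathers, ...
} offset_plan;

void build_plan(const uint16_t offset[32], offset_plan *plan) {
    plan->n_unique = 0;
    for (int i = 0; i < 32; i++) {
        int j = 0;
        while (j < plan->n_unique && plan->unique[j] != offset[i])
            j++;                                          // linear scan: runs only once per offset array
        if (j == plan->n_unique)
            plan->unique[plan->n_unique++] = offset[i];   // new distinct offset
        plan->dup_ctrl[i] = (uint8_t)j;
    }
}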
¹ This is one of a handful of SSE/AVX instructions where the memory-source form of the instruction is quite a bit more efficient than the reg-reg form: the reg-reg form needs 2 uops on port 5, which limits it to a throughput of 0.5 per cycle and gives it a latency of 2. Evidently the memory load path avoids one of the shuffles/blends that are needed on port 5. vpbroadcastd/q are like that too.
² With two loads and one store per cycle, this will be running very close to the ragged edge of the maximum theoretical performance: it's maxing out the L1 operation throughput, which often results in hiccups: for example, there may not be any spare cycles to accept incoming cache lines from L2.