This is my simple blitting function:
static void blit8(unsigned char* dest, unsigned char* src)
{
    byte i;
    for (i = 0; i < 8; ++i) {
        if (*src != 0) {
            *dest = *src;
        }
        ++dest;
        ++src;
    }
}
I'm already at -O3, and blit8 is being inlined. restrict (gcc) has no effect here, nor does incrementing the pointers in any different way, using another number as transparency, or using another type for i... I even tried passing a 1-byte bitmask and checking that instead of dereferencing src. Increasing the limit of i to, say, 16 seems to provide a very minor speedup (~4-6%), but I'm working with 8-byte, not 16-byte chunks.
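(For reference, the bitmask variant I tried looked roughly like the sketch below; the parameter name and bit order are only illustrative.)
/* Rough reconstruction of the 1-byte-bitmask idea: bit i of opaque_mask
 * set means "pixel i is opaque, copy it". Names and bit order are illustrative. */
static void blit8_mask(unsigned char* dest, const unsigned char* src,
                       unsigned char opaque_mask)
{
    unsigned char i;
    for (i = 0; i < 8; ++i) {
        if (opaque_mask & (1u << i)) {
            dest[i] = src[i];
        }
    }
}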
My bottleneck? No clue, actually. I don't think it's the cache line, since my miss rate is low(?) and 64 (my cache-line size) has no special significance when I change things around. But I don't think it's memory speed either, since memcpy is faster (more on that in a bit).
cg_annotate says this about blit8 (without inlining):
Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw file:function
3,747,585,536 62 1 1,252,173,824 2,097,653 0 674,067,968 0 0 ppu.c:blit8.constprop.0
Regular cachegrind output (with inlining):
I refs: 6,446,979,546
I1 misses: 184,752
LLi misses: 22,549
I1 miss rate: 0.00%
LLi miss rate: 0.00%
D refs: 2,150,502,425 (1,497,875,135 rd + 652,627,290 wr)
D1 misses: 17,121,968 ( 2,761,307 rd + 14,360,661 wr)
LLd misses: 253,685 ( 70,802 rd + 182,883 wr)
D1 miss rate: 0.8% ( 0.2% + 2.2% )
LLd miss rate: 0.0% ( 0.0% + 0.0% )
LL refs: 17,306,720 ( 2,946,059 rd + 14,360,661 wr)
LL misses: 276,234 ( 93,351 rd + 182,883 wr)
LL miss rate: 0.0% ( 0.0% + 0.0% )
0.8% D1 miss rate? Sounds quite low to me.
The most interesting thing to me, though, is that removing the 0-check (which makes the function functionally identical to memcpy) provides a <1% speedup, even though memcpy itself is ~25% faster. I want to get as close to the speed of raw memcpy as I can, while preserving color 0 as transparent.
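(For clarity, the two things I'm comparing against look roughly like this; the helper names are only for illustration.)
#include <string.h>

/* 0-check removed: functionally identical to memcpy, yet only <1% faster
 * than the checked version for me. */
static void blit8_nocheck(unsigned char* dest, const unsigned char* src)
{
    unsigned char i;
    for (i = 0; i < 8; ++i) {
        dest[i] = src[i];
    }
}

/* Plain memcpy of the same 8-byte chunk: ~25% faster than blit8. */
static void blit8_memcpy(unsigned char* dest, const unsigned char* src)
{
    memcpy(dest, src, 8);
}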
The problem is that, as far as I know, no vector instructions support conditionals, but I need one to preserve dest where src is 0. Is there anything [fast] that can act like OR, but at the byte level?
I read before that there was an extension or something to tell the CPU not to cache some data, but I can't find it again. My idea is to never read directly from src, only write from it to dest, and make sure it isn't cached; then just read from a bitmask to check for transparency. I just don't know how to actually do that. Is that even possible, let alone fast? I don't know that either, which is why I'm asking this question.
I would prefer tips on how to make this faster with just C, maybe some gcc extensions, but if x86 assembly is the only way, so be it. Helping me understand my actual bottleneck (since I'm confused by my results) would help too.
You didn't mention whether you use GCC, but let's assume you do. GCC is picky when it comes to conditions inside loops - that is why your example fails to vectorize.
So this code:
void blit8(unsigned char* dest, unsigned char* src)
{
    char i;
    for (i = 0; i < 8; ++i) {
        if (*src != 0) {
            *dest = *src;
        }
        ++dest;
        ++src;
    }
}
ends up as:
blit8:
        movzx eax, BYTE PTR [rsi]
        test al, al
        je .L5
        mov BYTE PTR [rdi], al
.L5:
        movzx eax, BYTE PTR [rsi+1]
        test al, al
        je .L6
        mov BYTE PTR [rdi+1], al
.L6:
        movzx eax, BYTE PTR [rsi+2]
        test al, al
        je .L7
        mov BYTE PTR [rdi+2], al
.L7:
        movzx eax, BYTE PTR [rsi+3]
        test al, al
        je .L8
        mov BYTE PTR [rdi+3], al
.L8:
        movzx eax, BYTE PTR [rsi+4]
        test al, al
        je .L9
        mov BYTE PTR [rdi+4], al
.L9:
        movzx eax, BYTE PTR [rsi+5]
        test al, al
        je .L10
        mov BYTE PTR [rdi+5], al
.L10:
        movzx eax, BYTE PTR [rsi+6]
        test al, al
        je .L11
        mov BYTE PTR [rdi+6], al
.L11:
        movzx eax, BYTE PTR [rsi+7]
        test al, al
        je .L37
        mov BYTE PTR [rdi+7], al
.L37:
        ret
It was unrolled by the compiler, but it still works on single bytes. There is one trick that works pretty often in such cases: instead of if (cond), use the ternary operator. That fixes one problem, but there is another: GCC refuses to vectorize a short block - 8 bytes in this example. So let's use another trick: do the computation on a bigger block but ignore part of it.
Here is my example:
void blit8(unsigned char* dest, unsigned char* src)
{
    int i;
    unsigned char temp_dest[16];
    unsigned char temp_src[16];
    for (i = 0; i < 8; ++i) temp_dest[i] = dest[i];
    for (i = 0; i < 8; ++i) temp_src[i] = src[i];
    for (i = 0; i < 16; ++i)
    {
        temp_dest[i] = (temp_src[i] != 0) ? temp_src[i] : temp_dest[i];
    }
    for (i = 0; i < 8; ++i) dest[i] = temp_dest[i];
}
and corresponding assembly:
blit8:
        mov rax, QWORD PTR [rdi]
        vpxor xmm0, xmm0, xmm0
        mov QWORD PTR [rsp-40], rax
        mov rax, QWORD PTR [rsi]
        mov QWORD PTR [rsp-24], rax
        vmovdqa xmm1, XMMWORD PTR [rsp-24]
        vpcmpeqb xmm0, xmm0, XMMWORD PTR [rsp-24]
        vpblendvb xmm0, xmm1, XMMWORD PTR [rsp-40], xmm0
        vmovq QWORD PTR [rdi], xmm0
        ret
NOTE: I didn't benchmark it - it just proves that SIMD code can be generated by using the proper coding rules and tricks ;)
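If you want to sanity-check the behaviour, a throwaway test along these lines should do (just an illustrative sketch that assumes the blit8 version above, not a benchmark):
#include <assert.h>
#include <string.h>

/* Illustrative sanity check for the blit8 above: bytes where src is 0 must
 * keep their old dest value, all other bytes must be overwritten with src. */
int main(void)
{
    unsigned char src[8]      = { 1, 0, 3, 0, 5, 0, 7, 0 };
    unsigned char dest[8]     = { 9, 9, 9, 9, 9, 9, 9, 9 };
    unsigned char expected[8] = { 1, 9, 3, 9, 5, 9, 7, 9 };

    blit8(dest, src);
    assert(memcmp(dest, expected, 8) == 0);
    return 0;
}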
If your compiler/architecture supports vector extensions (like clang and gcc) you can use something like:
//This may compile to awful code on x86_64 b/c mmx is slow (it's fine on arm64)
void blit8(void* dest, void* src){
    typedef __UINT8_TYPE__ u8x8 __attribute__ ((__vector_size__ (8), __may_alias__));
    u8x8 *dp = dest, d = *dp, *sp = src, s = *sp, cmp;
    cmp = s == (u8x8){0};
    d &= cmp;
    *dp = s|d;
}
//This may compile to better code on x86_64 - worse on arm64
void blit8v(void* dest, void* src){
    typedef __UINT8_TYPE__ u8x16 __attribute__ ((__vector_size__ (16), __may_alias__));
    typedef __UINT64_TYPE__ u64, u64x2 __attribute__ ((__vector_size__ (16), __may_alias__));
    u8x16 *dp = dest, d = *dp, *sp = src, s = *sp, cmp;
    cmp = s == (u8x16){0};
    d &= cmp;
    d |= s;
    *(u64*)dest = ((u64x2)d)[0];
}
//This one is fine on both arm and x86, but 16 bytes vs. 8
void blit16(void* dest, void* src){
    typedef __UINT8_TYPE__ u8x16 __attribute__ ((__vector_size__ (16), __may_alias__));
    u8x16 *dp = dest, *sp = src, d = *dp, s = *sp, cmp;
    cmp = s == (u8x16){0};
    *dp = s|(d & cmp);
}
Compiles on arm to:
blit8:
        ldr d1, [x1]
        ldr d2, [x0]
        cmeq v0.8b, v1.8b, #0
        and v0.8b, v0.8b, v2.8b
        orr v0.8b, v0.8b, v1.8b
        str d0, [x0]
        ret
blit16:
        ldr q1, [x1]
        ldr q2, [x0]
        cmeq v0.16b, v1.16b, #0
        and v0.16b, v0.16b, v2.16b
        orr v0.16b, v0.16b, v1.16b
        str q0, [x0]
        ret
on x86_64:
blit8v: # @blit8v
        movdqa xmm0, xmmword ptr [rsi]
        pxor xmm1, xmm1
        pcmpeqb xmm1, xmm0
        pand xmm1, xmmword ptr [rdi]
        por xmm1, xmm0
        movq qword ptr [rdi], xmm1
        ret
blit16: # @blit16
        movdqa xmm0, xmmword ptr [rsi]
        pxor xmm1, xmm1
        pcmpeqb xmm1, xmm0
        pand xmm1, xmmword ptr [rdi]
        por xmm1, xmm0
        movdqa xmmword ptr [rdi], xmm1
        ret
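As a usage sketch, the 8-byte variant would be called once per sprite row; the framebuffer pitch, sprite layout, and function name below are assumptions for illustration only:
/* Hypothetical caller: copy one 8x8 sprite into a row-major framebuffer,
 * one 8-byte row at a time. FB_PITCH is an assumed framebuffer stride. */
#define FB_PITCH 256

void draw_sprite8x8(unsigned char* fb, int x, int y, unsigned char* sprite)
{
    for (int row = 0; row < 8; ++row) {
        /* blit8 as defined above: copies src bytes, keeping dest where src is 0 */
        blit8(fb + (y + row) * FB_PITCH + x, sprite + row * 8);
    }
}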