I'm trying to find an optimized method for RGB8 (actually grayscale) to RGB32 image conversion.
The source is an 8-bit grey image; the destination should be a 32-bit grey image (BGRA) whose fourth channel (alpha) is to be ignored. The source address is not guaranteed to be 16-byte aligned, Count is a multiple of 16, and the destination address is 16-byte aligned.
Here is my optimized assembly code. Is there an even faster way to do the conversion?
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    // Each 32-bit mask entry replicates one source byte into B, G and R and
    // clears the alpha byte (0x80 makes pshufb write zero).
    static unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };
    __asm {
        mov esi, Source
        mov edi, Destination
        mov edx, Count
        xor ecx, ecx
        movdqa xmm4, xmmword ptr [Masks + 0]
        movdqa xmm5, xmmword ptr [Masks + 16]
        movdqa xmm6, xmmword ptr [Masks + 32]
        movdqa xmm7, xmmword ptr [Masks + 48]
    l1:
        movdqu xmm0, xmmword ptr [esi + ecx]    // 16 grey pixels (source may be unaligned)
        movdqa xmm1, xmm0
        movdqa xmm2, xmm0
        movdqa xmm3, xmm0
        pshufb xmm0, xmm4                       // expand pixels 0-3 to BGRA
        pshufb xmm1, xmm5                       // pixels 4-7
        pshufb xmm2, xmm6                       // pixels 8-11
        pshufb xmm3, xmm7                       // pixels 12-15
        movntdq [edi + 0], xmm0                 // non-temporal stores, destination is aligned
        movntdq [edi + 16], xmm1
        movntdq [edi + 32], xmm2
        movntdq [edi + 48], xmm3
        add edi, 64
        add ecx, 16
        cmp ecx, edx
        jb l1
    }
}
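One caveat worth noting: movntdq issues weakly-ordered non-temporal stores, so if the destination buffer is consumed right after the conversion it is customary to follow the loop with a store fence. A minimal sketch (the wrapper name is made up, not part of the original code):

#include <xmmintrin.h>   // _mm_sfence

// Hypothetical wrapper: run the conversion, then fence the weakly-ordered
// movntdq stores before anything else reads the destination buffer.
void ConvertGreyToRgb32Fenced(__m128i* Source, __m128i* Destination, unsigned int Count) {
    ConvertGreyToRgb32Assembler(Source, Destination, Count);
    _mm_sfence();
}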
There is another approach using several PUNPCKLBW and PUNPCKHBW instructions, but that seems to be slightly slower (see Update 5 below).
Update: This is the basic non-optimized algorithm:
BGRA* Destination = ...;
unsigned char* Source = ...;
for (unsigned int i = 0; i < Size; i++) {
    Destination[i].Blue = Source[i];
    Destination[i].Green = Source[i];
    Destination[i].Red = Source[i];
}
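The BGRA type itself isn't shown; a minimal definition consistent with this loop (field names as used above, byte order B, G, R, A in memory assumed) would be:

struct BGRA {
    unsigned char Blue;
    unsigned char Green;
    unsigned char Red;
    unsigned char Alpha;   // fourth channel, left untouched by the conversion
};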
PS: I also tried C code with the MS VS2008 SSE compiler intrinsics. It turned out that the compiler generated a lot of unnecessary memory moves, which caused the code to be 10-20% slower than pure assembly.
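One way to see what the compiler actually emits is to request an assembly listing with interleaved source, e.g. cl /c /O2 /FAs ConvertGrey.cpp (file name assumed), which writes a .asm listing next to the object file.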
Update 2: This is the same code using intrinsics only.
void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    static const unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };
    register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
    register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 4));
    register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 8));
    register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 12));
    for (unsigned int i = 0; i < Count / 16; i++) {
        // Unaligned load, since the source is not guaranteed to be 16-byte aligned.
        __m128i r0 = _mm_loadu_si128(Source + i);
        _mm_stream_si128(Destination + (i * 4) + 0, _mm_shuffle_epi8(r0, m0));
        _mm_stream_si128(Destination + (i * 4) + 1, _mm_shuffle_epi8(r0, m1));
        _mm_stream_si128(Destination + (i * 4) + 2, _mm_shuffle_epi8(r0, m2));
        _mm_stream_si128(Destination + (i * 4) + 3, _mm_shuffle_epi8(r0, m3));
    }
}
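For reference, a usage sketch for the routine above (the Example wrapper and the _aligned_malloc allocation are assumptions, not part of the original question); Count is the number of source bytes, and the destination needs Count * 4 bytes with 16-byte alignment:

#include <emmintrin.h>   // __m128i
#include <malloc.h>      // _aligned_malloc / _aligned_free (MSVC)

void Example(unsigned char* grey, unsigned int count /* multiple of 16 */) {
    // Destination must be 16-byte aligned and hold 4 bytes per source pixel.
    unsigned char* bgra = (unsigned char*) _aligned_malloc(count * 4, 16);
    ConvertGreyToRgb32Assembler((__m128i*) grey, (__m128i*) bgra, count);
    // ... use the BGRA buffer ...
    _aligned_free(bgra);
}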
Update 3: This is the compiler-generated code (beautified; Visual Studio 2012, all optimizations on):
push ebp
mov ebp, esp
mov edx, dword ptr [ebp+8]
movdqa xmm1, xmmword ptr ds:[Masks + 0]
movdqa xmm2, xmmword ptr ds:[Masks + 16]
movdqa xmm3, xmmword ptr ds:[Masks + 32]
movdqa xmm4, xmmword ptr ds:[Masks + 48]
push esi
test ecx, ecx
je l2
lea esi, [ecx-1]
shr esi, 4
inc esi
l1:
mov ecx, edx
movdqu xmm0, xmmword ptr [ecx]
mov ecx, eax
movdqa xmm5, xmm0
pshufb xmm5, xmm1
movdqa xmmword ptr [ecx], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm2
movdqa xmmword ptr [eax+10h], xmm5
movdqa xmm5, xmm0
pshufb xmm5, xmm3
movdqa xmmword ptr [eax+20h], xmm5
lea ecx, [eax+30h]
add edx, 10h
add eax, 40h
dec esi
pshufb xmm0, xmm4
movdqa xmmword ptr [ecx], xmm0
jne l1
l2:
pop esi
pop ebp
ret
It seems that interleaving movdqa with pshufb is somewhat faster.
Update 4: This seems to be the optimal hand-optimized code:
__asm {
    mov esi, Source
    mov edi, Destination
    mov ecx, Count
    movdqu xmm0, xmmword ptr [esi]
    movdqa xmm4, xmmword ptr [Masks + 0]
    movdqa xmm5, xmmword ptr [Masks + 16]
    movdqa xmm6, xmmword ptr [Masks + 32]
    movdqa xmm7, xmmword ptr [Masks + 48]
l1:
    dec ecx
    lea edi, [ edi + 64 ]
    lea esi, [ esi + 16 ]
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pshufb xmm0, xmm4
    movdqa [edi - 64], xmm0
    pshufb xmm1, xmm5
    movdqa [edi - 48], xmm1
    pshufb xmm2, xmm6
    movdqa [edi - 32], xmm2
    pshufb xmm3, xmm7
    movdqa [edi - 16], xmm3
    movdqu xmm0, xmmword ptr [esi]
    ja l1
}
Update 5: This conversion algorithm uses the punpck instructions. However, this routine is a bit slower than using masks and pshufb.
for (unsigned int i = 0; i < Count; i += 16) {
    // Unaligned load of 16 grey pixels; unpacking a register with itself
    // duplicates each byte, so after two rounds every grey byte appears four
    // times (here the ignored alpha byte also carries the grey value).
    register __m128i r0 = _mm_loadu_si128(Source++);
    register __m128i r1 = _mm_unpackhi_epi8(r0, r0);   // pixels 8-15 doubled
    register __m128i r2 = _mm_unpacklo_epi8(r0, r0);   // pixels 0-7 doubled
    register __m128i r3 = _mm_unpackhi_epi8(r1, r1);   // pixels 12-15 quadrupled
    register __m128i r4 = _mm_unpacklo_epi8(r1, r1);   // pixels 8-11 quadrupled
    register __m128i r5 = _mm_unpackhi_epi8(r2, r2);   // pixels 4-7 quadrupled
    register __m128i r6 = _mm_unpacklo_epi8(r2, r2);   // pixels 0-3 quadrupled
    _mm_store_si128(Destination++, r6);
    _mm_store_si128(Destination++, r5);
    _mm_store_si128(Destination++, r4);
    _mm_store_si128(Destination++, r3);
}
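The timing claims above can be checked with a small harness along these lines (a sketch only; MeasureSeconds and the repetition count are made up, each variant is assumed to be wrapped in a function with the signature used in the other snippets, and std::chrono needs VS2012 or newer):

#include <emmintrin.h>   // __m128i
#include <chrono>

typedef void (*ConvertFn)(__m128i* Source, __m128i* Destination, unsigned int Count);

// Time one conversion routine over a number of repetitions and return seconds.
double MeasureSeconds(ConvertFn convert, __m128i* src, __m128i* dst, unsigned int count, int reps) {
    std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
    for (int r = 0; r < reps; r++)
        convert(src, dst, count);
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double>(t1 - t0).count();
}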
Update 6: For the sake of completeness, this is the inverse method that converts a 32-bit image back to an 8-bit grey image.
static void ConvertRgb32ToGrey(const __m128i* Source, __m128i* Destination, unsigned int Count) {
    // Each mask picks byte 0 (the blue channel) of four BGRA pixels and places the
    // four bytes into a different 32-bit slot of the result; 0x80 zeroes the rest.
    static const unsigned char __declspec(align(64)) Masks[] = {
        0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c,
    };
    register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
    register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 16));
    register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 32));
    register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 48));
    for (unsigned int i = 0; i < Count / 64; i++) {
        __m128i a = _mm_load_si128(Source + (i * 4) + 0);
        __m128i b = _mm_load_si128(Source + (i * 4) + 1);
        __m128i c = _mm_load_si128(Source + (i * 4) + 2);
        __m128i d = _mm_load_si128(Source + (i * 4) + 3);
        a = _mm_shuffle_epi8(a, m0);    // grey bytes 0-3
        b = _mm_shuffle_epi8(b, m1);    // grey bytes 4-7
        c = _mm_shuffle_epi8(c, m2);    // grey bytes 8-11
        d = _mm_shuffle_epi8(d, m3);    // grey bytes 12-15
        // The shuffled results are disjoint, so ORing them yields 16 grey pixels.
        __m128i e = _mm_or_si128(a, b);
        __m128i f = _mm_or_si128(c, d);
        __m128i g = _mm_or_si128(e, f);
        _mm_stream_si128(Destination + i, g);
    }
}
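A quick sanity check for the pair of routines could look like this sketch (buffer names are made up; all buffers are assumed 16-byte aligned, and count is a multiple of 16 so that count * 4 is the number of BGRA bytes expected by ConvertRgb32ToGrey):

#include <emmintrin.h>   // __m128i
#include <string.h>      // memcmp

// Hypothetical round-trip test: grey -> BGRA -> grey should reproduce the input.
bool RoundTripOk(unsigned char* grey, unsigned char* bgra, unsigned char* back, unsigned int count) {
    ConvertGreyToRgb32Assembler((__m128i*) grey, (__m128i*) bgra, count);
    ConvertRgb32ToGrey((const __m128i*) bgra, (__m128i*) back, count * 4);
    return memcmp(grey, back, count) == 0;
}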
Would try:
__asm {
    mov esi, Source
    mov edi, Destination
    mov ecx, Count
    movdqu xmm0, xmmword ptr [esi]
    movdqa xmm4, xmmword ptr [Masks + 0]
    movdqa xmm5, xmmword ptr [Masks + 16]
    movdqa xmm6, xmmword ptr [Masks + 32]
    movdqa xmm7, xmmword ptr [Masks + 48]
l1:
    dec ecx                 // modern Intel can macro-fuse this with jnz if adjacent
    lea edi, [ edi + 64 ]
    lea esi, [ esi + 16 ]
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    pshufb xmm2, xmm6
    pshufb xmm3, xmm7
    movntdq [edi - 64], xmm0
    movntdq [edi - 48], xmm1
    movntdq [edi - 32], xmm2
    movntdq [edi - 16], xmm3
    movdqu xmm0, xmmword ptr [esi]
    jnz l1
}
Haven't benchmarked it though; assumptions behind these changes:
- The movdqu xmm0, ... latency can be a little more hidden within the loop (your code has the load of xmm0 followed directly by an instruction using the value in that register).
- The add ops on two regs, as well as the cmp, aren't really all necessary; address generation (lea) and the implicit zero test by dec/jnz can be used. That way, there'll be no EFLAGS dependencies caused by operations on ecx/esi/edi, as the only ALU op in the loop is decrementing the loop counter.
- In the end, this is likely load/store bound in any case, so the arithmetic is "free game"; I therefore expect little difference, even with the arguments as given.
If the input is large, it would make sense to strip off the "unaligned head/tail", i.e. to handle the first/last [0..15] bytes with a Duff's device, and do the main loop using movdqa (see the sketch below).
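A minimal sketch of that idea with intrinsics, under stated assumptions (the function name is made up, a plain scalar loop stands in for the Duff's device, and the middle loop uses unaligned stores because peeling the source head keeps the destination 16-byte aligned only when the head is a multiple of 4 pixels):

#include <stddef.h>      // size_t
#include <tmmintrin.h>   // _mm_shuffle_epi8 (SSSE3)

static void ConvertGreyToRgb32Peeled(const unsigned char* Source, unsigned char* Destination, unsigned int Count) {
    // Same masks as in the question's intrinsics version.
    static const unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };
    __m128i m0 = _mm_load_si128((const __m128i*) (Masks + 0));
    __m128i m1 = _mm_load_si128((const __m128i*) (Masks + 4));
    __m128i m2 = _mm_load_si128((const __m128i*) (Masks + 8));
    __m128i m3 = _mm_load_si128((const __m128i*) (Masks + 12));

    unsigned int head = (16 - (unsigned int) ((size_t) Source & 15)) & 15; // bytes until Source is 16-byte aligned
    if (head > Count) head = Count;
    unsigned int i = 0;
    for (; i < head; i++) {                                                // scalar head
        Destination[i * 4 + 0] = Destination[i * 4 + 1] = Destination[i * 4 + 2] = Source[i];
    }
    for (; i + 16 <= Count; i += 16) {                                     // aligned-load middle, 16 pixels per step
        __m128i r0 = _mm_load_si128((const __m128i*) (Source + i));
        __m128i* d = (__m128i*) (Destination + i * 4);
        _mm_storeu_si128(d + 0, _mm_shuffle_epi8(r0, m0));
        _mm_storeu_si128(d + 1, _mm_shuffle_epi8(r0, m1));
        _mm_storeu_si128(d + 2, _mm_shuffle_epi8(r0, m2));
        _mm_storeu_si128(d + 3, _mm_shuffle_epi8(r0, m3));
    }
    for (; i < Count; i++) {                                               // scalar tail
        Destination[i * 4 + 0] = Destination[i * 4 + 1] = Destination[i * 4 + 2] = Source[i];
    }
}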
Edit:
Running your intrinsics sources through gcc -msse4.2 -O8 -c (GCC 4.7.1) gives the following assembly:
Disassembly of section .text:

0000000000000000 <ConvertGreyToRgb32Assembler>:
   0:   85 d2                      test    edx,edx
   2:   74 76                      je      7a <ConvertGreyToRgb32Assembler+0x7a>
   4:   66 0f 6f 2d 00 00 00 00    movdqa  xmm5,XMMWORD PTR [rip+0x0]    # c <ConvertGreyToRgb32Assembler+0xc>
   c:   48 89 f8                   mov     rax,rdi
   f:   66 0f 6f 25 00 00 00 00    movdqa  xmm4,XMMWORD PTR [rip+0x0]    # 17 <ConvertGreyToRgb32Assembler+0x17>
  17:   66 0f 6f 1d 00 00 00 00    movdqa  xmm3,XMMWORD PTR [rip+0x0]    # 1f <ConvertGreyToRgb32Assembler+0x1f>
  1f:   66 0f 6f 15 00 00 00 00    movdqa  xmm2,XMMWORD PTR [rip+0x0]    # 27 <ConvertGreyToRgb32Assembler+0x27>
  27:   66 0f 1f 84 00 00 00 00 00 nop     WORD PTR [rax+rax*1+0x0]
  30:   f3 0f 6f 00                movdqu  xmm0,XMMWORD PTR [rax]
  34:   48 89 f1                   mov     rcx,rsi
  37:   48 83 c0 10                add     rax,0x10
  3b:   66 0f 6f c8                movdqa  xmm1,xmm0
  3f:   66 0f 38 00 cd             pshufb  xmm1,xmm5
  44:   66 0f e7 0e                movntdq XMMWORD PTR [rsi],xmm1
  48:   66 0f 6f c8                movdqa  xmm1,xmm0
  4c:   66 0f 38 00 cc             pshufb  xmm1,xmm4
  51:   66 0f e7 4e 10             movntdq XMMWORD PTR [rsi+0x10],xmm1
  56:   66 0f 6f c8                movdqa  xmm1,xmm0
  5a:   66 0f 38 00 c2             pshufb  xmm0,xmm2
  5f:   66 0f 38 00 cb             pshufb  xmm1,xmm3
  64:   66 0f e7 4e 20             movntdq XMMWORD PTR [rsi+0x20],xmm1
  69:   66 0f e7 41 30             movntdq XMMWORD PTR [rcx+0x30],xmm0
  6e:   89 c1                      mov     ecx,eax
  70:   29 f9                      sub     ecx,edi
  72:   48 83 c6 40                add     rsi,0x40
  76:   39 ca                      cmp     edx,ecx
  78:   77 b6                      ja      30 <ConvertGreyToRgb32Assembler+0x30>
  7a:   f3 c3                      repz ret
This reminds me extremely strongly of your initial assembly code. If MSVC creates something significantly worse than that, I'd say it's a bug/limitation in the compiler (version) you used.