I'm trying to pack 10-bit pixels into a continuous byte stream, using SIMD instructions. The code below does it "in principle" but the SIMD version is slower than the scalar version.
The problem seems to be that I can't find good gather/scatter operations that load the register efficiently.
Any suggestions for improvement?
// SIMD_test.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "Windows.h"
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>
// reference non-SIMD implementation that "works"
// 4 uint16 at a time as input, and 5 uint8 as output per loop iteration
// Scalar reference implementation: packs NCOL 10-bit samples (stored one per
// uint16_t) into a dense little-endian bit stream. Consumes 4 input samples
// and emits 5 output bytes per iteration. NCOL must be a multiple of 4.
void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for (uint32_t col = 0; col < NCOL; col += 4, ptr += 4, streamBuffer += 5)
    {
        // Truncate each sample to its 10 significant bits up front.
        const uint32_t p0 = ptr[0] & 0x3FFu;
        const uint32_t p1 = ptr[1] & 0x3FFu;
        const uint32_t p2 = ptr[2] & 0x3FFu;
        const uint32_t p3 = ptr[3] & 0x3FFu;
        // Each byte carries the tail of one sample and the head of the next:
        // byte k holds bits [8k, 8k+8) of the 40-bit packed group.
        streamBuffer[0] = (uint8_t)p0;                             // p0 bits 0-7
        streamBuffer[1] = (uint8_t)((p0 >> 8) | ((p1 & 0x3Fu) << 2)); // p0 bits 8-9 | p1 bits 0-5
        streamBuffer[2] = (uint8_t)((p1 >> 6) | ((p2 & 0x0Fu) << 4)); // p1 bits 6-9 | p2 bits 0-3
        streamBuffer[3] = (uint8_t)((p2 >> 4) | ((p3 & 0x03u) << 6)); // p2 bits 4-9 | p3 bits 0-1
        streamBuffer[4] = (uint8_t)(p3 >> 2);                         // p3 bits 2-9
    }
}
// poorly written SIMD implementation. Attempts to do the same
// as the packSlow, but 8 iterations at a time
// Attempted SIMD version of packSlow: processes 32 input samples (8 groups of
// 4) per outer iteration. The idea is a transpose-style gather -- ptr0..ptr3
// each hold one "column" of 8 sample groups -- so the per-byte shift/OR work
// can be done on 8 groups in parallel, followed by a per-byte scatter.
// NOTE(review): both the element-wise _mm_set_epi16 gathers and the per-byte
// m128i_u8 scatter loop cost one scalar load/store each, which is why this is
// slower than the scalar code despite the vectorized middle stage.
// NOTE(review): m128i_u8 is an MSVC-specific union member; this does not
// compile with GCC/Clang.
void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
// Per-lane masks: 0x3FF keeps a full 10-bit sample; 0x3F/0x0F/0x03 keep the
// low 6/4/2 bits of a sample that spill into the next output byte.
const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
const __m128i maskb = _mm_set_epi16(0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F);
const __m128i maskc = _mm_set_epi16(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F);
const __m128i maskd = _mm_set_epi16(0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03);
// Prefetch the whole input up front. NOTE(review): a simple forward scan is
// detected by the hardware prefetcher anyway; this loop is likely dead weight.
for(uint32_t j=0;j<NCOL;j+=4*8)
{
_mm_prefetch((const char*)(ptr+j),_MM_HINT_T0);
}
for(uint32_t j=0;j<NCOL;j+=4*8)
{
// Gather stage: ptrN collects sample N of each of the 8 groups (stride 4).
// Each _mm_set_epi16 element is a separate scalar load -- this "fetch" stage
// is the main bottleneck (~2 cycles per term).
__m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8],ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]);
__m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9],ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]);
__m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]);
__m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]);
// Vectorized packing: streamBufferK's 16-bit lanes mirror the scalar
// streamBuffer[K] expressions in packSlow (mask, shift, OR per lane).
__m128i streamBuffer0 = ptr0;
__m128i streamBuffer1 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr0 , maska), _mm_set_epi32(0, 0, 0,8)) , _mm_sll_epi16 (_mm_and_si128 (ptr1 , maskb) , _mm_set_epi32(0, 0, 0,2)));
__m128i streamBuffer2 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr1 , maska), _mm_set_epi32(0, 0, 0,6)) , _mm_sll_epi16 (_mm_and_si128 (ptr2 , maskc) , _mm_set_epi32(0, 0, 0,4)));
__m128i streamBuffer3 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr2 , maska), _mm_set_epi32(0, 0, 0,4)) , _mm_sll_epi16 (_mm_and_si128 (ptr3 , maskd) , _mm_set_epi32(0, 0, 0,6)));
__m128i streamBuffer4 = _mm_srl_epi16 (_mm_and_si128 (ptr3 , maska), _mm_set_epi32(0, 0, 0,2)) ;
// Scatter stage: one scalar byte store per output byte (~2 cycles/byte).
// NOTE(review): the inner j shadows the outer loop's j. Also, j starts at 15
// and steps by -2, so odd byte indices are read -- index 15 is the HIGH byte
// of the highest 16-bit lane, while the scalar reference stores the LOW 8
// bits of each lane result; verify this extraction against packSlow's output.
for(int j=15;j>=0;j-=2)
{
streamBuffer[0] = streamBuffer0.m128i_u8[j];
streamBuffer[1] = streamBuffer1.m128i_u8[j];
streamBuffer[2] = streamBuffer2.m128i_u8[j];
streamBuffer[3] = streamBuffer3.m128i_u8[j];
streamBuffer[4] = streamBuffer4.m128i_u8[j];
streamBuffer += 5;
}
ptr += 32;
}
}
// Benchmark driver: times 1000 runs of the scalar and SIMD packers over 512
// samples each, prints the two tick counts, and cross-checks the outputs.
int _tmain(int argc, _TCHAR* argv[])
{
    uint16_t pixels[512];        // 512 synthetic 10-bit test samples
    uint8_t packed1[512*10/8];   // 640 bytes: scalar reference output
    uint8_t packed2[512*10/8];   // 640 bytes: SIMD output
    for(int i=0;i<512;i++)
    {
        pixels[i] = i;
    }
    LARGE_INTEGER t0,t1,t2;
    QueryPerformanceCounter(&t0);
    for(int k=0;k<1000;k++) packSlow(pixels,packed1,512);
    QueryPerformanceCounter(&t1);
    for(int k=0;k<1000;k++) packFast(pixels,packed2,512);
    QueryPerformanceCounter(&t2);
    // BUGFIX: QuadPart is a 64-bit LONGLONG; printing the 64-bit differences
    // with "%d" reads only 32 bits of each vararg (undefined behavior and
    // wrong output). Use "%lld" with an explicit long long cast.
    printf("%lld %lld\n",
           (long long)(t1.QuadPart - t0.QuadPart),
           (long long)(t2.QuadPart - t1.QuadPart));
    if (memcmp(packed1,packed2,sizeof(packed1)))
    {
        printf("failed\n");
    }
    return 0;
}
On re-reading your code, it looks like you are almost definitely murdering your load/store unit, which wouldn't even get complete relief with the new AVX2 VGATHER[D/Q]P[D/S]
instruction family. Even Haswell's architecture still requires a uop per load element, each hitting the L1D TLB and cache, regardless of locality, with efficiency improvements showing in Skylake ca. 2016 at earliest.
Your best recourse at present is probably to do 16B register reads and manually construct your streamBuffer
values with register copies, _mm_shuffle_epi8()
, and _mm_or_si128()
calls, and the inverse for the finishing stores.
In the near future, AVX2 will provide (and does for newer desktops already) VPS[LL/RL/RA]V[D/Q]
instructions that allow variable element shifting that, combined with a horizontal add, could do this packing pretty quickly. In this case, you could use simple MOVDQU
instructions for loading your values, since you could process contiguous uint16_t
input values in a single xmm register.
Also, consider reworking your prefetching. Your j
in NCOL
loop is processing 64B/1 cache line at a time, so you should probably do a single prefetch for ptr + 32
at the beginning of your second loop's body. You might even consider omitting it, since it's a simple forward scan that the hardware prefetcher will detect and automate for you after a very small number of iterations anyway.
I have no experience specifically in SSE. But I would have tried to optimize the code as follows.
// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0's to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.
// Scalar widening approach: pack four 10-bit samples into one 64-bit word
// (sample k at bit 10*k), then store the word so its low 5 bytes land at the
// current stream position; the next store overwrites the 3 scratch bytes.
//
// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0's to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.
//
// BUGFIX: the original stored through (uint64_t*) casts into the uint8_t
// buffer, which is undefined behavior (strict aliasing) and an unaligned
// access on strict-alignment targets. memcpy expresses the same store safely
// and compiles to the same single unaligned store on x86.
// NOTE: the byte layout relies on a little-endian target (the word's low
// 5 bytes are the packed stream bytes).
void packSlow1(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for (uint32_t j = 0; j < NCOL; j += 4*4)
    {
        // Four groups of four samples per outer iteration (matches the
        // original 4x unroll; NCOL must be a multiple of 16).
        for (int g = 0; g < 4; g++)
        {
            uint64_t w = ((uint64_t)(ptr[0] & 0x3ff))
                       | ((uint64_t)(ptr[1] & 0x3ff) << 10)
                       | ((uint64_t)(ptr[2] & 0x3ff) << 20)
                       | ((uint64_t)(ptr[3] & 0x3ff) << 30);
            memcpy(streamBuffer, &w, sizeof w); // 8-byte store, 5 bytes valid
            streamBuffer += 5;
            ptr += 4;
        }
    }
}
UPDATE:
Benchmarks:
Ubuntu 12.04, x86_64 GNU/Linux, gcc v4.6.3 (Virtual Box)
Intel Core i7 (Macbook pro)
compiled with -O3
5717633386 (1X): packSlow
3868744491 (1.4X): packSlow1 (version from the post)
4471858853 (1.2X): packFast2 (from Mark Lakata's post)
1820784764 (3.1X): packFast3 (version from the post)
Windows 8.1, x64, VS2012 Express
Intel Core i5 (Asus)
compiled with standard 'Release' options and SSE2 enabled
00413185 (1X) packSlow
00782005 (0.5X) packSlow1
00236639 (1.7X) packFast2
00148906 (2.8X) packFast3
I see completely different results on an Asus notebook with Windows 8.1 and VS Express 2012 (code compiled with -O2). packSlow1 is 2x slower than the original packSlow, while packFast2 is 1.7X (not 2.9X) faster than packSlow. After researching this problem, I understood the reason. The VC compiler was unable to keep all the constants in XMM registers for packFast2, so it inserted additional memory accesses into the loop (see the generated assembly). Slow memory access explains the performance degradation.
In order to get more stable results I increased pixels buffer to 256x512 and increased loop counter from 1000 to 10000000/256.
Here is my version of SSE optimized function.
// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0's to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.
// SSE2 packer: loads 8 contiguous 16-bit samples and merges them pairwise
// (16->32, 32->64, 64->128 bit lanes) into 80 packed bits, stored per loop.
// NOTE(review): the final _mm_storeu_si128 writes 16 bytes of which only 10
// are stream data, so streamBuffer actually needs at least 6 extra spare
// bytes at the end (e.g. 640+6 bytes for 512 samples), not 3 as the original
// comment claimed. The spare bytes receive 0's.
// NCOL must be a multiple of 8; layout relies on a little-endian target.
void packFast3(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
// m0/m1 select even/odd 16-bit samples; m2..m5 select alternating 32- and
// 64-bit halves for the successive pairwise merges below.
const __m128i m0 = _mm_set_epi16(0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF);
const __m128i m1 = _mm_set_epi16(0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0);
const __m128i m2 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
const __m128i m3 = _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
const __m128i m4 = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
const __m128i m5 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
__m128i s0, t0, r0, x0, x1;
// unrolled and normal loop gives the same result
for(uint32_t j=0;j<NCOL;j+=8)
{
// load 8 samples into s0
s0 = _mm_loadu_si128((__m128i*)ptr); // s0=00070006_00050004_00030002_00010000
// join 16-bit samples into 32-bit words: each word becomes even|odd<<10
x0 = _mm_and_si128(s0, m0); // x0=00000006_00000004_00000002_00000000
x1 = _mm_and_si128(s0, m1); // x1=00070000_00050000_00030000_00010000
t0 = _mm_or_si128(x0, _mm_srli_epi32(x1, 6)); // t0=00001c06_00001404_00000c02_00000400
// join 32-bit words into 64-bit dwords: shift by 12 closes the 32-20 gap
x0 = _mm_and_si128(t0, m2); // x0=00000000_00001404_00000000_00000400
x1 = _mm_and_si128(t0, m3); // x1=00001c06_00000000_00000c02_00000000
t0 = _mm_or_si128(x0, _mm_srli_epi64(x1, 12)); // t0=00000001_c0601404_00000000_c0200400
// join 64-bit dwords: byte shift by 3 (24 bits) closes the 64-40 gap
x0 = _mm_and_si128(t0, m4); // x0=00000000_00000000_00000000_c0200400
x1 = _mm_and_si128(t0, m5); // x1=00000001_c0601404_00000000_00000000
r0 = _mm_or_si128(x0, _mm_srli_si128(x1, 3)); // r0=00000000_000001c0_60140400_c0200400
// and store result (16-byte store, 10 valid bytes -- see warning above)
_mm_storeu_si128((__m128i*)streamBuffer, r0);
streamBuffer += 10;
ptr += 8;
}
}
I came up with a "better" solution using SIMD, but it doesn't leverage parallelization, just more efficient loads and stores (I think).
I'm posting it here for reference, not necessarily the best answer.
The benchmarks are (in arbitrary ticks)
gcc4.8.1 -O3 VS2012 /O2 Implementation
-----------------------------------------
369 (1X) 3394 (1X) packSlow (original code)
212 (1.7X) 2010 (1.7X) packSlow (from @alexander)
147 (2.5X) 1178 (2.9X) packFast2 (below)
Here's the code. Essentially @alexander's code except using 128 bit registers instead of 64 bit registers, and unrolled 2x instead of 4x.
// SSE2 packer: loads 8 contiguous 16-bit samples per register and shifts each
// 10-bit sample into place with per-sample mask/shift/OR, 2x unrolled
// (16 samples -> 20 output bytes per iteration). NCOL must be a multiple of 16.
//
// warning. Each _mm_storeu_si128 writes 16 bytes of which only 10 are stream
// data, so streamBuffer needs at least 6 extra spare bytes at the end (e.g.
// 640+6 bytes of allocated memory for 512 10-bit samples). Spare bytes get 0's.
//
// BUGFIX: the original used _mm_load_si128, which requires 16-byte alignment;
// a uint16_t* only guarantees 2-byte alignment, so an unaligned caller would
// fault. _mm_loadu_si128 accepts any alignment and behaves identically on
// aligned data.
void packFast2(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    // maskN isolates sample N within the 8-lane register.
    const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
    const __m128i mask0 = _mm_set_epi16(0,0,0,0,0,0,0,0x3FF);
    const __m128i mask1 = _mm_set_epi16(0,0,0,0,0,0,0x3FF,0);
    const __m128i mask2 = _mm_set_epi16(0,0,0,0,0,0x3FF,0,0);
    const __m128i mask3 = _mm_set_epi16(0,0,0,0,0x3FF,0,0,0);
    const __m128i mask4 = _mm_set_epi16(0,0,0,0x3FF,0,0,0,0);
    const __m128i mask5 = _mm_set_epi16(0,0,0x3FF,0,0,0,0,0);
    const __m128i mask6 = _mm_set_epi16(0,0x3FF,0,0,0,0,0,0);
    const __m128i mask7 = _mm_set_epi16(0x3FF,0,0,0,0,0,0,0);
    for(uint32_t j=0;j<NCOL;j+=16)
    {
        __m128i s = _mm_loadu_si128((__m128i*)ptr); // load 8 16 bit values (unaligned-safe)
        __m128i s2 = _mm_loadu_si128((__m128i*)(ptr+8)); // load 8 16 bit values (unaligned-safe)
        // Sample N sits at bit 16*N and must land at bit 10*N, i.e. a net
        // right shift of 6*N. _mm_srli_epi64 cannot cross the 64-bit lane
        // boundary, so samples 4-6 combine a bit shift with a whole-register
        // byte shift (_mm_srli_si128) to cross it.
        __m128i a = _mm_and_si128(s,mask0);
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask1),6));
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask2),12));
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask3),18));
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_and_si128(s, mask4),24/8)); // net 24 bits: one 3-byte shift straddles the lane boundary
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s, mask5),6),24/8)); // net 30 bits: 6-bit shift then 3 bytes
        a = _mm_or_si128( a, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s, mask6),4),32/8)); // net 36 bits: 4-bit shift then 4 bytes
        a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask7),42)); // sample 7 stays within the high lane
        _mm_storeu_si128((__m128i*)streamBuffer, a);
        // Second unrolled half, identical packing for samples 8-15.
        __m128i a2 = _mm_and_si128(s2,mask0);
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask1),6));
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask2),12));
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask3),18));
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_and_si128(s2, mask4),24/8));
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s2, mask5),6),24/8));
        a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s2, mask6),4),32/8));
        a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask7),42));
        _mm_storeu_si128((__m128i*)(streamBuffer+10), a2);
        streamBuffer += 20 ;
        ptr += 16 ;
    }
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With