Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Permuting bytes inside SSE __m128i register

I have following problem:

In __m128i register there are 16 8bit values in following ordering:

[ 1, 5, 9, 13 ] [ 2, 6, 10, 14] [3, 7, 11, 15]  [4, 8, 12, 16]

What I would like to achieve is efficiently shuffle bytes to get this ordering:

[ 1, 2, 3, 4 ] [ 5, 6, 7, 8] [9, 10, 11, 12]  [13, 14, 15, 16]

It is actually analog to 4x4 matrix transposition, but operating on 8-bits element inside one register.

Do you please can point me to what kind of SSE (preferabbly <= SSE2) instructions are suitable for realizing this ?

like image 373
born49 Avatar asked Jul 06 '14 10:07

born49


1 Answers

You really will want to go SSSE3 for this, it's much more clean than trying to go <= SSE2

Your code will look something like this:

   #include <tmmintrin.h> // _mm_shuffle_epi8
   #include <tmmintrin.h> // _mm_set_epi8
   ...
   // check if your hardware supports SSSE3
   ...
   __m128i mask = _mm_set_epi8(15, 11, 7, 3,
                               14, 10, 6, 2,
                               13,  9, 5, 1,
                               12,  8, 4, 0);
   __m128i mtrx = _mm_set_epi8(16, 12, 8, 4,
                               15, 11, 7, 3,
                               14, 10, 6, 2,
                               13,  9, 5, 1);
   mtrx         = _mm_shuffle_epi8(mtrx, mask);

If you really want SSE2 this will suffice:
(assuming I'm interpreting your initial ordering correctly)

  __m128i mask = _mm_set_epi8(0x00, 0xFF, 0x00, 0xFF,
                              0x00, 0xFF, 0x00, 0xFF,
                              0x00, 0xFF, 0x00, 0xFF,
                              0x00, 0xFF, 0x00, 0xFF);
  __m128i mtrx = _mm_set_epi8(16, 12, 8, 4,
                              15, 11, 7, 3,
                              14, 10, 6, 2,
                              13,  9, 5, 1);                                   // [1, 5, 9, 13] [2,  6, 10, 14] [3,  7, 11, 15] [ 4,  8, 12, 16]
  mtrx = _mm_packus_epi16(_mm_and_si128(mtrx, mask), _mm_srli_epi16(mtrx, 8)); // [1, 9, 2, 10] [3, 11,  4, 12] [5, 13,  6, 14] [ 7, 15,  8, 16]
  mtrx = _mm_packus_epi16(_mm_and_si128(mtrx, mask), _mm_srli_epi16(mtrx, 8)); // [1, 2, 3,  4] [5,  6,  7,  8] [9, 10, 11, 12] [13, 14, 15, 16]

Or more easily debuggable:

  __m128i mtrx = _mm_set_epi8(16, 12, 8, 4,
                              15, 11, 7, 3,
                              14, 10, 6, 2,
                              13, 9, 5, 1);            // [1, 5,  9, 13] [ 2,  6, 10, 14] [ 3,  7, 11, 15] [ 4,  8, 12, 16]
  __m128i mask = _mm_set_epi8(0x00, 0xFF, 0x00, 0xFF,
                              0x00, 0xFF, 0x00, 0xFF,
                              0x00, 0xFF, 0x00, 0xFF,
                              0x00, 0xFF, 0x00, 0xFF);
  __m128i temp = _mm_srli_epi16(mtrx, 8);              // [5, 0, 13,  0] [ 6,  0, 14,  0] [ 7,  0, 15,  0] [ 8,  0, 16,  0]
  mtrx         = _mm_and_si128(mtrx, mask);            // [1, 0,  9,  0] [ 2,  0, 10,  0] [ 3,  0, 11,  0] [ 4,  0, 12,  0]
  mtrx         = _mm_packus_epi16(mtrx, temp);         // [1, 9,  2, 10] [ 3, 11,  4, 12] [ 5, 13,  6, 14] [ 7, 15,  8, 16]
  temp         = _mm_srli_epi16(mtrx, 8);              // [9, 0, 10,  0] [11,  0, 12,  0] [13,  0, 14,  0] [15,  0, 16,  0]
  mtrx         = _mm_and_si128(mtrx, mask);            // [1, 0,  2,  0] [ 3,  0,  4,  0] [ 5,  0,  6,  0] [ 7,  0,  8,  0] 
  mtrx         = _mm_packus_epi16(mtrx, temp);         // [1, 2,  3,  4] [ 5,  6,  7,  8] [ 9, 10, 11, 12] [13, 14, 15, 16]
like image 115
Apriori Avatar answered Nov 04 '22 23:11

Apriori