What I am trying to do is, if I have a vector of 27 (not 32!) int8_t
:
x = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26}
I want to first cyclic-shift it to the right by n (not a constant), e.g. if n=1:
x2 = {26,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}
Then this vector is used to perform some very complex calculation, but for simplicity, let's assume that the next step is just to cyclic-shift it back to the left by n, and store it to the memory. So I should have a new vector of 27 int8_t
:
y = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26}
So there're thousands of such vectors and performance is very critical here. The CPU that we're using has AVX2 support so we want to use it to speed things up.
To get x2
, I use two _mm256_loadu_si256()
with a _mm256_blendv_epi8()
:
int8_t x[31+27+31];
for(int i=0; i<27; i++){
x[31+i] = i;
}
__m256i mask = _mm256_set_epi32 (0x0, 0x00800000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
__m256i x_second_part = _mm256_loadu_si256((__m256i*)(x+31+1)); //{1,2,...,26}
__m256i x_first_part = _mm256_loadu_si256((__m256i*)(x+31-26)); //{0}
__m256i x2 = _mm256_blendv_epi8(x_second_part, x_first_part, mask); //{1,2,...,26, 0}
int8_t y[31+27+31];
_mm256_storeu_si256((__m256i*)(y+31-26), x2);
_mm256_storeu_si256((__m256i*)(y+31+1), x2);
The reason why the x
and y
are declared to be of size [31+27+31]
is that in this case _mm256_loadu_si256()
and _mm256_storeu_si256()
won't cause segfault.
And I can get the value of y
by:
for(int i=0; i<27; i++){
cout << (int)y[31+i] << ' ';
}
Unfortunately all the vectors must be continuous in memory, for example, if there are totally two vectors that need to be processed:
x = {[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26];
[27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]};
Then I cannot just use _mm256_storeu_si256()
to put the value of y
back to memory because when the value of the second vector is written to memory it will overwrite some values of the first vector:
int8_t x[31+27+27+31];
int8_t y[31+27+27+31];
for(int i=0; i<27*2; i++){
x[31+i] = i;
}
for(int i=0; i<2; i++){
__m256i mask = _mm256_set_epi32 (0x0, 0x00800000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
__m256i x_second_part = _mm256_loadu_si256((__m256i*)(x+31+27*i+1)); //{1,2,...,26}
__m256i x_first_part = _mm256_loadu_si256((__m256i*)(x+31+27*i-26)); //{0}
__m256i x2 = _mm256_blendv_epi8(x_second_part, x_first_part, mask); //{1,2,...,26, 0}
_mm256_storeu_si256((__m256i*)(y+31+27*i-26), x2);
_mm256_storeu_si256((__m256i*)(y+31+27*i+1), x2);
}
for(int i=0; i<27; i++){
cout << (int)y[31+i] << ' ';
}cout << endl;
for(int i=0; i<27; i++){
cout << (int)y[31+27+i] << ' ';
}cout << endl;
will output
0 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
instead of
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
So I was thinking of using maskstore. But in the Intel Intrinsic Guide I couldn't find _mm256_maskstore_epi8
. This leads me back to the topic:
There is another implementation of cyclic-shift inside 27-bytes vector with using AVX2:
#include <iostream>
#include <immintrin.h>
const __m256i K0 = _mm256_setr_epi8(
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0);
const __m256i K1 = _mm256_setr_epi8(
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70);
inline const __m256i Shuffle(const __m256i & value, const __m256i & shuffle)
{
return _mm256_or_si256(_mm256_shuffle_epi8(value, _mm256_add_epi8(shuffle, K0)),
_mm256_shuffle_epi8(_mm256_permute4x64_epi64(value, 0x4E), _mm256_add_epi8(shuffle, K1)));
}
__m256i shuffles[27];
void Init()
{
uint8_t * p = (uint8_t *)shuffles;
for (int s = 0; s < 27; ++s)
for (int i = 0; i < 32; ++i)
p[s*32 + i] = i < 27 ? (27 + i - s)%27 : i;
}
void CyclicShift27(const uint8_t * src, size_t shift, uint8_t * dst)
{
_mm256_storeu_si256((__m256i*)dst, Shuffle(_mm256_loadu_si256((__m256i*)src), shuffles[shift]));
}
int main()
{
Init();
uint8_t src[32] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 }, dst[32];
for (int j = 0; j < 27; ++j)
{
CyclicShift27(src, j, dst);
std::cout << "\t";
for (int i = 0; i < 32; i++)
std::cout << (int)dst[i] << ' ';
std::cout << std::endl;
}
return 0;
}
Output:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 27 28 29 30 31
25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 27 28 29 30 31
24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 27 28 29 30 31
23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 27 28 29 30 31
22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 27 28 29 30 31
21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 27 28 29 30 31
20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 27 28 29 30 31
19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 27 28 29 30 31
18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 27 28 29 30 31
17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 27 28 29 30 31
16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 27 28 29 30 31
15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 27 28 29 30 31
14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 27 28 29 30 31
13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 27 28 29 30 31
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 27 28 29 30 31
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 27 28 29 30 31
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 27 28 29 30 31
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 27 28 29 30 31
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 27 28 29 30 31
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 27 28 29 30 31
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 27 28 29 30 31
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 27 28 29 30 31
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 27 28 29 30 31
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 27 28 29 30 31
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 27 28 29 30 31
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 27 28 29 30 31
It looks more simple than my previous answer.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With