So, I want to set an individual bit of a __m256i register.
Say my __m256i contains [ 1 0 1 0 | 1 0 1 0 | ... | 1 0 1 0 ]: how do I set and unset the n-th bit?
This is an implementation of a function which can set an individual bit inside a vector:
#include <immintrin.h>
#include <assert.h>
// Sets or clears a single bit of a 256-bit vector in place.
//
//   vector   - the register to modify (updated through the reference)
//   position - bit index, 0..255; bit 0 is the lowest bit of byte 0.
//              The range is enforced by assert() only.
//   value    - true sets the bit, false clears it
void SetBit(__m256i & vector, size_t position, bool value)
{
    assert(position <= 255);

    // Build the single-bit mask in memory: pick the byte, then the bit inside it.
    uint8_t bytes[32] = { 0 };
    const size_t byteIndex = position / 8;
    const size_t bitInByte = position % 8;
    bytes[byteIndex] = 1 << bitInByte;

    // The byte array has no 32-byte alignment guarantee, hence the unaligned load.
    const __m256i bitMask = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bytes));

    // OR turns the bit on; AND-NOT (~bitMask & vector) turns it off.
    vector = value ? _mm256_or_si256(bitMask, vector)
                   : _mm256_andnot_si256(bitMask, vector);
}
// Small demo of SetBit: clear bit 54 of an all-ones vector, then set bit 54
// of an all-zero vector. The command-line arguments are not used.
int main(int argc, char* argv[])
{
    // Every bit set, then one bit cleared.
    __m256i fromOnes = _mm256_set1_epi8(-1);
    SetBit(fromOnes, 54, false);

    // Every bit clear, then one bit set.
    __m256i fromZeros = _mm256_setzero_si256();
    SetBit(fromZeros, 54, true);

    return 0;
}
If you'd like to avoid a LUT and/or store-forwarding stalls, you can do this to set the k-th bit of an avx-256 register:
// Returns x with its k-th bit set (bit 0 is the lowest bit of the lowest
// 32-bit element; valid k is 0..255). For k outside 0..255 no element gets a
// shift count in 0..31, so x is returned unchanged.
// To clear the bit instead, combine with _mm256_andnot_si256(kbit, x).
//
// Declared `static inline`: a plain `inline` definition in C99/C11 provides no
// external definition, so a call that is not inlined would fail to link.
static inline __m256i setbit_256(__m256i x, int k)
{
    // constants that will (hopefully) be hoisted out of a loop after inlining
    const __m256i lane_base = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
    // all-ones shifted right by 31 leaves 1 in every 32-bit element
    const __m256i one = _mm256_srli_epi32(_mm256_set1_epi32(-1), 31);

    const __m256i kvec = _mm256_set1_epi32(k);
    // if 0<=k<=255 then exactly one element of kvec-lane_base lies in 0..31
    const __m256i shiftcounts = _mm256_sub_epi32(kvec, lane_base);
    // shift counts outside 0..31 shift the bit out of the element, leaving zero
    const __m256i kbit = _mm256_sllv_epi32(one, shiftcounts);

    // kbit has the k-th bit set and all 255 other bits zero
    return _mm256_or_si256(kbit, x);
}
Below is my previous answer, which is less straightforward and now obsolete.
#include <immintrin.h>
// Returns x with its k-th bit set (bit 0 is the lowest bit of the lowest
// 32-bit element; valid k is 0..255). The bit is built in every element and
// then masked down to the one element whose index equals k>>5; for k outside
// 0..255 no element matches, so x is returned unchanged.
// To clear the bit instead, combine with _mm256_andnot_si256(kbit, x).
//
// Declared `static inline`: a plain `inline` definition in C99/C11 provides no
// external definition, so a call that is not inlined would fail to link.
static inline __m256i setbit_256(__m256i x, int k)
{
    // constants that will (hopefully) be hoisted out of a loop after inlining
    const __m256i lane_ids = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    const __m256i all_ones = _mm256_set1_epi32(-1);
    const __m256i low5 = _mm256_srli_epi32(all_ones, 27); // set1(0x1f) mask for the shift within elements
    const __m256i one = _mm256_srli_epi32(all_ones, 31);  // set1(0x1)

    // create a vector with the kth bit set
    const __m256i kvec = _mm256_set1_epi32(k);
    const __m256i in_lane = _mm256_and_si256(low5, kvec);    // shift count % 32: distance within each elem
    const __m256i bit = _mm256_sllv_epi32(one, in_lane);     // set1( 1<<(k%32) )
    const __m256i lane = _mm256_srli_epi32(kvec, 5);         // set1( k>>5 ), logical shift
    const __m256i sel = _mm256_cmpeq_epi32(lane, lane_ids);  // all-ones in the selected element
    const __m256i kbit = _mm256_and_si256(bit, sel);         // kth bit set, all 255 other bits zero

    return _mm256_or_si256(kbit, x);
}
I'm not sure if this will be any faster than the approaches suggested in the other answers.
This compiles to pretty good asm with clang or gcc (Godbolt compiler explorer), considering that the constants will be hoisted out of loops. As usual, clang defeats the attempt to generate constants on the fly, and broadcast-loads them from memory (which is very efficient on modern CPUs).
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With