I want to XOR two blocks of memory as quickly as possible. How can I use SIMD to accelerate it?
My original code is below:
/*
 * XOR region r1 into region r2: for every i in [0, nbytes),
 * r2[i] becomes r1[i] ^ r2[i]. r1 is only read.
 *
 * Whole 64-bit words are processed when both pointers are 8-byte aligned;
 * any remaining bytes (or the entire region, if either pointer is not
 * aligned) are processed one byte at a time, so the function never touches
 * memory outside the first nbytes bytes of either region.
 *
 * A zero or negative nbytes is a no-op.
 *
 * NOTE(review): the word path accesses the buffers through uint64_t lvalues;
 * this assumes the callers' buffers are dynamically allocated or otherwise
 * valid to access as uint64_t -- confirm against callers.
 */
void region_xor_w64( unsigned char *r1, /* Region 1 (source, read-only) */
                     unsigned char *r2, /* Region 2 (destination, updated in place) */
                     int nbytes)        /* Number of bytes in region */
{
    int done = 0; /* bytes already handled by the word loop */

    if (nbytes <= 0) {
        return;
    }

    /* OR-ing the addresses lets one test check the alignment of both. */
    if ((((uintptr_t) r1 | (uintptr_t) r2) % sizeof(uint64_t)) == 0) {
        const uint64_t *w1 = (const uint64_t *) r1;
        uint64_t *w2 = (uint64_t *) r2;
        int nwords = nbytes / (int) sizeof(uint64_t);

        for (int i = 0; i < nwords; i++) {
            w2[i] ^= w1[i];
        }
        done = nwords * (int) sizeof(uint64_t);
    }

    /* Tail bytes, or everything when the buffers are not word-aligned. */
    for (int i = done; i < nbytes; i++) {
        r2[i] = (unsigned char) (r2[i] ^ r1[i]);
    }
}
I wrote an SSE version myself, but it gave only a small speedup.
/*
 * XOR src into dst using SSE2: for every i in [0, block_size),
 * dst[i] becomes dst[i] ^ src[i]. src is only read.
 *
 * The bulk of the data is handled 16 bytes at a time with unaligned
 * load/store intrinsics, so neither pointer needs any particular alignment.
 * The remaining (block_size % 16) bytes are handled one at a time, so no
 * byte outside the block is read or written.
 *
 * A zero or negative block_size is a no-op.
 */
void region_xor_sse( unsigned char* dst,
                     unsigned char* src,
                     int block_size){
    int offset = 0;

    /* "block_size - offset >= 16" rather than "offset + 16 <= block_size"
       so the comparison cannot overflow for very large sizes. */
    for (; block_size - offset >= 16; offset += 16) {
        __m128i s = _mm_loadu_si128((const __m128i*)(src + offset));
        __m128i d = _mm_loadu_si128((const __m128i*)(dst + offset));
        _mm_storeu_si128((__m128i*)(dst + offset), _mm_xor_si128(s, d));
    }

    /* Scalar tail for the last partial vector. */
    for (; offset < block_size; ++offset) {
        dst[offset] = (unsigned char)(dst[offset] ^ src[offset]);
    }
}
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With