I'm trying to come up with a very fast threshold algorithm using SSE to replace this:
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// Handle left edge
*pDst++ = *pSrc++;
// Likeness filter
for (uint32_t k = 2; k < width; k++, pSrc++, pDst++) {
    if ((*pDst - *pSrc) * (*pDst - *pSrc) > 100 /* THRESHOLD_SQUARED */) {
        *pDst = *pSrc;
    }
}
// Handle right edge
*pDst++ = *pSrc++;
So far I have this (note that comparing the squared difference against THRESHOLD_SQUARED = 100 in the scalar code is equivalent to comparing the absolute difference against THRESHOLD = 10, which is what the vector version below does):
const uint8_t THRESHOLD = 10;
__attribute__((aligned (16))) static const uint8_t mask[16] = {
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};
__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// I have other code with another mask for the first 16 entries
for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
    xmm3 = _mm_load_si128((__m128i const *)pDst);
    xmm4 = _mm_load_si128((__m128i const *)pSrc);
    // Widen both vectors to 16 bits per lane so the subtraction cannot wrap
    xmm5 = _mm_unpacklo_epi8(xmm3, xmm6);
    xmm7 = _mm_unpackhi_epi8(xmm3, xmm6);
    xmm8 = _mm_unpacklo_epi8(xmm4, xmm6);
    xmm9 = _mm_unpackhi_epi8(xmm4, xmm6);
    // |dst - src| per 16-bit lane
    xmm5 = _mm_sub_epi16(xmm5, xmm8);
    xmm7 = _mm_sub_epi16(xmm7, xmm9);
    xmm5 = _mm_abs_epi16(xmm5);
    xmm7 = _mm_abs_epi16(xmm7);
    // Pack back to bytes (signed saturation caps values at 127, which is
    // harmless for any threshold below 127) and compare against THRESHOLD
    xmm5 = _mm_packs_epi16(xmm5, xmm7);
    xmm5 = _mm_cmpgt_epi8(xmm5, xmm1);
    // Where the difference exceeds the threshold, take src; otherwise keep dst
    xmm3 = _mm_blendv_epi8(xmm3, xmm4, xmm5);
    _mm_store_si128((__m128i *)pDst, xmm3);
}
// I have other code with another mask for the last 16 entries
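Neither edge block is shown. Purely as a hypothetical illustration of the masked-edge idea (my own sketch, not the author's actual code), the first 16 bytes could run the same filter with an extra edge mask OR-ed into the comparison result, so that byte 0, the left edge, is always copied from pSrc (xmm1 and xmm6 as set up above):
__attribute__((aligned (16))) static const uint8_t edgeMask[16] = {
    0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
__m128i xmmEdge = _mm_load_si128((__m128i const *)edgeMask);
__m128i d = _mm_load_si128((__m128i const *)pDst);
__m128i s = _mm_load_si128((__m128i const *)pSrc);
// Same widen/subtract/abs/pack/compare sequence as the main loop
__m128i lo = _mm_abs_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(d, xmm6),
                                         _mm_unpacklo_epi8(s, xmm6)));
__m128i hi = _mm_abs_epi16(_mm_sub_epi16(_mm_unpackhi_epi8(d, xmm6),
                                         _mm_unpackhi_epi8(s, xmm6)));
__m128i m = _mm_cmpgt_epi8(_mm_packs_epi16(lo, hi), xmm1);
// Force byte 0 to be taken from pSrc regardless of the comparison
m = _mm_or_si128(m, xmmEdge);
_mm_store_si128((__m128i *)pDst, _mm_blendv_epi8(d, s, m));
// Advance past the first block before entering the main loop
pSrc += 16;
pDst += 16;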
I have an idea for another way to compute the absolute value of the difference of two values (mainly to stay in U8 (uchar) space):
a' = a >> 1;
b' = b >> 1;
diff = (abs(a' - b') << 1) + ((a ^ b) & 1);
This would take 8 SSE instructions instead of the above 9 (not counting any extra register moves the compiler generates), but I'm not sure whether it is faster due to dependency latencies.
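A hypothetical mapping of that idea to intrinsics, assuming SSE2 only (my own encoding, not necessarily the 8-instruction sequence counted above). Note that the halving trick is approximate: when the low bits of a and b differ, it can over-estimate |a - b| by 2, which may or may not be acceptable for a threshold test:
#include <emmintrin.h> // SSE2

// Hypothetical helper: approximate per-byte |a - b| via the halving trick.
// Can over-estimate the true difference by 2 when the low bits of a and b differ.
static inline __m128i AbsDiffU8Approx(__m128i a, __m128i b)
{
    const __m128i low7 = _mm_set1_epi8(0x7F);
    const __m128i one = _mm_set1_epi8(0x01);
    // a' = a >> 1, b' = b >> 1: there is no 8-bit shift, so shift the 16-bit
    // lanes and mask off the bit that leaks in from the neighbouring byte
    __m128i ah = _mm_and_si128(_mm_srli_epi16(a, 1), low7);
    __m128i bh = _mm_and_si128(_mm_srli_epi16(b, 1), low7);
    // |a' - b'| = max(a', b') - min(a', b'); the result fits in 7 bits
    __m128i d = _mm_sub_epi8(_mm_max_epu8(ah, bh), _mm_min_epu8(ah, bh));
    // (|a' - b'| << 1) + ((a ^ b) & 1); doubling cannot overflow since d <= 127
    return _mm_add_epi8(_mm_add_epi8(d, d),
                        _mm_and_si128(_mm_xor_si128(a, b), one));
}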
Does any other SSE expert have any better suggestions (using up to SSE 4.2)?
Update 1 - Thanks to Yves's suggestion!
const uint8_t THRESHOLD = 10;
__attribute__((aligned (16))) static const uint8_t mask[16] = {
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};
__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// I have other code with another mask for the first 16 entries
for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
    xmm3 = _mm_load_si128((__m128i const *)pDst);
    xmm4 = _mm_load_si128((__m128i const *)pSrc);
    // |dst - src| per byte via two saturated subtractions (see the answer below)
    xmm5 = _mm_subs_epu8(xmm3, xmm4);
    xmm7 = _mm_subs_epu8(xmm4, xmm3);
    xmm5 = _mm_adds_epu8(xmm5, xmm7);
    // After subtracting THRESHOLD with saturation, a lane is zero
    // exactly when |dst - src| <= THRESHOLD
    xmm5 = _mm_subs_epu8(xmm5, xmm1);
    xmm5 = _mm_cmpeq_epi8(xmm5, xmm6);
    // Keep dst where within the threshold, otherwise take src
    xmm4 = _mm_blendv_epi8(xmm4, xmm3, xmm5);
    _mm_store_si128((__m128i *)pDst, xmm4);
}
// I have other code with another mask for the last 16 entries
There is an efficient alternative to compute the absolute difference, exploiting arithmetic saturation.
Indeed, saturated subtraction computes satsub(A, B) = max(A - B, 0), so that |A - B| = satsub(A, B) + satsub(B, A).
Diff= _mm_adds_epu8(_mm_subs_epu8(A, B), _mm_subs_epu8(B, A));
The sum will not saturate, because at least one of the two saturated differences is always zero; for example, with A = 3 and B = 10, satsub(A, B) = 0 and satsub(B, A) = 7, so the sum is 7 = |3 - 10|. This way you stay with 16 x 8 unsigned bits and get maximum throughput.
There are some useful functions from the Simd library:
inline __m128i Combine(__m128i mask, __m128i positive, __m128i negative)
{
return _mm_or_si128(_mm_and_si128(mask, positive), _mm_andnot_si128(mask, negative));
}
inline __m128i AbsDifferenceU8(__m128i a, __m128i b)
{
return _mm_sub_epi8(_mm_max_epu8(a, b), _mm_min_epu8(a, b));
}
inline __m128i LesserOrEqual8u(__m128i a, __m128i b)
{
return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
}
So the SSE2 optimization will look like this (Combine stands in for _mm_blendv_epi8, which is SSE4.1, using only SSE2 instructions):
__m128i t = _mm_set1_epi8(threshold);
for (uint32_t k = 16; k < width - 16; k += 16, pSrc += 16, pDst += 16)
{
    __m128i src = _mm_load_si128((__m128i*)pSrc);
    __m128i dst = _mm_load_si128((__m128i*)pDst);
    // Keep dst where |src - dst| <= threshold, otherwise take src
    __m128i mask = LesserOrEqual8u(AbsDifferenceU8(src, dst), t);
    _mm_store_si128((__m128i*)pDst, Combine(mask, dst, src));
}
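For completeness, a minimal self-contained sketch of how these helpers might be wired into a full filter. The function name LikenessFilter and the scalar edge/tail handling are my own assumptions (the question uses masked vector code for the first and last 16 entries instead):
#include <emmintrin.h> // SSE2
#include <stdint.h>
#include <stdlib.h>

// Hypothetical wrapper around the helpers above; assumes pSrc and pDst
// are 16-byte aligned and point to at least width valid bytes.
void LikenessFilter(const uint8_t *pSrc, uint8_t *pDst, uint32_t width, uint8_t threshold)
{
    __m128i t = _mm_set1_epi8(threshold);
    uint32_t k = 0;
    // Vector loop over whole 16-byte blocks
    for (; k + 16 <= width; k += 16)
    {
        __m128i src = _mm_load_si128((__m128i const *)(pSrc + k));
        __m128i dst = _mm_load_si128((__m128i const *)(pDst + k));
        // Keep dst where |src - dst| <= threshold, otherwise take src
        __m128i mask = LesserOrEqual8u(AbsDifferenceU8(src, dst), t);
        _mm_store_si128((__m128i *)(pDst + k), Combine(mask, dst, src));
    }
    // Scalar tail for any remaining bytes
    for (; k < width; ++k)
        if (abs((int)pDst[k] - (int)pSrc[k]) > threshold)
            pDst[k] = pSrc[k];
    // The original scalar code copies both edges unconditionally
    if (width > 0)
    {
        pDst[0] = pSrc[0];
        pDst[width - 1] = pSrc[width - 1];
    }
}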