I'm trying to come up with a very fast threshold algorithm using SSE to replace this:
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// Handle left edge
*pDst++ = *pSrc++;
// Likeness filter
for (uint32_t k = 2; k < width; k++, pSrc++, pDst++) {
    if ((*pDst - *pSrc) * (*pDst - *pSrc) > 100 /* THRESHOLD_SQUARED */) {
        *pDst = *pSrc;
    }
}
// Handle right edge
*pDst++ = *pSrc++;
So far I have this (note that comparing the squared difference against THRESHOLD_SQUARED = 100 in the scalar code is equivalent to comparing the absolute difference against THRESHOLD = 10, which is what the vector version below does):
const uint8_t THRESHOLD = 10;
__attribute__((aligned (16))) static const uint8_t mask[16] = {
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};
__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// I have other code with another mask for the first 16 entries
for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
    xmm3 = _mm_load_si128((__m128i const *)pDst);
    xmm4 = _mm_load_si128((__m128i const *)pSrc);
    // Widen both vectors to 16 bits per lane so the subtraction cannot wrap
    xmm5 = _mm_unpacklo_epi8(xmm3, xmm6);
    xmm7 = _mm_unpackhi_epi8(xmm3, xmm6);
    xmm8 = _mm_unpacklo_epi8(xmm4, xmm6);
    xmm9 = _mm_unpackhi_epi8(xmm4, xmm6);
    // |dst - src| per 16-bit lane
    xmm5 = _mm_sub_epi16(xmm5, xmm8);
    xmm7 = _mm_sub_epi16(xmm7, xmm9);
    xmm5 = _mm_abs_epi16(xmm5);
    xmm7 = _mm_abs_epi16(xmm7);
    // Pack back to bytes (signed saturation caps values at 127, which is
    // harmless for any threshold below 127) and compare against THRESHOLD
    xmm5 = _mm_packs_epi16(xmm5, xmm7);
    xmm5 = _mm_cmpgt_epi8(xmm5, xmm1);
    // Where the difference exceeds the threshold, take src; otherwise keep dst
    xmm3 = _mm_blendv_epi8(xmm3, xmm4, xmm5);
    _mm_store_si128((__m128i *)pDst, xmm3);
}
// I have other code with another mask for the last 16 entries
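Neither edge block is shown. Purely as a hypothetical illustration of the masked-edge idea (my own sketch, not the author's actual code), the first 16 bytes could run the same filter with an extra edge mask OR-ed into the comparison result, so that byte 0, the left edge, is always copied from pSrc (xmm1 and xmm6 as set up above):
__attribute__((aligned (16))) static const uint8_t edgeMask[16] = {
    0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
__m128i xmmEdge = _mm_load_si128((__m128i const *)edgeMask);
__m128i d = _mm_load_si128((__m128i const *)pDst);
__m128i s = _mm_load_si128((__m128i const *)pSrc);
// Same widen/subtract/abs/pack/compare sequence as the main loop
__m128i lo = _mm_abs_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(d, xmm6),
                                         _mm_unpacklo_epi8(s, xmm6)));
__m128i hi = _mm_abs_epi16(_mm_sub_epi16(_mm_unpackhi_epi8(d, xmm6),
                                         _mm_unpackhi_epi8(s, xmm6)));
__m128i m = _mm_cmpgt_epi8(_mm_packs_epi16(lo, hi), xmm1);
// Force byte 0 to be taken from pSrc regardless of the comparison
m = _mm_or_si128(m, xmmEdge);
_mm_store_si128((__m128i *)pDst, _mm_blendv_epi8(d, s, m));
// Advance past the first block before entering the main loop
pSrc += 16;
pDst += 16;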
I have an idea for another way to compute the absolute value of the difference of two values (mainly to stay in U8 (uchar) space):
a' = a >> 1;
b' = b >> 1;
diff = (abs(a' - b') << 1) + ((a ^ b) & 1);
This would take 8 SSE instructions instead of the above 9 (not counting any extra register moves the compiler generates), but I'm not sure whether it is faster due to dependency latencies.
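A hypothetical mapping of that idea to intrinsics, assuming SSE2 only (my own encoding, not necessarily the 8-instruction sequence counted above). Note that the halving trick is approximate: when the low bits of a and b differ, it can over-estimate |a - b| by 2, which may or may not be acceptable for a threshold test:
#include <emmintrin.h> // SSE2

// Hypothetical helper: approximate per-byte |a - b| via the halving trick.
// Can over-estimate the true difference by 2 when the low bits of a and b differ.
static inline __m128i AbsDiffU8Approx(__m128i a, __m128i b)
{
    const __m128i low7 = _mm_set1_epi8(0x7F);
    const __m128i one = _mm_set1_epi8(0x01);
    // a' = a >> 1, b' = b >> 1: there is no 8-bit shift, so shift the 16-bit
    // lanes and mask off the bit that leaks in from the neighbouring byte
    __m128i ah = _mm_and_si128(_mm_srli_epi16(a, 1), low7);
    __m128i bh = _mm_and_si128(_mm_srli_epi16(b, 1), low7);
    // |a' - b'| = max(a', b') - min(a', b'); the result fits in 7 bits
    __m128i d = _mm_sub_epi8(_mm_max_epu8(ah, bh), _mm_min_epu8(ah, bh));
    // (|a' - b'| << 1) + ((a ^ b) & 1); doubling cannot overflow since d <= 127
    return _mm_add_epi8(_mm_add_epi8(d, d),
                        _mm_and_si128(_mm_xor_si128(a, b), one));
}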
Does any other SSE expert have any better suggestions (using up to SSE 4.2)?
Update 1 - Thanks to Yves's suggestion!
const uint8_t THRESHOLD = 10;
__attribute__((aligned (16))) static const uint8_t mask[16] = {
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};
__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();
uint8_t *pSrc, *pDst;
// Assume pSrc and pDst point to valid data
// I have other code with another mask for the first 16 entries
for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
    xmm3 = _mm_load_si128((__m128i const *)pDst);
    xmm4 = _mm_load_si128((__m128i const *)pSrc);
    // |dst - src| per byte via two saturated subtractions (see the answer below)
    xmm5 = _mm_subs_epu8(xmm3, xmm4);
    xmm7 = _mm_subs_epu8(xmm4, xmm3);
    xmm5 = _mm_adds_epu8(xmm5, xmm7);
    // After subtracting THRESHOLD with saturation, a lane is zero
    // exactly when |dst - src| <= THRESHOLD
    xmm5 = _mm_subs_epu8(xmm5, xmm1);
    xmm5 = _mm_cmpeq_epi8(xmm5, xmm6);
    // Keep dst where within the threshold, otherwise take src
    xmm4 = _mm_blendv_epi8(xmm4, xmm3, xmm5);
    _mm_store_si128((__m128i *)pDst, xmm4);
}
// I have other code with another mask for the last 16 entries
There is an efficient alternative to compute the absolute difference, exploiting arithmetic saturation.
Indeed, saturated subtraction computes satsub(A, B) = max(A - B, 0), so that |A - B| = satsub(A, B) + satsub(B, A).
Diff= _mm_adds_epu8(_mm_subs_epu8(A, B), _mm_subs_epu8(B, A));
The sum will not saturate, because at least one of the two saturated differences is always zero; for example, with A = 3 and B = 10, satsub(A, B) = 0 and satsub(B, A) = 7, so the sum is 7 = |3 - 10|. This way you stay with 16 x 8 unsigned bits and get maximum throughput.
There are some useful functions from the Simd library:
inline __m128i Combine(__m128i mask, __m128i positive, __m128i negative)
{
return _mm_or_si128(_mm_and_si128(mask, positive), _mm_andnot_si128(mask, negative));
}
inline __m128i AbsDifferenceU8(__m128i a, __m128i b)
{
return _mm_sub_epi8(_mm_max_epu8(a, b), _mm_min_epu8(a, b));
}
inline __m128i LesserOrEqual8u(__m128i a, __m128i b)
{
return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
}
So the SSE2 optimization will look like this (Combine stands in for _mm_blendv_epi8, which is SSE4.1, using only SSE2 instructions):
__m128i t = _mm_set1_epi8(threshold);
for (uint32_t k = 16; k < width - 16; k += 16, pSrc += 16, pDst += 16)
{
    __m128i src = _mm_load_si128((__m128i*)pSrc);
    __m128i dst = _mm_load_si128((__m128i*)pDst);
    // Keep dst where |src - dst| <= threshold, otherwise take src
    __m128i mask = LesserOrEqual8u(AbsDifferenceU8(src, dst), t);
    _mm_store_si128((__m128i*)pDst, Combine(mask, dst, src));
}
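For completeness, a minimal self-contained sketch of how these helpers might be wired into a full filter. The function name LikenessFilter and the scalar edge/tail handling are my own assumptions (the question uses masked vector code for the first and last 16 entries instead):
#include <emmintrin.h> // SSE2
#include <stdint.h>
#include <stdlib.h>

// Hypothetical wrapper around the helpers above; assumes pSrc and pDst
// are 16-byte aligned and point to at least width valid bytes.
void LikenessFilter(const uint8_t *pSrc, uint8_t *pDst, uint32_t width, uint8_t threshold)
{
    __m128i t = _mm_set1_epi8(threshold);
    uint32_t k = 0;
    // Vector loop over whole 16-byte blocks
    for (; k + 16 <= width; k += 16)
    {
        __m128i src = _mm_load_si128((__m128i const *)(pSrc + k));
        __m128i dst = _mm_load_si128((__m128i const *)(pDst + k));
        // Keep dst where |src - dst| <= threshold, otherwise take src
        __m128i mask = LesserOrEqual8u(AbsDifferenceU8(src, dst), t);
        _mm_store_si128((__m128i *)(pDst + k), Combine(mask, dst, src));
    }
    // Scalar tail for any remaining bytes
    for (; k < width; ++k)
        if (abs((int)pDst[k] - (int)pSrc[k]) > threshold)
            pDst[k] = pSrc[k];
    // The original scalar code copies both edges unconditionally
    if (width > 0)
    {
        pDst[0] = pSrc[0];
        pDst[width - 1] = pSrc[width - 1];
    }
}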