Fast SSE threshold algorithm

I'm trying to come up with a very fast threshold algorithm using SSE to replace this:

uint8_t *pSrc, *pDst;

// Assume pSrc and pDst point to valid data

// Handle left edge
*pDst++ = *pSrc++;

// Likeness filter
for (uint32_t k = 2; k < width; k++, pSrc++, pDst++) {
    if ((*pDst - *pSrc) * (*pDst - *pSrc) > 100 /*THRESHOLD_SQUARED*/) {
        *pDst = *pSrc;
    }
}

// Handle right edge
*pDst++ = *pSrc++;

So far I have this:

const uint8_t THRESHOLD = 10;

__attribute__((aligned (16))) static const uint8_t mask[16] = {
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};

__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();

uint8_t *pSrc, *pDst;

// Assume pSrc and pDst point to valid data

// I have other code with another mask for the first 16 entries

for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
    xmm3 = _mm_load_si128((__m128i const *)pDst);
    xmm4 = _mm_load_si128((__m128i const *)pSrc);
    xmm5 = _mm_unpacklo_epi8(xmm3, xmm6);     // zero-extend dst to 16-bit (low half)
    xmm7 = _mm_unpackhi_epi8(xmm3, xmm6);     // zero-extend dst to 16-bit (high half)
    xmm8 = _mm_unpacklo_epi8(xmm4, xmm6);     // zero-extend src to 16-bit (low half)
    xmm9 = _mm_unpackhi_epi8(xmm4, xmm6);     // zero-extend src to 16-bit (high half)
    xmm5 = _mm_sub_epi16(xmm5, xmm8);         // dst - src
    xmm7 = _mm_sub_epi16(xmm7, xmm9);
    xmm5 = _mm_abs_epi16(xmm5);               // |dst - src|  (SSSE3)
    xmm7 = _mm_abs_epi16(xmm7);
    xmm5 = _mm_packs_epi16(xmm5, xmm7);       // narrow to bytes; signed saturation clamps
                                              // at 127, which still compares correctly
                                              // against a small threshold
    xmm5 = _mm_cmpgt_epi8(xmm5, xmm1);        // |dst - src| > THRESHOLD ?
    xmm3 = _mm_blendv_epi8(xmm3, xmm4, xmm5); // take src where over threshold (SSE4.1)
    _mm_store_si128((__m128i *)pDst, xmm3);
}

// I have other code with another mask for the last 16 entries

I have an idea for another way to compute the absolute difference of two values (mainly to stay in U8 (uchar) space):

a' = a >> 1;
b' = b >> 1;
diff = (abs(a' - b') << 1) + ((a ^ b) & 1);

(Note that this is only an approximation: when the discarded low bits of a and b differ, the result can be off by 2; e.g. a = 4, b = 1 yields 5 instead of 3.)

This would take 8 SSE instructions instead of the 9 above (not counting any extra register moves the compiler generates), but I'm not sure whether it is faster due to dependency latencies.
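
For reference, here is one way that sequence might map to SSE2 intrinsics (my own mapping, untested; a and b stand for the loaded vectors, and the two constants would be hoisted out of the loop):

__m128i lsb = _mm_set1_epi8(1);
__m128i lo7 = _mm_set1_epi8(0x7F);

// a' = a >> 1 per byte: there is no 8-bit shift, so shift 16-bit lanes
// and mask off the bit that crossed the byte boundary
__m128i ah = _mm_and_si128(_mm_srli_epi16(a, 1), lo7);
__m128i bh = _mm_and_si128(_mm_srli_epi16(b, 1), lo7);

// |a' - b'| = max(a', b') - min(a', b'), which never underflows
__m128i d = _mm_sub_epi8(_mm_max_epu8(ah, bh), _mm_min_epu8(ah, bh));

// (|a' - b'| << 1) + ((a ^ b) & 1); d <= 127, so d + d cannot wrap
__m128i diff = _mm_add_epi8(_mm_add_epi8(d, d),
                            _mm_and_si128(_mm_xor_si128(a, b), lsb));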

Does any other SSE expert have any better suggestions (using up to SSE 4.2)?

Update 1 - Thanks to Yves's suggestion!

const uint8_t THRESHOLD = 10;

__attribute__((aligned (16))) static const uint8_t mask[16] = {
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};

__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();

uint8_t *pSrc, *pDst;

// Assume pSrc and pDst point to valid data

// I have other code with another mask for the first 16 entries

for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
    xmm3 = _mm_load_si128((__m128i const *)pDst);
    xmm4 = _mm_load_si128((__m128i const *)pSrc);
    xmm5 = _mm_subs_epu8(xmm3, xmm4);         // max(dst - src, 0)
    xmm7 = _mm_subs_epu8(xmm4, xmm3);         // max(src - dst, 0)
    xmm5 = _mm_adds_epu8(xmm5, xmm7);         // |dst - src|
    xmm5 = _mm_subs_epu8(xmm5, xmm1);         // zero iff |dst - src| <= THRESHOLD
    xmm5 = _mm_cmpeq_epi8(xmm5, xmm6);        // mask: within threshold
    xmm4 = _mm_blendv_epi8(xmm4, xmm3, xmm5); // keep dst where within, else take src
    _mm_store_si128((__m128i *)pDst, xmm4);
}

// I have other code with another mask for the last 16 entries
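
For illustration, the pieces could be combined into a full-row routine along these lines (a minimal sketch; the name threshold_row and the plain scalar head/tail are my stand-ins for the masked edge code mentioned above, and 16-byte-aligned pointers with width >= 32 are assumed):

#include <smmintrin.h>  /* SSE4.1 for _mm_blendv_epi8 */
#include <stdint.h>

static void threshold_row(const uint8_t *pSrc, uint8_t *pDst,
                          uint32_t width, uint8_t threshold)
{
    const __m128i thr  = _mm_set1_epi8((char)threshold);
    const __m128i zero = _mm_setzero_si128();
    uint32_t k;

    /* Scalar head: first 16 bytes */
    for (k = 0; k < 16; k++) {
        int diff = pDst[k] - pSrc[k];
        if (diff * diff > threshold * threshold)
            pDst[k] = pSrc[k];
    }

    /* Vector middle: 16 bytes per iteration */
    for (; k + 16 <= width - 16; k += 16) {
        __m128i dst  = _mm_load_si128((__m128i const *)(pDst + k));
        __m128i src  = _mm_load_si128((__m128i const *)(pSrc + k));
        __m128i diff = _mm_adds_epu8(_mm_subs_epu8(dst, src),
                                     _mm_subs_epu8(src, dst)); /* |dst - src| */
        __m128i keep = _mm_cmpeq_epi8(_mm_subs_epu8(diff, thr), zero);
        _mm_store_si128((__m128i *)(pDst + k),
                        _mm_blendv_epi8(src, dst, keep));
    }

    /* Scalar tail: remaining bytes */
    for (; k < width; k++) {
        int diff = pDst[k] - pSrc[k];
        if (diff * diff > threshold * threshold)
            pDst[k] = pSrc[k];
    }
}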
asked Oct 26 '14 by ChipK


2 Answers

There is an efficient alternative for computing the absolute difference that exploits arithmetic saturation.

Saturating unsigned subtraction computes subs(A, B) = max(A - B, 0), so that |A - B| = subs(A, B) + subs(B, A).

Diff = _mm_adds_epu8(_mm_subs_epu8(A, B), _mm_subs_epu8(B, A));

The sum cannot saturate, since one of the two terms is always zero. This way you stay with 16 x 8-bit unsigned values and get maximum throughput.
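
As a quick sanity check, here is a tiny self-contained program exercising the identity (the test values are arbitrary):

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    __attribute__((aligned(16))) uint8_t a[16] =
        {   0, 10, 200, 255, 3, 7, 128,  99, 1, 2, 50, 60, 70, 80,  90, 100 };
    __attribute__((aligned(16))) uint8_t b[16] =
        { 255, 10, 100,   0, 7, 3, 127, 199, 2, 1, 60, 50, 80, 70, 100,  90 };
    __attribute__((aligned(16))) uint8_t d[16];

    __m128i va = _mm_load_si128((__m128i const *)a);
    __m128i vb = _mm_load_si128((__m128i const *)b);
    /* |A - B| = subs(A, B) + subs(B, A); one term is always zero */
    __m128i vd = _mm_adds_epu8(_mm_subs_epu8(va, vb), _mm_subs_epu8(vb, va));
    _mm_store_si128((__m128i *)d, vd);

    for (int i = 0; i < 16; i++)
        printf("|%3u - %3u| = %3u\n", a[i], b[i], d[i]);
    return 0;
}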

answered by Yves Daoust


There are some useful functions from the Simd library:

// Bitwise select: takes 'positive' where mask bits are set, 'negative' elsewhere
inline __m128i Combine(__m128i mask, __m128i positive, __m128i negative)
{
    return _mm_or_si128(_mm_and_si128(mask, positive), _mm_andnot_si128(mask, negative));
}

// |a - b| for unsigned bytes: max(a, b) - min(a, b) never underflows
inline __m128i AbsDifferenceU8(__m128i a, __m128i b)
{
    return _mm_sub_epi8(_mm_max_epu8(a, b), _mm_min_epu8(a, b));
}

// a <= b for unsigned bytes: min(a, b) == a exactly when a <= b
inline __m128i LesserOrEqual8u(__m128i a, __m128i b)
{
    return _mm_cmpeq_epi8(_mm_min_epu8(a, b), a);
}

So the SSE2 optimization will look like this (Combine is the classic bitwise-select substitute for _mm_blendv_epi8, which requires SSE4.1):

__m128i t = _mm_set1_epi8(threshold);
for (uint32_t k = 16; k < width - 16; k += 16, pSrc += 16, pDst += 16)
{
    __m128i src = _mm_load_si128((__m128i*)pSrc);
    __m128i dst = _mm_load_si128((__m128i*)pDst);
    __m128i mask = LesserOrEqual8u(AbsDifferenceU8(src, dst), t);
    _mm_store_si128((__m128i*)pDst, Combine(mask, dst, src));
}
answered by ErmIg