I'm attempting to optimise a routine used in VLC that converts an NV12 frame into a YV12 frame.
For background, NV12 is identical to YV12 except that the U and V chroma planes are interleaved. So converting one format to the other is simply a matter of deinterleaving the chroma samples: UVUVUVUV becomes UUUU VVVV.
The routine I'm attempting to improve is this one: http://git.videolan.org/?p=vlc.git;a=blob;f=modules/video_chroma/copy.c;h=d29843c037e494170f0d6bc976bea8439dd6115b;hb=HEAD#l286
Now, the primary issue with this routine is that it requires a 16-byte-aligned memory cache as intermediate storage. The routine first deinterleaves the data into the cache (4 KiB max) and then copies the result from the cache into the destination frame.
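Schematically, the cached approach works roughly like this per line (an illustrative sketch with made-up names, chunking for lines wider than the cache omitted; this is not the actual copy.c code):

#include <stdint.h>
#include <string.h>

/* Illustrative sketch only, not the actual copy.c code: per line,
 * deinterleave into the small aligned cache first, then copy the result
 * from the cache into the destination planes. */
static void SplitPlanes_cached_sketch(uint8_t *dstu, size_t dstu_pitch,
                                      uint8_t *dstv, size_t dstv_pitch,
                                      const uint8_t *src, size_t src_pitch,
                                      uint8_t *cache, /* 16-byte aligned */
                                      unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        /* pass 1: deinterleave the UVUV... source line into the cache */
        for (unsigned x = 0; x < width; x++) {
            cache[x]         = src[2 * x + 0]; /* U samples */
            cache[width + x] = src[2 * x + 1]; /* V samples */
        }
        /* pass 2: copy the deinterleaved halves into the destination frame */
        memcpy(dstu, cache,         width);
        memcpy(dstv, cache + width, width);
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}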
I have rewritten this function so it doesn't require the cache at all, using SSE2/SSSE3 instructions that work on unaligned memory when required and on aligned memory when possible.
The code is as follows:
static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned width, unsigned height, unsigned cpu)
{
    VLC_UNUSED(cache);
    VLC_UNUSED(cache_size);

    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
    const bool aligned = ((uintptr_t)src & 0xf) == 0;

    asm volatile ("mfence");

#define LOAD64A \
    "movdqa 0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define LOAD64U \
    "movdqu 0(%[src]), %%xmm0\n" \
    "movdqu 16(%[src]), %%xmm1\n" \
    "movdqu 32(%[src]), %%xmm2\n" \
    "movdqu 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0, 0(%[dst1])\n" \
    "movq   %%xmm1, 8(%[dst1])\n" \
    "movhpd %%xmm0, 0(%[dst2])\n" \
    "movhpd %%xmm1, 8(%[dst2])\n" \
    "movq   %%xmm2, 16(%[dst1])\n" \
    "movq   %%xmm3, 24(%[dst1])\n" \
    "movhpd %%xmm2, 16(%[dst2])\n" \
    "movhpd %%xmm3, 24(%[dst2])\n"

    if (aligned) {
        for (unsigned y = 0; y < height; y++) {
            unsigned x = 0;
#ifdef CAN_COMPILE_SSSE3
            if (vlc_CPU_SSSE3()) {
                for (x = 0; x < (width & ~31); x += 32) {
                    asm volatile (
                        "movdqu (%[shuffle]), %%xmm7\n"
                        LOAD64A
                        "pshufb %%xmm7, %%xmm0\n"
                        "pshufb %%xmm7, %%xmm1\n"
                        "pshufb %%xmm7, %%xmm2\n"
                        "pshufb %%xmm7, %%xmm3\n"
                        STORE2X32
                        : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]),
                            [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
                }
            } else
#endif
            {
                for (x = 0; x < (width & ~31); x += 32) {
                    asm volatile (
                        "movdqu (%[mask]), %%xmm7\n"
                        LOAD64A
                        "movdqa %%xmm0, %%xmm4\n"
                        "movdqa %%xmm1, %%xmm5\n"
                        "movdqa %%xmm2, %%xmm6\n"
                        "psrlw $8, %%xmm0\n"
                        "psrlw $8, %%xmm1\n"
                        "pand %%xmm7, %%xmm4\n"
                        "pand %%xmm7, %%xmm5\n"
                        "pand %%xmm7, %%xmm6\n"
                        "packuswb %%xmm4, %%xmm0\n"
                        "packuswb %%xmm5, %%xmm1\n"
                        "pand %%xmm3, %%xmm7\n"
                        "psrlw $8, %%xmm2\n"
                        "psrlw $8, %%xmm3\n"
                        "packuswb %%xmm6, %%xmm2\n"
                        "packuswb %%xmm7, %%xmm3\n"
                        STORE2X32
                        : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]),
                            [src]"r"(&src[2*x]), [mask]"r"(mask)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
                          "xmm4", "xmm5", "xmm6", "xmm7");
                }
            }
            for (; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    } else {
        for (unsigned y = 0; y < height; y++) {
            unsigned x = 0;
#ifdef CAN_COMPILE_SSSE3
            if (vlc_CPU_SSSE3()) {
                for (x = 0; x < (width & ~31); x += 32) {
                    asm volatile (
                        "movdqu (%[shuffle]), %%xmm7\n"
                        LOAD64U
                        "pshufb %%xmm7, %%xmm0\n"
                        "pshufb %%xmm7, %%xmm1\n"
                        "pshufb %%xmm7, %%xmm2\n"
                        "pshufb %%xmm7, %%xmm3\n"
                        STORE2X32
                        : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]),
                            [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
                }
            } else
#endif
            {
                for (x = 0; x < (width & ~31); x += 32) {
                    asm volatile (
                        "movdqu (%[mask]), %%xmm7\n"
                        LOAD64U
                        "movdqu %%xmm0, %%xmm4\n"
                        "movdqu %%xmm1, %%xmm5\n"
                        "movdqu %%xmm2, %%xmm6\n"
                        "psrlw $8, %%xmm0\n"
                        "psrlw $8, %%xmm1\n"
                        "pand %%xmm7, %%xmm4\n"
                        "pand %%xmm7, %%xmm5\n"
                        "pand %%xmm7, %%xmm6\n"
                        "packuswb %%xmm4, %%xmm0\n"
                        "packuswb %%xmm5, %%xmm1\n"
                        "pand %%xmm3, %%xmm7\n"
                        "psrlw $8, %%xmm2\n"
                        "psrlw $8, %%xmm3\n"
                        "packuswb %%xmm6, %%xmm2\n"
                        "packuswb %%xmm7, %%xmm3\n"
                        STORE2X32
                        : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]),
                            [src]"r"(&src[2*x]), [mask]"r"(mask)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
                          "xmm4", "xmm5", "xmm6", "xmm7");
                }
            }
            for (; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    }
#undef STORE2X32
#undef LOAD64U
#undef LOAD64A
}
Now, benchmarking this function on its own, it runs around 26% faster than the original on an i7-2600 (Sandy Bridge, 3.4 GHz), and does a little better on an i7-4650U (Haswell, 1.7 GHz), where the speed-up is about 30%.
That was expected, since the rewrite goes from 2 reads + 2 writes per byte (source → cache, cache → destination) to 1 read + 1 write (source → destination).
However, when used within VLC (the function is called to display every frame decoded via the Intel VAAPI interface), the CPU usage for the same video jumps from 20% to 32-34%.
So I'm puzzled as to why that would be, and how it could be resolved. I had expected the opposite result: both routines use SSE2/SSSE3, and the new one runs faster in isolation, yet it causes higher CPU usage.
thanks
OK, I've found out what was going on.
While the new routine is much faster with conventionally allocated memory, it is actually slower when working on frames produced by the hardware decoder.
This Intel white paper explains why: https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers
All my benchmarks and tests were done with traditionally allocated memory, not with memory mapped as Uncacheable Speculative Write Combining (USWC), which is how the hardware decoder's frame buffers are exposed.
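The technique the white paper recommends is to pull the data out of USWC memory with SSE4.1 streaming loads (MOVNTDQA) into a small cacheable buffer and only then process it, which appears to be exactly what the original routine's intermediate cache is for. A minimal sketch, assuming SSE4.1 and 16-byte-aligned buffers whose size is a multiple of 64 (the naming is mine, not the VLC code):

#include <stdint.h>
#include <stddef.h>
#include <smmintrin.h>  /* SSE4.1: _mm_stream_load_si128 */

/* Minimal sketch of a streaming copy out of USWC memory (illustrative only,
 * not the VLC code): MOVNTDQA pulls whole cache lines through the streaming
 * load buffers instead of issuing slow uncached reads, and the result is
 * stored into an ordinary cacheable buffer for further processing.
 * Assumes src and dst are 16-byte aligned and size is a multiple of 64. */
static void CopyFromUswc_sketch(uint8_t *dst, const uint8_t *src, size_t size)
{
    for (size_t i = 0; i < size; i += 64) {
        __m128i a = _mm_stream_load_si128((__m128i *)(src + i +  0));
        __m128i b = _mm_stream_load_si128((__m128i *)(src + i + 16));
        __m128i c = _mm_stream_load_si128((__m128i *)(src + i + 32));
        __m128i d = _mm_stream_load_si128((__m128i *)(src + i + 48));
        _mm_store_si128((__m128i *)(dst + i +  0), a);
        _mm_store_si128((__m128i *)(dst + i + 16), b);
        _mm_store_si128((__m128i *)(dst + i + 32), c);
        _mm_store_si128((__m128i *)(dst + i + 48), d);
    }
}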
Back to the drawing board.