I am currently writing a color conversion routine to convert from YUY2 to NV12. I have a function which is quite fast, but not as fast as I would expect, mainly due to cache misses.
/*
 * Convert one 1280x720 frame from packed YUY2 to NV12.
 *
 * orig   - source frame in YUY2 (bytes Y0 U Y1 V per pixel pair), rows stored
 *          back to back; 1280*720*2 bytes are read.
 * result - destination frame in NV12: a 1280x720 Y plane immediately followed
 *          by a 1280x360 plane of interleaved U,V bytes; 1280*720*3/2 bytes
 *          are written.
 *
 * Rows are processed two at a time. Luma is copied unchanged; the chroma of
 * the two rows of a pair is merged with a truncating average, (a + b) >> 1.
 *
 * On 32-bit ARM with NEON the inner loop is hand-written assembly. Its ":128"
 * qualifiers declare every address 16-byte aligned, so both buffers must be
 * 16-byte aligned on that target. Every other target uses a scalar loop that
 * produces the same bytes.
 */
void convert_hd(uint8_t *orig, uint8_t *result) {
    uint32_t width = 1280;
    uint32_t height = 720;
    uint8_t *lineOdd = orig;                    /* first source row of the pair  */
    uint8_t *lineEven = orig + width*2;         /* second source row of the pair */
    uint8_t *resultYOdd = result;               /* Y output for the first row    */
    uint8_t *resultYEven = result + width;      /* Y output for the second row   */
    uint8_t *resultUV = result + height*width;  /* shared UV output row          */
    uint32_t totalLoop = height/2;              /* number of row pairs           */

    while (totalLoop-- > 0) {
#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
        /* Each pass consumes 64 bytes (32 pixels) from each of the two rows. */
        uint32_t lineLoop = width/32;
        while (lineLoop-- > 0) {
            __asm__ __volatile__(
                "pld [%[lineOdd]] \n\t"
                "vld4.8 {d0, d1, d2, d3}, [%[lineOdd],:128]! \n\t"      // d0:Y d1:U0 d2:Y d3:V0
                "pld [%[lineEven]] \n\t"
                "vld4.8 {d4, d5, d6, d7}, [%[lineOdd],:128]! \n\t"      // d4:Y d5:U1 d6:Y d7:V1
                "vld4.8 {d8, d9, d10, d11}, [%[lineEven],:128]! \n\t"   // d8:Y d9:U0' d10:Y d11:V0'
                "vld4.8 {d12, d13, d14, d15}, [%[lineEven],:128]! \n\t" // d12:Y d13:U1' d14:Y d15:V1'
                "vhadd.u8 d1, d1, d9 \n\t"                              // (U0+U0') / 2
                "vhadd.u8 d3, d3, d11 \n\t"                             // (V0+V0') / 2
                "vhadd.u8 d5, d5, d13 \n\t"                             // (U1+U1') / 2
                "vhadd.u8 d7, d7, d15 \n\t"                             // (V1+V1') / 2
                // Save: vst2.8 re-interleaves the two Y (or U and V) vectors.
                "vst2.8 {d0, d2}, [%[resultYOdd],:128]! \n\t"
                "vst2.8 {d4, d6}, [%[resultYOdd],:128]! \n\t"
                "vst2.8 {d8, d10}, [%[resultYEven],:128]! \n\t"
                "vst2.8 {d12, d14}, [%[resultYEven],:128]! \n\t"
                "vst2.8 {d1, d3}, [%[resultUV],:128]! \n\t"
                "vst2.8 {d5, d7}, [%[resultUV],:128]! \n\t"
                : [lineOdd]"+r"(lineOdd), [lineEven]"+r"(lineEven),
                  [resultYOdd]"+r"(resultYOdd), [resultYEven]"+r"(resultYEven),
                  [resultUV]"+r"(resultUV)
                :
                /* Every NEON register written above must be declared, otherwise
                 * the compiler neither preserves the callee-saved d8-d15 nor
                 * avoids keeping its own values in d0-d15 across the statement. */
                : "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
            );
        }
#else
        /* Portable equivalent: one Y0 U Y1 V group from each row per pass. */
        uint32_t pairLoop = width/2;
        while (pairLoop-- > 0) {
            resultYOdd[0] = lineOdd[0];
            resultYOdd[1] = lineOdd[2];
            resultYEven[0] = lineEven[0];
            resultYEven[1] = lineEven[2];
            /* Truncating average, matching vhadd.u8. */
            resultUV[0] = (uint8_t)((lineOdd[1] + lineEven[1]) >> 1);
            resultUV[1] = (uint8_t)((lineOdd[3] + lineEven[3]) >> 1);
            lineOdd += 4;
            lineEven += 4;
            resultYOdd += 2;
            resultYEven += 2;
            resultUV += 2;
        }
#endif
        /* The inner loop advanced each pointer by one row; skip the row that
         * the sibling pointer of the pair has just handled. */
        lineOdd += width*2;
        lineEven += width*2;
        resultYOdd += width;
        resultYEven += width;
    }
}
When I ask oprofile where the time is being spent, it reports the following:
: 220c: add r2, r0, #2560 ;
: 2210: add r3, r1, #1280 ;
: 2214: add ip, r1, #921600 ;
: 2218: push {r4, lr}
: 221c: mov r4, #360 ;
6 0.1243 10 0.5787 4 0.4561 : 2220: mov lr, #40 ; 0x28
9 0.1864 5 0.2894 0 0 : 2224: pld [r0]
45 0.9321 7 0.4051 3 0.3421 : 2228: vld4.8 {d0-d3}, [r0 :128]!
51 1.0563 7 0.4051 1 0.1140 : 222c: pld [r2]
1 0.0207 1 0.0579 0 0 : 2230: vld4.8 {d4-d7}, [r0 :128]!
1360 28.1690 770 44.5602 463 52.7936 : 2234: vld4.8 {d8-d11}, [r2 :128]!
980 20.2983 329 19.0394 254 28.9624 : 2238: vld4.8 {d12-d15}, [r2 :128]!
: 223c: vhadd.u8 d1, d1, d9
1000 20.7125 170 9.8380 104 11.8586 : 2240: vhadd.u8 d3, d3, d11
: 2244: vhadd.u8 d5, d5, d13
5 0.1036 2 0.1157 2 0.2281 : 2248: vhadd.u8 d7, d7, d15
: 224c: vst2.8 {d0,d2}, [r1 :128]!
1125 23.3016 293 16.9560 15 1.7104 : 2250: vst2.8 {d4,d6}, [r1 :128]!
34 0.7042 41 2.3727 0 0 : 2254: vst2.8 {d8,d10}, [r3 :128]!
74 1.5327 8 0.4630 0 0 : 2258: vst2.8 {d12,d14}, [r3 :128]!
60 1.2428 39 2.2569 6 0.6842 : 225c: vst2.8 {d1,d3}, [ip :128]!
53 1.0978 24 1.3889 14 1.5964 : 2260: vst2.8 {d5,d7}, [ip :128]!
: 2264: subs lr, lr, #1
0 0 0 0 1 0.1140 : 2268: bne 2224 <convert_hd+0x18>
11 0.2278 14 0.8102 10 1.1403 : 226c: subs r4, r4, #1
: 2270: add r0, r0, #2560 ;
: 2274: add r2, r2, #2560 ;
2 0.0414 6 0.3472 0 0 : 2278: add r1, r1, #1280 ;
: 227c: add r3, r3, #1280 ;
2 0.0414 1 0.0579 0 0 : 2280: bne 2220 <convert_hd+0x14>
: 2284: pop {r4, pc}
Any help would be appreciated, as I am struggling to come up with ideas for avoiding these cache misses...
Thanks !
The cache line length is fixed at eight words (32 bytes). In addition to the `pld` you currently have, you need `pld [lineEven + cacheLine]`. The misses are at `vld4.8 {d8-d11}`, which is the 2nd half of `lineEven`. A `pld` will only fetch one cache line. Also, you should alter the `pld` positions: put one at the head and another before the `vhadd`, maybe with the next memory target. You then have the ALU and memory units active in parallel.
Also, interleave `vst2.8 {d0, d2}` with the `vhadd` instructions; it looks like most of the work is memory transfer. The `vhadd` will block on data dependencies such as `d9`, which may or may not already be loading thanks to the `pld`, but which is not scheduled well.
I am not that familiar with NEON, but the following is an attempt to follow what I said.
// Inner-loop body: converts 32 pixels from each of two YUY2 rows to NV12.
// Expects the uint8_t * variables lineOdd, lineEven, resultYOdd, resultYEven
// and resultUV to be in scope; every one of them is post-incremented.
__asm__ __volatile__(
"pld [%[lineOdd], #32]\n\t" // 2nd part of odd.
"vld4.8 {d0, d1, d2, d3}, [%[lineOdd],:128]!\n\t"       // d0:Y d1:U0 d2:Y d3:V0
"pld [%[lineEven], #32]\n\t" // 2nd part of even.
"vld4.8 {d8, d9, d10, d11}, [%[lineEven],:128]!\n\t"    // d8:Y d9:U0' d10:Y d11:V0'
"vld4.8 {d4, d5, d6, d7}, [%[lineOdd],:128]!\n\t"       // d4:Y d5:U1 d6:Y d7:V1
"vld4.8 {d12, d13, d14, d15}, [%[lineEven],:128]!\n\t"  // d12:Y d13:U1' d14:Y d15:V1'
"vhadd.u8 d1, d1, d9\n\t"                               // (U0+U0') / 2
// First in memory pipe, so write early.
"vst2.8 {d0, d2}, [%[resultYOdd],:128]!\n\t"
"vhadd.u8 d3, d3, d11\n\t"                              // (V0+V0') / 2
"vst2.8 {d8, d10}, [%[resultYEven],:128]!\n\t"
"vhadd.u8 d5, d5, d13\n\t"                              // (U1+U1') / 2
"vst2.8 {d4, d6}, [%[resultYOdd],:128]! \n\t"
"vhadd.u8 d7, d7, d15\n\t"                              // (V1+V1') / 2
"vst2.8 {d12, d14}, [%[resultYEven],:128]! \n\t"
"pld [%[lineOdd]]\n\t" // 1st part of odd (for the next iteration).
"vst2.8 {d1, d3}, [%[resultUV],:128]! \n\t"
"pld [%[lineEven]]\n\t" // 1st part of even (for the next iteration).
"vst2.8 {d5, d7}, [%[resultUV],:128]! \n\t"
: [lineOdd]"+r"(lineOdd), [lineEven]"+r"(lineEven),
[resultYOdd]"+r"(resultYOdd), [resultYEven]"+r"(resultYEven),
[resultUV]"+r"(resultUV)
:
// d0-d15 are all overwritten, so they must be listed: d8-d15 are callee-saved
// and the compiler will not preserve them unless told they are clobbered.
: "memory",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
);
Things I may have wrong are the stride of the NEON operations; I have no idea how wide your registers are (64/128), so more `pld` instructions may be needed, etc. It is better to interleave the store operations with the additions. In particular, some `dX` registers will be loaded before others and will be ready to use earlier. Otherwise, your ALU (`vhadd`) will block waiting for the data to load.
You may also wish to prime the loop with `pld [lineOdd]` and `pld [lineEven]` before things begin.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With