Optimizing Cortex-A8 color conversion using NEON

Question

I am currently writing a color conversion routine to convert from YUY2 to NV12. I have a function that is quite fast, but not as fast as I would expect, mainly due to cache misses.

/*
 * Convert one 1280x720 frame from YUY2 (packed Y0 U Y1 V) to NV12
 * (full-resolution Y plane followed by one interleaved U/V plane that is
 * subsampled vertically by two).
 *
 * orig   - source frame, 1280 * 720 * 2 bytes.
 * result - destination frame, 1280 * 720 * 3 / 2 bytes.
 *
 * Rows are processed in pairs. The luma of both rows is copied unchanged;
 * the chroma of the two rows is averaged with truncation, (a + b) >> 1,
 * which is what vhadd.u8 computes.
 *
 * On 32-bit ARM with NEON the inner loop is hand-written assembly and every
 * load/store carries a ":128" alignment qualifier, so both buffers must be
 * 16-byte aligned. Other targets use an equivalent scalar loop.
 */
void convert_hd(uint8_t *orig, uint8_t *result) {
  const uint32_t width  = 1280;
  const uint32_t height = 720;

  uint8_t *lineOdd     = orig;                   /* first source row of the pair  */
  uint8_t *lineEven    = orig + width * 2;       /* second source row of the pair */
  uint8_t *resultYOdd  = result;                 /* Y output for the first row    */
  uint8_t *resultYEven = result + width;         /* Y output for the second row   */
  uint8_t *resultUV    = result + height * width;/* interleaved U/V output        */

  for (uint32_t rowPair = height / 2; rowPair > 0; rowPair--) {
#if defined(__arm__) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
    /*
     * Each iteration consumes 64 bytes (32 pixels) from each of the two
     * source rows and emits 32 Y bytes per row plus 32 U/V bytes.
     */
    for (uint32_t chunk = width / 32; chunk > 0; chunk--) {
      __asm__ __volatile__(
          "pld [%[lineOdd]]\n\t"
          "vld4.8   {d0, d1, d2, d3}, [%[lineOdd],:128]!\n\t"     // d0:Y d1:U0 d2:Y d3:V0
          "pld [%[lineEven]]\n\t"
          "vld4.8   {d4, d5, d6, d7}, [%[lineOdd],:128]!\n\t"     // d4:Y d5:U1 d6:Y d7:V1
          "vld4.8   {d8, d9, d10, d11}, [%[lineEven],:128]!\n\t"  // d8:Y d9:U0' d10:Y d11:V0'
          "vld4.8   {d12, d13, d14, d15}, [%[lineEven],:128]!\n\t"// d12:Y d13:U1' d14:Y d15:V1'
          "vhadd.u8   d1, d1, d9\n\t"                             // (U0+U0') / 2
          "vhadd.u8   d3, d3, d11\n\t"                            // (V0+V0') / 2
          "vhadd.u8   d5, d5, d13\n\t"                            // (U1+U1') / 2
          "vhadd.u8   d7, d7, d15\n\t"                            // (V1+V1') / 2
          // Save
          "vst2.8 {d0, d2}, [%[resultYOdd],:128]!\n\t"
          "vst2.8 {d4, d6}, [%[resultYOdd],:128]!\n\t"
          "vst2.8 {d8, d10}, [%[resultYEven],:128]!\n\t"
          "vst2.8 {d12, d14}, [%[resultYEven],:128]!\n\t"
          "vst2.8 {d1, d3}, [%[resultUV],:128]!\n\t"
          "vst2.8 {d5, d7}, [%[resultUV],:128]!\n\t"
          : [lineOdd]"+r"(lineOdd), [lineEven]"+r"(lineEven),
            [resultYOdd]"+r"(resultYOdd), [resultYEven]"+r"(resultYEven),
            [resultUV]"+r"(resultUV)
          :
          /*
           * Every NEON register written above must be listed: d8-d15 are
           * callee-saved, so without this the compiler neither preserves
           * them for the caller nor avoids keeping live values in them.
           */
          : "memory",
            "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
            "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
      );
    }
#else
    /* Portable path: one YUY2 macropixel (Y0 U Y1 V) per iteration. */
    for (uint32_t pair = width / 2; pair > 0; pair--) {
      resultYOdd[0]  = lineOdd[0];
      resultYOdd[1]  = lineOdd[2];
      resultYEven[0] = lineEven[0];
      resultYEven[1] = lineEven[2];
      resultUV[0]    = (uint8_t)((lineOdd[1] + lineEven[1]) >> 1);
      resultUV[1]    = (uint8_t)((lineOdd[3] + lineEven[3]) >> 1);

      lineOdd     += 4;
      lineEven    += 4;
      resultYOdd  += 2;
      resultYEven += 2;
      resultUV    += 2;
    }
#endif
    /*
     * The inner loop advanced every pointer by exactly one row; skip the
     * row that belonged to the other pointer of the pair. resultUV is
     * already positioned at the next chroma row.
     */
    lineOdd     += width * 2;
    lineEven    += width * 2;
    resultYOdd  += width;
    resultYEven += width;
  }
}

When I ask oprofile what is taking time, it says the following :

                                           :    220c:   add r2, r0, #2560   ;
                                           :    2210:   add r3, r1, #1280   ;
                                           :    2214:   add ip, r1, #921600 ;
                                           :    2218:   push    {r4, lr}
                                           :    221c:   mov r4, #360    ;
 6  0.1243    10  0.5787     4  0.4561     :    2220:   mov lr, #40 ; 0x28
 9  0.1864     5  0.2894     0       0     :    2224:   pld [r0]
45  0.9321     7  0.4051     3  0.3421     :    2228:   vld4.8  {d0-d3}, [r0 :128]!
51  1.0563     7  0.4051     1  0.1140     :    222c:   pld [r2]
 1  0.0207     1  0.0579     0       0     :    2230:   vld4.8  {d4-d7}, [r0 :128]!
1360 28.1690   770 44.5602   463 52.7936     :    2234: vld4.8  {d8-d11}, [r2 :128]!
 980 20.2983   329 19.0394   254 28.9624     :    2238: vld4.8  {d12-d15}, [r2 :128]!
                                             :    223c: vhadd.u8    d1, d1, d9
1000 20.7125   170  9.8380   104 11.8586     :    2240: vhadd.u8    d3, d3, d11
                                             :    2244: vhadd.u8    d5, d5, d13
   5  0.1036     2  0.1157     2  0.2281     :    2248: vhadd.u8    d7, d7, d15
                                             :    224c: vst2.8  {d0,d2}, [r1 :128]!
1125 23.3016   293 16.9560    15  1.7104     :    2250: vst2.8  {d4,d6}, [r1 :128]!
  34  0.7042    41  2.3727     0       0     :    2254: vst2.8  {d8,d10}, [r3 :128]!
  74  1.5327     8  0.4630     0       0     :    2258: vst2.8  {d12,d14}, [r3 :128]!
  60  1.2428    39  2.2569     6  0.6842     :    225c: vst2.8  {d1,d3}, [ip :128]!
  53  1.0978    24  1.3889    14  1.5964     :    2260: vst2.8  {d5,d7}, [ip :128]!
                                             :    2264: subs    lr, lr, #1
   0       0     0       0     1  0.1140     :    2268: bne 2224 <convert_hd+0x18>
  11  0.2278    14  0.8102    10  1.1403     :    226c: subs    r4, r4, #1
                                             :    2270: add r0, r0, #2560   ;
                                             :    2274: add r2, r2, #2560   ;
   2  0.0414     6  0.3472     0       0     :    2278: add r1, r1, #1280   ;
                                             :    227c: add r3, r3, #1280   ;
   2  0.0414     1  0.0579     0       0     :    2280: bne 2220 <convert_hd+0x14>
                                             :    2284: pop {r4, pc}

the first two columns are the cycle count (absolute and relative)
the two next ones are L1 cache miss (absolute and relative)
the last ones are L2 cache miss (absolute and relative)

Any help would be appreciated, as I am currently struggling to find ideas for avoiding these cache misses...

Thanks !

artless noise · Accepted Answer

The cache line length is fixed at eight words (32 bytes). In addition to the pld you currently have, you need pld[lineEven+cacheLine]. The misses are vld4.8 {d8-d11}, which is the 2nd half of lineEven. pld will only fetch a cache line. Also, you should alter the pld position. Put one at the head and another before vhadd, maybe with next memory target. You then have the ALU and memory units active in parallel.

Also, interleave vst2.8 {d0, d2} with the vhadd; it looks like most of the work is memory transfer. The vhadd will block on data dependencies, such as d9, whose load may or may not have been covered by the pld but is not scheduled well.

I am not that familiar with NEON, but the following is an attempt to follow what I said.

/*
 * Reordered loop body: loads from the two source lines alternate, each
 * vhadd is paired with a store whose data is already loaded, and the
 * prefetches for the next iteration are issued between the final stores.
 */
__asm__ __volatile__(
    "pld [%[lineOdd], #32]\n\t"                           // 2nd part of odd.
    "vld4.8   {d0, d1, d2, d3}, [%[lineOdd],:128]!\n\t"
    "pld [%[lineEven], #32]\n\t"                          // 2nd part of even.
    "vld4.8   {d8, d9, d10, d11}, [%[lineEven],:128]!\n\t"
    "vld4.8   {d4, d5, d6, d7}, [%[lineOdd],:128]!\n\t"
    "vld4.8   {d12, d13, d14, d15}, [%[lineEven],:128]!\n\t"
    "vhadd.u8   d1, d1, d9\n\t"
    // First in memory pipe, so write early.
    "vst2.8 {d0, d2}, [%[resultYOdd],:128]!\n\t"
    "vhadd.u8   d3, d3, d11\n\t"
    "vst2.8 {d8, d10}, [%[resultYEven],:128]!\n\t"
    "vhadd.u8   d5, d5, d13\n\t"
    "vst2.8 {d4, d6}, [%[resultYOdd],:128]!\n\t"
    "vhadd.u8   d7, d7, d15\n\t"
    "vst2.8 {d12, d14}, [%[resultYEven],:128]!\n\t"
    "pld [%[lineOdd]]\n\t"                                // 1st part of odd.
    "vst2.8 {d1, d3}, [%[resultUV],:128]!\n\t"
    "pld [%[lineEven]]\n\t"                               // 1st part of even.
    "vst2.8 {d5, d7}, [%[resultUV],:128]!\n\t"
    : [lineOdd]"+r"(lineOdd), [lineEven]"+r"(lineEven),
      [resultYOdd]"+r"(resultYOdd), [resultYEven]"+r"(resultYEven),
      [resultUV]"+r"(resultUV)
    :
    // All NEON registers written above are listed; d8-d15 are callee-saved,
    // so the compiler must know they are overwritten.
    : "memory",
      "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
      "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
);

Things I may have gotten wrong include the stride of the NEON operations; I have no idea how wide your registers are (64/128 bits), so more PLDs may be needed, etc. It is better to interleave the store operations with the additions. In particular, some dX registers will be loaded before others and will be ready to use. Otherwise, your ALU (vhadd) will block waiting for the data to load.

You may also wish to prime the loop with pld[lineOdd] and pld[lineEven] before things begin.

Optimizing Cortex-A8 color conversion using NEON

Tags:

assembly

cpu-cache

arm

neon

cortex-a8

jmh

1 Answers

artless noise

Recent Activity

Donate For Us

Optimizing Cortex-A8 color conversion using NEON

Tags:

assembly

cpu-cache

arm

neon

cortex-a8

jmh

1 Answers

artless noise

Related questions

Recent Activity

Donate For Us