C optimzation techniques

Question

I am looking for the fastest optimization from the below function w/o getting into assembly because it seems to be the bottle neck of my application. Keep in mind that the following function is already declared inline.

definitions: P = 10 and N = 240

void autocorrelation( int32_t *data , float *r){
    for ( int m=0 ; m < P+1 ; m++)
    {
        register float temp = 0;
        for ( int n=0 ; n<N-m ; n++)
        {
            temp += (float)(data[n])*(float)(data[n+m]);
        }
        r[m] = temp;
    }
}

any help is appreciated.

Thanks!

EDIT:

assembly:

temp += (float)(data[n])*(float)(data[n+m]);
800063A8  lddsp R8, SP[0x0]      
800063AA  add R1, R2, R8<<0      
800063AE  ld.w R12, R1[R7<<0]        
800063B2  mcall 0x80006f58       
800063B6  mov R4, R12        
800063B8  ld.w R12, R2[R7<<0]        
800063BC  mcall 0x80006f58       
800063C0  mov R11, R12       
800063C2  mov R12, R4        
800063C4  mcall 0x80006f5c       
800063C8  mov R11, R12       
800063CA  mov R12, R5        
800063CC  mcall 0x80006f60       
800063D0  mov R5, R12        
for ( int n=0 ; n<N-m ; n++)
800063D2  sub R6, -1         
800063D4  sub R7, -4         
800063D6  cp.w R6, R3        
800063D8  brne 0x800063ae        
r[m] = temp;
800063DA  ld.w R10, PC[2954]         
800063DE  lddsp R9, SP[0x0]      
800063E0  st.w R10[R9<<0], R5        
800063E4  sub R0, 1      
800063E6  sub R9, -4         
800063E8  stdsp SP[0x0], R9      
for ( int m=0 ; m < P+1 ; m++)
800063EA  cp.w R0, 229       
800063EE  breq 0x800063fc        
800063F0  mov R3, R0         
for ( int n=0 ; n<N-m ; n++)
800063F2  cp.w R0, 0         
800063F4  brgt 0x800063a2        
800063F8  mov R5, 0      
800063FA  rjmp 0x800063da

////////////////////////////////////////////////////////////////////////////////

So I changed the code to:

void autocorrelation( float *data , float *r){
    for ( int m=0 ; m < P+1 ; m++)
    {
        register float temp = 0;
        for ( int n=0 ; n<N-m ; n++)
        {
            temp += data[n]*data[n+m];
        }
        r[m] = temp;
    }
}

and cut the time by a third (each tick is 1/16000Hz) - originally - 108 ticks now - 70 ticks

new assembly:

temp += data[n]*data[n+m];
800063C2  add R2, R3, R0<<0      
800063C6  ld.w R11, R3[R7<<0]        
800063CA  ld.w R12, R2[R7<<0]        
800063CE  mcall 0x80006f68       
800063D2  mov R11, R12       
800063D4  mov R12, R5        
800063D6  mcall 0x80006f6c       
800063DA  mov R5, R12        
for ( int n=0 ; n<N-m ; n++)
800063DC  sub R6, -1         
800063DE  sub R7, -4         
800063E0  cp.w R6, R4        
800063E2  brne 0x800063c6        
r[m] = temp;
800063E4  ld.w R9, PC[2960]      
800063E8  st.w R9[R0<<0], R5         
800063EC  sub R1, 1      
800063EE  sub R0, -4         
for ( int m=0 ; m < P+1 ; m++)
800063F0  cp.w R1, 229       
800063F4  breq 0x80006402        
800063F6  mov R4, R1         
for ( int n=0 ; n<N-m ; n++)
800063F8  cp.w R1, 0         
800063FA  brgt 0x800063bc        
800063FE  mov R5, 0      
80006400  rjmp 0x800063e4

////////////////////////////////////////////////////// FINAL Final Solution: (changed again)

I combined the marked solution and unwound the loop with an application I wrote and stayed in 64bits until the end, performance increase from 60ticks to 20ticks from last. Over 6 functions with the same tweaks, I was able to pull from the very beginning of what seemed to be optimized code at 250 ticks down to 50 ticks where my ping pong buffers needed everything to get done within 160 ticks, so I have some head room:

void fastAutocorrelation( int64_t *data , float *r){

int64_t *temp;
int64_t *datan = data;
int64_t *datanm = data;

*temp = (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
*r++ = (float)(*temp)/int64ToFloat;

datan = data;
datanm = data + 1;
*temp = (*datan++)*(*datanm++);
*temp += (*datan++)*(*datanm++);
...

Raymond Chen · Accepted Answer

Observe that your processor does not have any floating point capabilities; the floating point operations are being emulated in software. This means that the bottleneck is not the loop control (which the compiler is already doing a good job of strength-reducing). The bottleneck is the floating point emulator.

Given that your processor does not have native floating point, it's likely that it also does not have a large L1 cache. Changing the order of the loop controls may improve data locality. The original code makes 10 sweeps across a 240-element array, which is poor locality. Better would be to make one sweep across the array, studying 10 items at a time.

void autocorrelation( int32_t *data , float *r){
  int m, n;
  for (m = 0; m < P + 1; m++) r[m] = 0.0f;
  for (n = 0; n < N; n++) {
    int limit = min(P + 1, N - n);
    for (m = 0; m < limit; m++) {
      r[m] += data[n] * data[n+m];
    }
  }
}

(Note that conversion to pointers won't help, because the original code was already optimized by the compiler to use pointers.)

paddy · Answer

I see you've cut back a bit by removing the casts. If you're after pointer tricks, here's one to try. Depending on your compiler, your mileage will vary:

void autocorrelation( float *restrict data, float *restrict r)
{
    float *data_end = data + N;

    for ( int m=0 ; m < P+1 ; m++)
    {
        float temp = 0;

        for( float *data_n = data, *data_nm = data + m;
             data_nm != data_end;
             data_n++, data_nm++ )
        {
            temp += *data_n * *data_nm;
        }

        r[m] = temp;
    }
}

I added in the restrict keyword so the compiler knows data and r do not overlap or point to the same thing.

C optimzation techniques

Tags:

performance

c

function

optimization

user2368363

2 Answers

Raymond Chen

paddy

Recent Activity

Donate For Us

C optimzation techniques

Tags:

performance

c

function

optimization

user2368363

2 Answers

Raymond Chen

paddy

Related questions

Recent Activity

Donate For Us