Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

C++ array access optimization

Im modding an open-source project(powder toy) and always compiling with /O2 (maximize speed) option, SSE2 code generation enabled and just inspected this:

void membwand(void * destv, void * srcv, size_t destsize, size_t srcsize)
{
    size_t i;
    unsigned char * dest = (unsigned char*)destv;
    unsigned char * src = (unsigned char*)srcv;
    for(i = 0; i < destsize; i++){
        dest[i] = dest[i] & src[i%srcsize];
    }
}

Here I replaced below lines

membwand(gravy, gravmask, size_dst, size_src);

membwand(gravx, gravmask, size_dst, size_src);

    // gravy, gravx and gravmask are 1MB each

with

membwand2(gravy,gravx, gravmask, size_dst, size_src);   
    // sizes are same, why not put all in a single function?

which is implemented as:

void membwand2(void * destv1, void * destv2,void * srcv, size_t destsize, size_t srcsize)
{
    if(destsize!=srcsize)
    {
        size_t i;
        unsigned char * dest1 = (unsigned char*)destv1;
        unsigned char * dest2 = (unsigned char*)destv2;
        unsigned char * src = (unsigned char*)srcv;
        for(i = 0; i < destsize; i++)
        {
            dest1[i] = dest1[i] & src[i%srcsize];
            dest2[i] = dest2[i] & src[i%srcsize];
        }
    }
    else
    {
        size_t i;
        unsigned char * dest1 = (unsigned char*)destv1;unsigned char * dest2 = (unsigned char*)destv2;
        unsigned char * src = (unsigned char*)srcv;
        unsigned char tmpChar0=0;unsigned char tmpChar1=0;unsigned char tmpChar2=0;
        unsigned char tmpChar3=0;unsigned char tmpChar4=0;unsigned char tmpChar5=0;
        unsigned char tmpChar6=0;unsigned char tmpChar7=0;unsigned char tmpChar8=0;
        unsigned char tmpChar9=0;unsigned char tmpChar10=0;unsigned char tmpChar11=0;
        unsigned char tmpChar12=0;unsigned char tmpChar13=0;unsigned char tmpChar14=0;
        unsigned char tmpChar15=0;unsigned char tmpChar16=0;unsigned char tmpChar17=0;
        unsigned char tmpChar18=0;unsigned char tmpChar19=0;unsigned char tmpChar20=0;
        unsigned char tmpChar21=0;unsigned char tmpChar22=0;unsigned char tmpChar23=0;
        unsigned char tmpChar24=0;unsigned char tmpChar25=0;unsigned char tmpChar26=0;
        unsigned char tmpChar27=0;unsigned char tmpChar28=0;unsigned char tmpChar29=0;
        unsigned char tmpChar30=0;unsigned char tmpChar31=0;
        for(i = 0; i < destsize; i+=32)
        {
            tmpChar0=src[i];tmpChar1=src[i+1];tmpChar2=src[i+2];tmpChar3=src[i+3];
            tmpChar4=src[i+4];tmpChar5=src[i+5];tmpChar6=src[i+6];tmpChar7=src[i+7];

            tmpChar8=src[i+8];tmpChar9=src[i+9];tmpChar10=src[i+10];tmpChar11=src[i+11];
            tmpChar12=src[i+12];tmpChar13=src[i+13];tmpChar14=src[i+14];tmpChar15=src[i+15];

            tmpChar16=src[i+16];tmpChar17=src[i+17];tmpChar18=src[i+18];tmpChar19=src[i+19];
            tmpChar20=src[i+20];tmpChar21=src[i+21];tmpChar22=src[i+22];tmpChar23=src[i+23];

            tmpChar24=src[i+24];tmpChar25=src[i+25];tmpChar26=src[i+26];tmpChar27=src[i+27];
            tmpChar28=src[i+28];tmpChar29=src[i+29];tmpChar30=src[i+30];tmpChar31=src[i+31];


            dest1[i] = dest1[i] & tmpChar0;
            dest1[i+1] = dest1[i+1] & tmpChar1;
            dest1[i+2] = dest1[i+2] & tmpChar2;
            dest1[i+3] = dest1[i+3] & tmpChar3;
            dest1[i+4] = dest1[i+4] & tmpChar4;
            dest1[i+5] = dest1[i+5] & tmpChar5;
            dest1[i+6] = dest1[i+6] & tmpChar6;
            dest1[i+7] = dest1[i+7] & tmpChar7;
            dest1[i+8] = dest1[i+8] & tmpChar8;
            dest1[i+9] = dest1[i+9] & tmpChar9;
            dest1[i+10] = dest1[i+10] & tmpChar10;
            dest1[i+11] = dest1[i+11] & tmpChar11;
            dest1[i+12] = dest1[i+12] & tmpChar12;
            dest1[i+13] = dest1[i+13] & tmpChar13;
            dest1[i+14] = dest1[i+14] & tmpChar14;
            dest1[i+15] = dest1[i+15] & tmpChar15;
            dest1[i+16] = dest1[i+16] & tmpChar16;
            dest1[i+17] = dest1[i+17] & tmpChar17;
            dest1[i+18] = dest1[i+18] & tmpChar18;
            dest1[i+19] = dest1[i+19] & tmpChar19;
            dest1[i+20] = dest1[i+20] & tmpChar20;
            dest1[i+21] = dest1[i+21] & tmpChar21;
            dest1[i+22] = dest1[i+22] & tmpChar22;
            dest1[i+23] = dest1[i+23] & tmpChar23;
            dest1[i+24] = dest1[i+24] & tmpChar24;
            dest1[i+25] = dest1[i+25] & tmpChar25;
            dest1[i+26] = dest1[i+26] & tmpChar26;
            dest1[i+27] = dest1[i+27] & tmpChar27;
            dest1[i+28] = dest1[i+28] & tmpChar28;
            dest1[i+29] = dest1[i+29] & tmpChar29;
            dest1[i+30] = dest1[i+30] & tmpChar30;
            dest1[i+31] = dest1[i+31] & tmpChar31;

            dest2[i] = dest2[i] & tmpChar0;
            dest2[i+1] = dest2[i+1] & tmpChar1;
            dest2[i+2] = dest2[i+2] & tmpChar2;
            dest2[i+3] = dest2[i+3] & tmpChar3;
            dest2[i+4] = dest2[i+4] & tmpChar4;
            dest2[i+5] = dest2[i+5] & tmpChar5;
            dest2[i+6] = dest2[i+6] & tmpChar6;
            dest2[i+7] = dest2[i+7] & tmpChar7;
            dest2[i+8] = dest2[i+8] & tmpChar8;
            dest2[i+9] = dest2[i+9] & tmpChar9;
            dest2[i+10] = dest2[i+10] & tmpChar10;
            dest2[i+11] = dest2[i+11] & tmpChar11;
            dest2[i+12] = dest2[i+12] & tmpChar12;
            dest2[i+13] = dest2[i+13] & tmpChar13;
            dest2[i+14] = dest2[i+14] & tmpChar14;
            dest2[i+15] = dest2[i+15] & tmpChar15;
            dest2[i+16] = dest2[i+16] & tmpChar16;
            dest2[i+17] = dest2[i+17] & tmpChar17;
            dest2[i+18] = dest2[i+18] & tmpChar18;
            dest2[i+19] = dest2[i+19] & tmpChar19;
            dest2[i+20] = dest2[i+20] & tmpChar20;
            dest2[i+21] = dest2[i+21] & tmpChar21;
            dest2[i+22] = dest2[i+22] & tmpChar22;
            dest2[i+23] = dest2[i+23] & tmpChar23;
            dest2[i+24] = dest2[i+24] & tmpChar24;
            dest2[i+25] = dest2[i+25] & tmpChar25;
            dest2[i+26] = dest2[i+26] & tmpChar26;
            dest2[i+27] = dest2[i+27] & tmpChar27;
            dest2[i+28] = dest2[i+28] & tmpChar28;
            dest2[i+29] = dest2[i+29] & tmpChar29;
            dest2[i+30] = dest2[i+30] & tmpChar30;
            dest2[i+31] = dest2[i+31] & tmpChar31;


        }
    }
}

Omp wall clock timings were 17ms for two-lines version and 1.5ms for one-line version. (after several thousand iterations)

Question: What could have cause the 9x speed-up? Data re-use of src[] or cache-utilization? Maybe code was not fit for vectorizing before, and unrolling enabled it, could it be the missing modulus operation for indexing? Should I add #pragma omp on top of loop?

Edit: #pragma omp parallel for num_threads(4) made it worse somehow. Maybe just 1MB is not hiding multithread overhead?

Edit2: ommiting the modulus operator made it 2.5 ms but the performance increase from 2.5ms to 1.5ms (~%60 increase) must have come from unrolling/cache/...

Note: enabling "favor fast code", enabling whole program optimization, full optimization(/Ox) and enabling intrinsics did not change the speed(maybe enabling SSE2 and /O2 was enough)

CPU: FX8150 IDE: Msvc

like image 280
huseyin tugrul buyukisik Avatar asked Feb 13 '14 12:02

huseyin tugrul buyukisik


2 Answers

I think loop unrolling is the booster.You may google it for detailed info.

Of course modulus operation can be slow..You may test it by replacing

src[i%srcsize] 

with

src[i] 

for the sake of timing tests.

like image 170
Semih Ozmen Avatar answered Oct 22 '22 12:10

Semih Ozmen


there are several thing to consider here when comparing the two functions; from a high level perspective it looks as though the 2 line version should run faster than the 1 line version, however:

- the 1 line version has better instruction cache locatity
- the 1 line version puts less pressure on the registers( this is important 
 because if the compiler runs out of registers to use it may start using the 
 stack to move parameters around for the instructions it needs - which causes 
 an overhead)

As a optimization tip:

- you can try and use the "restrict"   keyword: **void membwand(void * _restrict_
destv, void *  _restrict_ srcv, size_t destsize, size_t srcsize);** This tells the
compiler that the two pointers are not aliasing - they are will never point at the
same memory location so the read/write instruction can be executed in the same cycle.
like image 29
Pandrei Avatar answered Oct 22 '22 11:10

Pandrei