Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Intel x86 assembly optimization techniques for expanding 8 bits to 8 boolean bytes of 0 or 1

Tags:

I am learning assembler quite a while and I am trying to rewrite some simple procedures \ functions to it to see performance benefits (if any). My main development tool is Delphi 2007 and first examples will be in that language but they can be easily translated to other languages as well.

The problem states as:

We have given an unsigned byte value in which each of the eight bits represents a pixel in one row of a screen. Each single pixel can be solid (1) or transparent (0). So in other words, we have 8 pixels packed in one byte value. I want to unpack those pixels into an eight byte array in the way that youngest pixel(bit) will land under the lowest index of the array and so on. Here is an example:

One byte value -----------> eight byte array  10011011 -----------------> [1][1][0][1][1][0][0][1]  Array index number ------->  0  1  2  3  4  5  6  7 

Below I present five methods which are solving the problem. Next I will show their time comparison and how I did measure those times.

My questions consist of two parts:

1.

I am asking you for detailed answer concerning methods DecodePixels4a and DecodePixels4b. Why method 4b is somewhat slower than the 4a?

If for example it is slower because my code is not aligned correctly then show me which instructions in a given method could be better aligned and how to do this to not break the method.

I would like to see real examples behind the theory. Please bear in mind that I am learning assembly and I want to gain knowledge from your answers which allows me in the future to writing better optimized code.

2.

Can you write faster routine than DecodePixels4a? If so, please present it and describe optimization steps that you have taken. By faster routine I mean routine that runs in the shortest period of time in your test environment among all the routines presented here.

All Intel family processors are allowed and those which are compatible with them.

Below you will find routines written by me:

procedure DecodePixels1(EncPixels: Byte; var DecPixels: TDecodedPixels); var   i3: Integer; begin   DecPixels[0] := EncPixels and $01;   for i3 := 1 to 7 do   begin     EncPixels := EncPixels shr 1;     DecPixels[i3] := EncPixels and $01;     //DecPixels[i3] := (EncPixels shr i3) and $01;  //this is even slower if you replace above 2 lines with it   end; end;   //Lets unroll the loop and see if it will be faster. procedure DecodePixels2(EncPixels: Byte; var DecPixels: TDecodedPixels); begin   DecPixels[0] := EncPixels and $01;   EncPixels := EncPixels shr 1;   DecPixels[1] := EncPixels and $01;   EncPixels := EncPixels shr 1;   DecPixels[2] := EncPixels and $01;   EncPixels := EncPixels shr 1;   DecPixels[3] := EncPixels and $01;   EncPixels := EncPixels shr 1;   DecPixels[4] := EncPixels and $01;   EncPixels := EncPixels shr 1;   DecPixels[5] := EncPixels and $01;   EncPixels := EncPixels shr 1;   DecPixels[6] := EncPixels and $01;   EncPixels := EncPixels shr 1;   DecPixels[7] := EncPixels and $01; end;   procedure DecodePixels3(EncPixels: Byte; var DecPixels: TDecodedPixels); begin   asm     push eax;     push ebx;     push ecx;     mov bl, al;     and bl, $01;     mov [edx], bl;     mov ecx, $00; @@Decode:     inc ecx;     shr al, $01;     mov bl, al;     and bl, $01;     mov [edx + ecx], bl;     cmp ecx, $07;     jnz @@Decode;     pop ecx;     pop ebx;     pop eax;   end; end;   //Unrolled assembly loop procedure DecodePixels4a(EncPixels: Byte; var DecPixels: TDecodedPixels); begin   asm     push eax;     push ebx;     mov bl, al;     and bl, $01;     mov  [edx], bl;     shr al, $01;     mov bl, al;     and bl, $01;     mov [edx + $01], bl;     shr al, $01;     mov bl, al;     and bl, $01;     mov [edx + $02], bl;     shr al, $01;     mov bl, al;     and bl, $01;     mov [edx + $03], bl;     shr al, $01;     mov bl, al;     and bl, $01;     mov [edx + $04], bl;     shr al, $01;     mov bl, al;     and bl, $01;     mov [edx + $05], bl;     shr al, $01;     mov bl, al;     and bl, $01;     mov [edx + $06], bl;     shr al, $01;     mov bl, al;     and bl, $01;     mov [edx + $07], bl;     pop ebx;     pop eax;   end; end;   // it differs compared to 4a only in switching two instructions (but seven times) procedure DecodePixels4b(EncPixels: Byte; var DecPixels: TDecodedPixels); begin   asm     push eax;     push ebx;     mov bl, al;     and bl, $01;     shr al, $01;          //     mov [edx], bl;        //     mov bl, al;     and bl, $01;     shr al, $01;          //     mov [edx + $01], bl;  //     mov bl, al;     and bl, $01;     shr al, $01;          //     mov [edx + $02], bl;  //     mov bl, al;     and bl, $01;     shr al, $01;          //     mov [edx + $03], bl;  //     mov bl, al;     and bl, $01;     shr al, $01;          //     mov [edx + $04], bl;  //     mov bl, al;     and bl, $01;     shr al, $01;          //     mov [edx + $05], bl;  //     mov bl, al;     and bl, $01;     shr al, $01;          //     mov [edx + $06], bl;  //     mov bl, al;     and bl, $01;     mov [edx + $07], bl;     pop ebx;     pop eax;   end; end; 

And here is how do I test them:

program Test;  {$APPTYPE CONSOLE}  uses   SysUtils, Windows;  type   TDecodedPixels = array[0..7] of Byte; var   Pixels: TDecodedPixels;   Freq, TimeStart, TimeEnd :Int64;   Time1, Time2, Time3, Time4a, Time4b: Extended;   i, i2: Integer;  begin   if QueryPerformanceFrequency(Freq) then   begin     for i2 := 1 to 100 do     begin       QueryPerformanceCounter(TimeStart);       for i := 1 to 100000 do         DecodePixels1(155, Pixels);       QueryPerformanceCounter(TimeEnd);       Time1 := Time1 + ((TimeEnd - TimeStart) / Freq * 1000);        QueryPerformanceCounter(TimeStart);       for i := 1 to 100000 do         DecodePixels2(155, Pixels);       QueryPerformanceCounter(TimeEnd);       Time2 := Time2 + ((TimeEnd - TimeStart) / Freq * 1000);        QueryPerformanceCounter(TimeStart);       for i := 1 to 100000 do         DecodePixels3(155, Pixels);       QueryPerformanceCounter(TimeEnd);       Time3 := Time3 + ((TimeEnd - TimeStart) / Freq * 1000);        QueryPerformanceCounter(TimeStart);       for i := 1 to 100000 do         DecodePixels4a(155, Pixels);       QueryPerformanceCounter(TimeEnd);       Time4a := Time4a + ((TimeEnd - TimeStart) / Freq * 1000);        QueryPerformanceCounter(TimeStart);       for i := 1 to 100000 do         DecodePixels4b(155, Pixels);       QueryPerformanceCounter(TimeEnd);       Time4b := Time4b + ((TimeEnd - TimeStart) / Freq * 1000);      end;     Writeln('Time1 : ' + FloatToStr(Time1 / 100) + ' ms.    <- Delphi loop.');     Writeln('Time2 : ' + FloatToStr(Time2 / 100) + ' ms.    <- Delphi unrolled loop.');     Writeln('Time3 : ' + FloatToStr(Time3/ 100) + ' ms.    <- BASM loop.');     Writeln('Time4a : ' + FloatToStr(Time4a / 100) + ' ms.    <- BASM unrolled loop.');     Writeln('Time4b : ' + FloatToStr(Time4b / 100) + ' ms.    <- BASM unrolled loop instruction switch.');   end;   Readln; end. 

Here are the results from my machine ( Intel® Pentium® E2180 on Win32 XP) :

Time1  : 1,68443549919493 ms.     <- Delphi loop. Time2  : 1,33773024572211 ms.     <- Delphi unrolled loop. Time3  : 1,37015271374424 ms.     <- BASM loop. Time4a : 0,822916962526627 ms.    <- BASM unrolled loop. Time4b : 0,862914462301607 ms.    <- BASM unrolled loop instruction switch. 

The results are pretty stable - times vary only by few percent between each test I've made. And that was always true: Time1 > Time3 > Time 2 > Time4b > Time4a

So I think that de difference between Time4a and Time4b depends of that instructions switch in the method DecodePixels4b. Sometimes it is 4% sometimes it is up to 10% but 4b is always slower than 4a.

I was thinking about another method with usage of MMX instructions to write into memory eight bytes at one time, but I can't figure out fast way to unpack byte into the 64 bit register.

Thank you for your time.


Thank you guys for your valuable input. Whish I could answer all of you at the same time, unfortunately compared to the modern CPU's I have only one "pipe" and can execute only one instruction "reply" at the time ;-) So, I will try sum up some things over here and write additional comments under your answers.

First of all, I wanted to say that before posting my question I came up with the solution presented by Wouter van Nifterick and it was actually way slower then my assembly code. So I've decided not to post that routine here, but you may see that I took the same approach also in my loop Delphi version of the routine. It is commented there because it was giving me worser results.

This is a mystery for me. I've run my code once again with Wouter's and PhilS's routines and here are the results:

Time1  : 1,66535493194387 ms.     <- Delphi loop. Time2  : 1,29115785420688 ms.     <- Delphi unrolled loop. Time3  : 1,33716934524107 ms.     <- BASM loop. Time4a : 0,795041753757838 ms.    <- BASM unrolled loop. Time4b : 0,843520166815013 ms.    <- BASM unrolled loop instruction switch. Time5  : 1,49457681191307 ms.     <- Wouter van Nifterick, Delphi unrolled Time6  : 0,400587402866258 ms.    <- PhiS, table lookup Delphi Time7  : 0,325472442519827 ms.    <- PhiS, table lookup Delphi inline Time8  : 0,37350491544239 ms.     <- PhiS, table lookup BASM 

Look at the Time5 result, quite strange isn't it? I guess I have different Delphi version, since my generated assembly code differs from that provided by Wouter.

Second major edit:


I know why routine 5 was slower on my machnie. I had checked "Range checking" and "Overflow checking" in my compiler options. I've added assembler directive to routine 9 to see if it helps. It seems that with this directive assembly procedure is as good as Delphi inline variant or even slightly better.

Here are the final results:

Time1  : 1,22508325749317 ms.     <- Delphi loop. Time2  : 1,33004145373084 ms.     <- Delphi unrolled loop. Time3  : 1,1473583622526 ms.      <- BASM loop. Time4a : 0,77322594033463 ms.     <- BASM unrolled loop. Time4b : 0,846033593023372 ms.    <- BASM unrolled loop instruction switch. Time5  : 0,688689382044384 ms.    <- Wouter van Nifterick, Delphi unrolled Time6  : 0,503233741036693 ms.    <- PhiS, table lookup Delphi Time7  : 0,385254722925063 ms.    <- PhiS, table lookup Delphi inline Time8  : 0,432993919452751 ms.    <- PhiS, table lookup BASM Time9  : 0,362680491244212 ms.    <- PhiS, table lookup BASM with assembler directive 

Third major edit:


In opinion @Pascal Cuoq and @j_random_hacker the difference in execution times between routines 4a, 4b and 5 is caused by the data dependency. However I have to disagree with that opinion basing on the further tests that I've made.

I've also invented new routine 4c based on 4a. Here it is:

procedure DecodePixels4c(EncPixels: Byte; var DecPixels: TDecodedPixels); begin   asm     push ebx;     mov bl, al;     and bl, 1;     mov [edx], bl;     mov bl, al;     shr bl, 1;     and bl, 1;     mov [edx + $01], bl;     mov bl, al;     shr bl, 2;     and bl, 1;     mov [edx + $02], bl;     mov bl, al;     shr bl, 3;     and bl, 1;     mov [edx + $03], bl;     mov bl, al;     shr bl, 4;     and bl, 1;     mov [edx + $04], bl;     mov bl, al;     shr bl, 5;     and bl, 1;     mov [edx + $05], bl;     mov bl, al;     shr bl, 6;     and bl, 1;     mov [edx + $06], bl;     shr al, 7;     and al, 1;     mov [edx + $07], al;     pop ebx;   end; end; 

I would say it is pretty data dependent.

And here are the tests and results. I've made four tests to make sure there is no accident. I've also added new times for the routines proposed by GJ (Time10a, Time10b).

          Test1  Test2  Test3  Test4  Time1   : 1,211  1,210  1,220  1,213 Time2   : 1,280  1,258  1,253  1,332 Time3   : 1,129  1,138  1,130  1,160  Time4a  : 0,690  0,682  0,617  0,635 Time4b  : 0,707  0,698  0,706  0,659 Time4c  : 0,679  0,685  0,626  0,625 Time5   : 0,715  0,682  0,686  0,679  Time6   : 0,490  0,485  0,522  0,514 Time7   : 0,323  0,333  0,336  0,318 Time8   : 0,407  0,403  0,373  0,354 Time9   : 0,352  0,378  0,355  0,355 Time10a : 1,823  1,812  1,807  1,813 Time10b : 1,113  1,120  1,115  1,118 Time10c : 0,652  0,630  0,653  0,633 Time10d : 0,156  0,155  0,172  0,160  <-- current winner! 

As you may see the results of 4a, 4b, 4c and 5 are very close to each other. Why is that? Because I've removed from 4a, 4b (4c already doesn't have it) two instructions: push eax and pop eax. Since I know I wont use anywhere else in my code the value under eax I do not have to prereserve it. Now my code has only one pair of push/pop so as the routine 5. Routine 5 prereserves value of eax beacause it firstly make copy of it under ecx but it deson't prereserve ecx.

So my conclusion is that: the difference in time execution of 5 and 4a and 4b (before the third edit) didn't concern data dependecny but was caused by additional pair of push / pop instructions.

I am very interested in your comments.

After a few days GJ invented even faster routine (Time 10d) than PhiS's. Nice work GJ!

like image 396
Wodzu Avatar asked Sep 12 '09 11:09

Wodzu


People also ask

What does pop do in x86?

The pop instruction removes the 4-byte data element from the top of the hardware-supported stack into the specified operand (i.e. register or memory location).

What does push do in x86?

A push is a single instruction in x86, which does two things internally. Decrement the ESP register by the size of pushed value. Store the pushed value at current address of ESP register.

What is eax in assembly language?

eax is the 32-bit, "int" size register. It was added in 1985 during the transition to 32-bit processors with the 80386 CPU. I'm in the habit of using this register size, since they also work in 32 bit mode, although I'm trying to use the longer rax registers for everything. ax is the 16-bit, "short" size register.

What does the push instruction do?

The PUSH instruction saves the current PRINT, USING, or ACONTROL status in push-down storage on a last-in, first-out basis. You restore this PRINT, USING, or ACONTROL status later, also on a last-in, first-out basis, by using a POP instruction.


2 Answers

In general, I'd personally stay away from trying to optimize code by using tricks on assembler level, unless you really need that extra 2 or 3% of speed, and you're willing to pay the price of code that is harder to read, maintain and port.

To squeeze that last 1%, you might even have to maintain several versions optimized per processor, and if newer processors and an improved pascal compiler comes along, you're not going to benefit from it.

This Delphi code is faster than your fastest assembler code:

procedure DecodePixels5(EncPixels: Byte; var DecPixels: TDecodedPixels); begin   DecPixels[0] := (EncPixels shr 0) and $01;   DecPixels[1] := (EncPixels shr 1) and $01;   DecPixels[2] := (EncPixels shr 2) and $01;   DecPixels[3] := (EncPixels shr 3) and $01;   DecPixels[4] := (EncPixels shr 4) and $01;   DecPixels[5] := (EncPixels shr 5) and $01;   DecPixels[6] := (EncPixels shr 6) and $01;   DecPixels[7] := (EncPixels shr 7) and $01; end;   Results:  Time1  : 1,03096806151283 ms.    <- Delphi loop. Time2  : 0,740308641141395 ms.   <- Delphi unrolled loop. Time3  : 0,996602425688886 ms.   <- BASM loop. Time4a : 0,608267951561275 ms.   <- BASM unrolled loop. Time4b : 0,574162510648039 ms.   <- BASM unrolled loop instruction switch. Time5  : 0,499628206138524 ms. !!!  <- Delphi unrolled loop 5. 

It's fast because the operations can be done with registers only, instead of needing to store and fetch memory. Modern processors execute this partly in parallel (a new operation can be started before the previous finished), because the results of the consecutive instructions are independent of each other.

The machine code looks like this:

  push ebx;   // DecPixels[0] := (EncPixels shr 0) and 1;   movzx ecx,al   mov ebx,ecx   //  shr ebx,$00   and bl,$01   mov [edx],bl   // DecPixels[1] := (EncPixels shr 1) and 1;   mov ebx,ecx   shr ebx,1   and bl,$01   mov [edx+$01],bl   // DecPixels[2] := (EncPixels shr 2) and 1;   mov ebx,ecx   shr ebx,$02   and bl,$01   mov [edx+$02],bl   // DecPixels[3] := (EncPixels shr 3) and 1;   mov ebx,ecx   shr ebx,$03   and bl,$01   mov [edx+$03],bl   // DecPixels[4] := (EncPixels shr 4) and 1;   mov ebx,ecx   shr ebx,$04   and bl,$01   mov [edx+$04],bl   // DecPixels[5] := (EncPixels shr 5) and 1;   mov ebx,ecx   shr ebx,$05   and bl,$01   mov [edx+$05],bl   // DecPixels[6] := (EncPixels shr 6) and 1;   mov ebx,ecx   shr ebx,$06   and bl,$01   mov [edx+$06],bl   // DecPixels[7] := (EncPixels shr 7) and 1;   shr ecx,$07   and cl,$01   mov [edx+$07],cl   pop ebx; 

Edit: As suggested, a table lookup is indeed faster.

var   PixelLookup:Array[byte] of TDecodedPixels;  // You could precalculate, but the performance gain would hardly be worth it because you call this once only. for I := 0 to 255 do   DecodePixels5b(I, PixelLookup[I]);   procedure DecodePixels7(EncPixels: Byte; var DecPixels: TDecodedPixels); begin   DecPixels := PixelLookup[EncPixels]; end;  Results:  Time1  : 1,03096806151283 ms.    <- Delphi loop. Time2  : 0,740308641141395 ms.   <- Delphi unrolled loop. Time3  : 0,996602425688886 ms.   <- BASM loop. Time4a : 0,608267951561275 ms.   <- BASM unrolled loop. Time4b : 0,574162510648039 ms.   <- BASM unrolled loop instruction switch. Time5  : 0,499628206138524 ms. !!!  <- Delphi unrolled loop 5. Time7 : 0,251533475182096 ms.    <- simple table lookup 
like image 81
Wouter van Nifterick Avatar answered Nov 19 '22 03:11

Wouter van Nifterick


Your asm code is relativity slow because use stack end write 8 times to memory. Check this one...

procedure DecodePixels(EncPixels: Byte; var DecPixels: TDecodedPixels); asm   xor   ecx, ecx   add   al, al   rcl   ecx, 8   add   al, al   rcl   ecx, 8   add   al, al   rcl   ecx, 8   add   al, al   rcl   ecx, 1   mov   [DecPixels + 4], ecx   xor   ecx, ecx   add   al, al   rcl   ecx, 8   add   al, al   rcl   ecx, 8   add   al, al   rcl   ecx, 8   add   al, al   rcl   ecx, 1   mov   [DecPixels], ecx end; 

Maybe is even faster than code with lookup table!

Improved version:

procedure DecodePixelsI(EncPixels: Byte; var DecPixels: TDecodedPixels); asm   mov   ecx, 0    //Faster than: xor   ecx, ecx   add   al, al   rcl   ch, 1   add   al, al   rcl   cl, 1   ror   ecx, 16   add   al, al   rcl   ch, 1   add   al, al   rcl   cl, 1   mov   [DecPixels + 4], ecx   mov   ecx, 0    //Faster than: xor   ecx, ecx   add   al, al   rcl   ch, 1   add   al, al   rcl   cl, 1   ror   ecx, 16   add   al, al   rcl   ch, 1   add   al, al   rcl   cl, 1   mov   [DecPixels], ecx end; 

Version 3:

procedure DecodePixelsX(EncPixels: Byte; var DecPixels: TDecodedPixels); asm   add   al, al   setc  byte ptr[DecPixels + 7]   add   al, al   setc  byte ptr[DecPixels + 6]   add   al, al   setc  byte ptr[DecPixels + 5]   add   al, al   setc  byte ptr[DecPixels + 4]   add   al, al   setc  byte ptr[DecPixels + 3]   add   al, al   setc  byte ptr[DecPixels + 2]   add   al, al   setc  byte ptr[DecPixels + 1]   setnz byte ptr[DecPixels] end; 

Version 4:

const Uint32DecPix : array [0..15] of cardinal = (   $00000000, $00000001, $00000100, $00000101,   $00010000, $00010001, $00010100, $00010101,   $01000000, $01000001, $01000100, $01000101,   $01010000, $01010001, $01010100, $01010101   );  procedure DecodePixelsY(EncPixels: byte; var DecPixels: TDecodedPixels); inline; begin   pcardinal(@DecPixels)^ := Uint32DecPix[EncPixels and $0F];   pcardinal(cardinal(@DecPixels) + 4)^ := Uint32DecPix[(EncPixels and $F0) shr 4]; end; 
like image 44
GJ. Avatar answered Nov 19 '22 05:11

GJ.