Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Fastest way to operate on individual bytes in an int

I found that my application spends 25% of its time doing this in a loop:

private static int Diff (int c0, int c1)
{
    unsafe {
        byte* pc0 = (byte*) &c0;
        byte* pc1 = (byte*) &c1;
        int d0 = pc0[0] - pc1[0];
        int d1 = pc0[1] - pc1[1];
        int d2 = pc0[2] - pc1[2];
        int d3 = pc0[3] - pc1[3];
        d0 *= d0;
        d1 *= d1;
        d2 *= d2;
        d3 *= d3;
        return d0 + d1 + d2 + d3;
    }
}

How can I improve the performance of this method? My ideas so far:

  1. Most obviously, this would benefit from SIMD, but let us suppose I don't want to go there because it is a bit of a hassle.
  2. Same goes for lower level stuff (calling a C library, executing on GPGPU)
  3. Multithreading - I'll use that.

Edit: For your convenience, some test code which reflects the real environment and use case. (In reality even more data are involved, and data are not compared in single large blocks but in many chunks of several kb each.)

public static class ByteCompare
{
    private static void Main ()
    {
        const int n = 1024 * 1024 * 20;
        const int repeat = 20;
        var rnd = new Random (0);

        Console.Write ("Generating test data... ");
        var t0 = Enumerable.Range (1, n)
            .Select (x => rnd.Next (int.MinValue, int.MaxValue))
            .ToArray ();
        var t1 = Enumerable.Range (1, n)
            .Select (x => rnd.Next (int.MinValue, int.MaxValue))
            .ToArray ();
        Console.WriteLine ("complete.");
        GC.Collect (2, GCCollectionMode.Forced);
        Console.WriteLine ("GCs: " + GC.CollectionCount (0));

        {
            var sw = Stopwatch.StartNew ();
            long res = 0;
            for (int reps = 0; reps < repeat; reps++) {
                for (int i = 0; i < n; i++) {
                    int c0 = t0[i];
                    int c1 = t1[i];
                    res += ByteDiff_REGULAR (c0, c1);
                }
            }
            sw.Stop ();
            Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_REGULAR");
        }
        {
            var sw = Stopwatch.StartNew ();
            long res = 0;
            for (int reps = 0; reps < repeat; reps++) {
                for (int i = 0; i < n; i++) {
                    int c0 = t0[i];
                    int c1 = t1[i];
                    res += ByteDiff_UNSAFE (c0, c1);
                }
            }
            sw.Stop ();
            Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_UNSAFE_PTR");
        }

        Console.WriteLine ("GCs: " + GC.CollectionCount (0));
        Console.WriteLine ("Test complete.");
        Console.ReadKey (true);
    }

    public static int ByteDiff_REGULAR (int c0, int c1)
    {
        var c00 = (byte) (c0 >> (8 * 0));
        var c01 = (byte) (c0 >> (8 * 1));
        var c02 = (byte) (c0 >> (8 * 2));
        var c03 = (byte) (c0 >> (8 * 3));
        var c10 = (byte) (c1 >> (8 * 0));
        var c11 = (byte) (c1 >> (8 * 1));
        var c12 = (byte) (c1 >> (8 * 2));
        var c13 = (byte) (c1 >> (8 * 3));
        var d0 = (c00 - c10);
        var d1 = (c01 - c11);
        var d2 = (c02 - c12);
        var d3 = (c03 - c13);
        d0 *= d0;
        d1 *= d1;
        d2 *= d2;
        d3 *= d3;
        return d0 + d1 + d2 + d3;
    }

    private static int ByteDiff_UNSAFE (int c0, int c1)
    {
        unsafe {
            byte* pc0 = (byte*) &c0;
            byte* pc1 = (byte*) &c1;
            int d0 = pc0[0] - pc1[0];
            int d1 = pc0[1] - pc1[1];
            int d2 = pc0[2] - pc1[2];
            int d3 = pc0[3] - pc1[3];
            d0 *= d0;
            d1 *= d1;
            d2 *= d2;
            d3 *= d3;
            return d0 + d1 + d2 + d3;
        }
    }
}

which yields for me (running as x64 Release on an i5):

Generating test data... complete.
GCs: 8
res=18324555528140, t=1.46s - ByteDiff_REGULAR
res=18324555528140, t=1.15s - ByteDiff_UNSAFE
res=18324555528140, t=1.73s - Diff_Alex1
res=18324555528140, t=1.63s - Diff_Alex2
res=18324555528140, t=3.59s - Diff_Alex3
res=18325828513740, t=3.90s - Diff_Alex4
GCs: 8
Test complete.
like image 540
mafu Avatar asked May 02 '15 22:05

mafu


People also ask

Is a C++ int always 4 bytes?

The size of an int is really compiler dependent. Back in the day, when processors were 16 bit, an int was 2 bytes. Nowadays, it's most often 4 bytes on a 32-bit as well as 64-bit systems. Still, using sizeof(int) is the best way to get the size of an integer for the specific system the program is executed on.

Does int use 2 bytes or 4 bytes?

On 16-bit systems (like in arduino), int takes up 2 bytes while on 32-bit systems, int takes 4 bytes since 32-bit=4bytes but even on 64-bit systems, int occupies 4 bytes.

How are 4 bytes int stored?

Integers are commonly stored using a word of memory, which is 4 bytes or 32 bits, so integers from 0 up to 4,294,967,295 (232 - 1) can be stored.


2 Answers

Most obviously, this would benefit from SIMD, but let us suppose I don't want to go there because it is a bit of a hassle.

Well avoid it if you want, but it's actually fairly well supported directly from C#. Short of offloading to the GPU, I would expect this to be by far the largest performance winner if the larger algorithm lends itself to SIMD processing.

http://www.drdobbs.com/architecture-and-design/simd-enabled-vector-types-with-c/240168888

Multithreading

Sure, use one thread per CPU core. You can also use constructs like Parallel.For and let .NET sort out how many threads to use. It's pretty good at that, but since you know this is certainly CPU bound you might (or might not) get a more optimal result by managing threads yourself.

As for speeding up the actual code block, it may be faster to use bit masking and bit shifting to get the individual values to work on, rather than using pointers. That has the additional benefit that you don't need an unsafe code block, e.g.

byte b0_leftmost = (c0 & 0xff000000) >> 24;
like image 160
Eric J. Avatar answered Oct 05 '22 23:10

Eric J.


Besides the already mentioned SIMD options and running multiple operations in parallel, have you tried to benchmark some possible implementation variations on the theme? Like some of the below options.

I almost forgot to mention a very important optimization:

  • Add a using System.Runtime.CompilerServices;
  • Add the [MethodImpl(MethodImplOptions.AggressiveInlining)] attribute to your method.

Like this:

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unsafe
    {
        byte* pc0 = (byte*)&c0;
        byte* pc1 = (byte*)&c1;
        int sum = 0;
        int dif = 0;
        for (var i = 0; i < 4; i++, pc0++, pc1++)
        {
            dif = *pc0 - *pc1;
            sum += (dif * dif);
        }
        return sum;
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unchecked
    {
        int sum = 0;
        int dif = 0;
        for (var i = 0; i < 4; i++)
        {
            dif = (c0 & 0xFF) - (c1 & 0xFF);
            c0 >>= 8;
            c1 >>= 8;
            sum += (dif * dif);
        }
        return sum;
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unsafe
    {
        int* difs = stackalloc int[4];
        byte* pc0 = (byte*)&c0;
        byte* pc1 = (byte*)&c1;
        difs[0] = pc0[0] - pc1[0];
        difs[1] = pc0[1] - pc1[1];
        difs[2] = pc0[2] - pc1[2];
        difs[3] = pc0[3] - pc1[3];
        return difs[0] * difs[0] + difs[1] * difs[1] + difs[2] * difs[2] + difs[3] * difs[3];
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unsafe
    {
        int* difs = stackalloc int[4];
        difs[0] = (c0 >> 24) - (c1 >> 24);
        difs[1] = ((c0 >> 16) & 0xFF) - ((c1 >> 16) & 0xFF);
        difs[2] = ((c0 >> 8) & 0xFF) - ((c1 >> 8) & 0xFF);
        difs[3] = (c0 & 0xFF) - (c1  & 0xFF);
        return difs[0] * difs[0] + difs[1] * difs[1] + difs[2] * difs[2] + difs[3] * difs[3];
    }
}
like image 38
Alex Avatar answered Oct 06 '22 01:10

Alex