
Parallelizing the histogram function

I have implemented a normal and a parallel version of a simple function that calculates a histogram from a 32bppArgb bitmap. The normal version takes about 0.03 seconds on a 1920x1080 image, while the parallel version takes about 0.07 seconds.

Is threading overhead really that heavy? Is there some other construct besides Parallel.For that could speed up this process? I need to speed this up since I am working with 30fps video.

Here is the simplified code:

public sealed class Histogram
{
    public int MaxA = 0;
    public int MaxR = 0;
    public int MaxG = 0;
    public int MaxB = 0;
    public int MaxT = 0;

    public int [] A = null;
    public int [] R = null;
    public int [] G = null;
    public int [] B = null;

    public Histogram ()
    {
        this.A = new int [256];
        this.R = new int [256];
        this.G = new int [256];
        this.B = new int [256];

        this.Initialize();
    }

    public void Initialize ()
    {
        this.MaxA = 0;
        this.MaxR = 0;
        this.MaxG = 0;
        this.MaxB = 0;
        this.MaxT = 0;

        for (int i = 0; i < this.A.Length; i++)
            this.A [i] = 0;
        for (int i = 0; i < this.R.Length; i++)
            this.R [i] = 0;
        for (int i = 0; i < this.G.Length; i++)
            this.G [i] = 0;
        for (int i = 0; i < this.B.Length; i++)
            this.B [i] = 0;
    }

    public void ComputeHistogram (System.Drawing.Bitmap bitmap, bool parallel = false)
    {
        System.Drawing.Imaging.BitmapData data = null;

        data = bitmap.LockBits
        (
            new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height),
            System.Drawing.Imaging.ImageLockMode.ReadOnly,
            System.Drawing.Imaging.PixelFormat.Format32bppArgb
        );

        try
        {
            ComputeHistogram(data, parallel);
        }
        catch
        {
            bitmap.UnlockBits(data);

            throw;
        }

        bitmap.UnlockBits(data);
    }

    public void ComputeHistogram (System.Drawing.Imaging.BitmapData data, bool parallel = false)
    {
        int stride = System.Math.Abs(data.Stride);

        this.Initialize();

        if (parallel)
        {
            unsafe
            {
                System.Threading.Tasks.Parallel.For
                (
                    0,
                    data.Height,
                    new System.Threading.Tasks.ParallelOptions() { MaxDegreeOfParallelism = System.Environment.ProcessorCount },
                    y =>
                    {
                        byte* pointer = ((byte*) data.Scan0) + (stride * y);

                        for (int x = 0; x < stride; x += 4)
                        {
                            this.B [pointer [x + 0]]++;
                            this.G [pointer [x + 1]]++;
                            this.R [pointer [x + 2]]++;
                            this.A [pointer [x + 3]]++;
                        }
                    }
                );
            }
        }
        else
        {
            unsafe
            {
                for (int y = 0; y < data.Height; y++)
                {
                    byte* pointer = ((byte*) data.Scan0) + (stride * y);

                    for (int x = 0; x < stride; x += 4)
                    {
                        this.B [pointer [x + 0]]++;
                        this.G [pointer [x + 1]]++;
                        this.R [pointer [x + 2]]++;
                        this.A [pointer [x + 3]]++;
                    }
                }
            }
        }

        for (int i = 0; i < this.A.Length; i++)
            if (this.MaxA < this.A [i]) this.MaxA = this.A [i];
        for (int i = 0; i < this.R.Length; i++)
            if (this.MaxR < this.R [i]) this.MaxR = this.R [i];
        for (int i = 0; i < this.G.Length; i++)
            if (this.MaxG < this.G [i]) this.MaxG = this.G [i];
        for (int i = 0; i < this.B.Length; i++)
            if (this.MaxB < this.B [i]) this.MaxB = this.B [i];

        if (this.MaxT < this.MaxA) this.MaxT = this.MaxA;
        if (this.MaxT < this.MaxR) this.MaxT = this.MaxR;
        if (this.MaxT < this.MaxG) this.MaxT = this.MaxG;
        if (this.MaxT < this.MaxB) this.MaxT = this.MaxB;
    }
}
asked Feb 15 '13 by Raheel Khan


1 Answer

Well, first off, you've got a huge bug in your Parallel loop:

You're going to have multiple threads accessing, incrementing, and updating shared arrays - just running your sample code on the same image several times produces wildly different counts because of the inherent race conditions.

But that's not what you asked.

As for why you're seeing a decrease in performance with the parallel implementation, the simple answer is that you are probably not doing enough work in the body of each parallel task to offset the "spin-up cost" of creating the task, scheduling it, and so on.

Probably more critical is that I believe you are thrashing the hell out of the L1/L2 cache with all the jumping around in memory. Each task thread tries to load what it thinks it will need into cache, but because you are indexing all over the place you no longer have a consistent access pattern, so you will likely get cache misses nearly every time you touch either the bitmap buffer or the internal arrays.
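
(As an aside: one common way to sidestep both the race and the shared-counter contention is to give each worker its own private bins and merge them once at the end. The following is only a rough sketch of that idea - the pixels/stride/height names are placeholders for a managed copy of the pixel data like the one built below - not a drop-in replacement for the code in the question:)

// Sketch: the thread-local overload of Parallel.For gives every worker its own
// int[4 * 256] of bins (localInit), the hot loop touches only that private copy,
// and localFinally folds each private copy into the shared arrays once per worker.
System.Threading.Tasks.Parallel.For
(
    0,
    height,
    () => new int[4 * 256],                     // localInit: private bins
    (y, state, local) =>
    {
        int rowStart = stride * y;
        for (int x = rowStart; x < rowStart + stride; x += 4)
        {
            local[0 * 256 + pixels[x + 0]]++;   // B
            local[1 * 256 + pixels[x + 1]]++;   // G
            local[2 * 256 + pixels[x + 2]]++;   // R
            local[3 * 256 + pixels[x + 3]]++;   // A
        }
        return local;
    },
    local =>                                    // localFinally: merge (runs rarely)
    {
        for (int i = 0; i < 256; i++)
        {
            System.Threading.Interlocked.Add(ref this.B[i], local[0 * 256 + i]);
            System.Threading.Interlocked.Add(ref this.G[i], local[1 * 256 + i]);
            System.Threading.Interlocked.Add(ref this.R[i], local[2 * 256 + i]);
            System.Threading.Interlocked.Add(ref this.A[i], local[3 * 256 + i]);
        }
    }
);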

There's also an equally performant way of getting at the read-only data of the bitmap without using unsafe code... actually, let's do that first:

So you have, by calling LockBits, a pointer to unmanaged memory. Let's make a copy of it:

System.Drawing.Imaging.BitmapData data = null;
data = bitmap.LockBits
(
    new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height),
    System.Drawing.Imaging.ImageLockMode.ReadOnly,
    System.Drawing.Imaging.PixelFormat.Format32bppArgb
);

// For later usage
var imageStride = data.Stride;
var imageHeight = data.Height;

// allocate space to hold the data
byte[] buffer = new byte[data.Stride * data.Height];

// Source will be the bitmap scan data
IntPtr pointer = data.Scan0;

// the CLR marshalling system knows how to move blocks of bytes around, FAST.
// (Marshal lives in System.Runtime.InteropServices)
Marshal.Copy(pointer, buffer, 0, buffer.Length);

// and now we can unlock this since we don't need it anymore
bitmap.UnlockBits(data);

ComputeHistogram(buffer, imageStride, imageHeight, parallel);

Now, as for the race condition - you can overcome this in a reasonably performant manner by using the Interlocked calls to bump up the counts (NOTE!!! Multithreaded programming is HARD, and it's entirely possible my solution here isn't perfect!)

// (Requires using System.Linq, System.Threading and System.Threading.Tasks.)
public void ComputeHistogram (byte[] data, int stride, int height, bool parallel = false)
{
    this.Initialize();

    if (parallel)
    {
        System.Threading.Tasks.Parallel.For
        (
            0,
            height,
            new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
            y =>
            {
                int startIndex = (stride * y);
                int endIndex = stride * (y+1);
                for (int x = startIndex; x < endIndex; x += 4)
                {
                    // Interlocked actions are more-or-less atomic 
                    // (caveats abound, but this should work for us)
                    Interlocked.Increment(ref this.B[data[x]]);
                    Interlocked.Increment(ref this.G[data[x+1]]);
                    Interlocked.Increment(ref this.R[data[x+2]]);
                    Interlocked.Increment(ref this.A[data[x+3]]);
                }
            }
        );
    }
    else
    {
        // the original way is ok for non-parallel, since only one
        // thread is mucking around with the data
    }

    // Sorry, couldn't help myself, this just looked "cleaner" to me
    this.MaxA = this.A.Max();
    this.MaxR = this.R.Max();
    this.MaxG = this.G.Max();
    this.MaxB = this.B.Max();
    this.MaxT = new[] { this.MaxA, this.MaxB, this.MaxG, this.MaxR }.Max();
}

So, what does this do to the runtime behavior?

Not a whole lot, but at least the parallel fork calculates the correct results now. :)
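
(A quick way to convince yourself of that - just a sketch, assuming bmp is a 32bppArgb Bitmap and the Bitmap overload is still wired up - is to run both paths over the same frame and compare the bins:)

// Sanity check (sketch): serial and parallel runs over the same bitmap should
// now fill identical bins. SequenceEqual needs using System.Linq.
var serial = new Histogram();
var threaded = new Histogram();
serial.ComputeHistogram(bmp, parallel: false);
threaded.ComputeHistogram(bmp, parallel: true);

bool identical =
    serial.A.SequenceEqual(threaded.A) &&
    serial.R.SequenceEqual(threaded.R) &&
    serial.G.SequenceEqual(threaded.G) &&
    serial.B.SequenceEqual(threaded.B);

Console.WriteLine(identical ? "Histograms match." : "Histograms differ!");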

Using a really cheapo test rig:

void Main()
{    
    foreach(var useParallel in new[]{false, true})
    {
        var totalRunTime = TimeSpan.Zero;
        var sw = new Stopwatch();
        var runCount = 10;
        for(int run=0; run < runCount; run++)
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();
            sw.Reset();
            sw.Start();
            var bmp = Bitmap.FromFile(@"c:\temp\banner.bmp") as Bitmap;
            var hist = new Histogram();
            hist.ComputeHistogram(bmp, useParallel);
            sw.Stop();
            totalRunTime = totalRunTime.Add(sw.Elapsed);
        }
        Console.WriteLine("Parallel={0}, Avg={1} ms", useParallel, totalRunTime.TotalMilliseconds / runCount);
    }
}

I get results like this:

Parallel=False, Avg=1.69777 ms
Parallel=True, Avg=5.33584 ms

As you can see, we still haven't addressed your original question. :)

So let's take a stab at making the parallel work "more better":

Let's see what "giving more work" to the tasks does:

if (parallel)
{
    var batchSize = 2;
    System.Threading.Tasks.Parallel.For
    (
        0,
        (height + batchSize - 1) / batchSize, // round up so trailing rows aren't skipped
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        batch =>
        {
            int startIndex = stride * batch * batchSize;
            // clamp the last batch so it doesn't read past the end of the buffer
            int endIndex = Math.Min(startIndex + (stride * batchSize), stride * height);
            for (int x = startIndex; x < endIndex; x += 4)
            {
                // Interlocked actions are more-or-less atomic 
                // (caveats abound, but this should work for us)
                Interlocked.Increment(ref this.B[data[x]]);
                Interlocked.Increment(ref this.G[data[x+1]]);
                Interlocked.Increment(ref this.R[data[x+2]]);
                Interlocked.Increment(ref this.A[data[x+3]]);
            }
        }
    );
}

Results:

Parallel=False, Avg=1.70273 ms
Parallel=True, Avg=4.82591 ms

Ooh, that looks promising...I wonder what happens as we vary batchSize?

Let's change our test rig thusly:

void Main()
{    
    foreach(var useParallel in new[]{false, true})
    {
        for(int batchSize = 1; batchSize < 1024; batchSize <<= 1)
        {
            var totalRunTime = TimeSpan.Zero;
            var sw = new Stopwatch();
            var runCount = 10;
            for(int run=0; run < runCount; run++)
            {
                GC.Collect();
                GC.WaitForPendingFinalizers();
                GC.Collect();
                sw.Reset();
                sw.Start();
                var bmp = Bitmap.FromFile(@"c:\temp\banner.bmp") as Bitmap;
                var hist = new Histogram();
                hist.ComputeHistogram(bmp, useParallel, batchSize); // assumes ComputeHistogram now takes the batch size as a parameter
                sw.Stop();
                totalRunTime = totalRunTime.Add(sw.Elapsed);
            }
            Console.WriteLine("Parallel={0}, BatchSize={1} Avg={2} ms", useParallel, batchSize, totalRunTime.TotalMilliseconds / runCount);
        }        
    }
}

Results: (only showing parallel=true, since non-parallel won't change)

Parallel=True, BatchSize=1 Avg=5.57644 ms
Parallel=True, BatchSize=2 Avg=5.49982 ms
Parallel=True, BatchSize=4 Avg=5.20434 ms
Parallel=True, BatchSize=8 Avg=5.1721 ms
Parallel=True, BatchSize=16 Avg=5.00405 ms
Parallel=True, BatchSize=32 Avg=4.44973 ms
Parallel=True, BatchSize=64 Avg=2.28332 ms
Parallel=True, BatchSize=128 Avg=1.39957 ms
Parallel=True, BatchSize=256 Avg=1.29156 ms
Parallel=True, BatchSize=512 Avg=1.28656 ms

We seem to be approaching an asymptote of sorts once we crest the 64-128 range in batch size, although of course your mileage may vary depending on your bitmap sizes, etc.
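
(Incidentally, if you'd rather not hand-roll batchSize at all, a range partitioner gets you the same chunking; a rough sketch, reusing the data/stride/height parameters from the method above:)

// Sketch: Partitioner.Create hands each task a contiguous block of rows, so the
// per-iteration overhead is paid once per block instead of once per row.
// Requires using System.Collections.Concurrent.
System.Threading.Tasks.Parallel.ForEach
(
    System.Collections.Concurrent.Partitioner.Create(0, height),
    range =>
    {
        for (int y = range.Item1; y < range.Item2; y++)
        {
            int rowStart = stride * y;
            for (int x = rowStart; x < rowStart + stride; x += 4)
            {
                Interlocked.Increment(ref this.B[data[x]]);
                Interlocked.Increment(ref this.G[data[x + 1]]);
                Interlocked.Increment(ref this.R[data[x + 2]]);
                Interlocked.Increment(ref this.A[data[x + 3]]);
            }
        }
    }
);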

I hope this helps! It was a fun distraction from my day of waiting for production builds to complete! :)

answered Oct 15 '22 by JerKimball