Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

5x Performance with Parallel.For... on a Dual Core?

I was doing some experimental calculations for fun, when I came across an interesting result:

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 19636ms
For Loop: 12612ms
Parallel.For Loop: 3835ms

Which is not what I expected.

System: Windows 7 64, i3 2120 [dual core, 4 threads], Visual Studio 2010.

Build: optimizations on, Release mode [no debugger], 32-bit.

Of secondary interest is the disappointing 64-bit performance. While it's more in line with what I'd expect in terms of ratios, it accomplishes this by being slower across the board.

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 23409ms
For Loop: 24373ms
Parallel.For Loop: 6839ms

The calculation is simple: For the indices x & y find the closest Vector3 and store it in 2D array.

The question, if you dare, is to try to explain why the inline for loop is so slow. Bonus points for explaining the 64-bit version's lack of performance.

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace TextureFromPoints
{
    /// <summary>
    /// Console benchmark. For every (x, y) cell of a textureSize x textureSize grid it finds the
    /// point of a random cloud with the smallest squared distance to (x, y, 0), and times three
    /// ways of doing so: the search written directly in Main, a loop calling Computation, and
    /// a Parallel.For calling Computation.
    /// </summary>
    class Program
    {
        // Number of random points searched for every grid cell.
        const int numPoints = 700;
        // Side length of the square result grid; also passed as the scale for random coordinates.
        const int textureSize = 1024;

        // Created without an explicit seed. Read by the single-argument Vector3 constructor below.
        static Random rnd = new Random();

        /// <summary>
        /// Repeats the benchmark forever: builds a fresh point cloud, runs and times the three
        /// variants, prints the timings, checks that the three result grids agree, then waits
        /// for a line of console input before starting the next round.
        /// </summary>
        static void Main(string[] args)
        {
            while (true)
            {
                Console.WriteLine("Starting");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Fill the cloud using the random constructor, scaled by textureSize.
                for (int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One result grid per variant so they can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Variant 1 ("For Loop (Inline)"): the search is written out in the loop body.
                var sw1 = Stopwatch.StartNew();
                for (int x = 0; x < textureSize; x++)
                    for (int y = 0; y < textureSize; y++)
                    {
                        var targetPos = new Vector3(x, y, 0);
                        // Start with the first point as the best candidate.
                        var nearestV3 = pointCloud[0];
                        var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

                        for (int i = 1; i < numPoints; i++)
                        {
                            var currentV3 = pointCloud[i];
                            var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                            // Strict '<': on a tie the earlier point is kept.
                            if (currentV3Distance < nearestV3Distance)
                            {
                                nearestV3 = currentV3;
                                nearestV3Distance = currentV3Distance;
                            }
                        }
                        result1[x, y] = nearestV3;
                    }
                sw1.Stop();

                // Variant 2 ("For Loop"): same loops, but the search lives in Computation.
                var sw2 = Stopwatch.StartNew();
                for (int x = 0; x < textureSize; x++)
                    for (int y = 0; y < textureSize; y++)
                        Computation(pointCloud, result2, x, y);
                sw2.Stop();


                // Variant 3 ("Parallel.For Loop"): the outer x loop is handed to Parallel.For.
                var sw3 = Stopwatch.StartNew();

                Parallel.For(0, textureSize, x =>
                {
                    // Each invocation only writes cells result3[x, y] for its own x.
                    for (int y = 0; y < textureSize; y++)
                        Computation(pointCloud, result3, x, y);
                });
                sw3.Stop();

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // Variants 2 and 3 are both compared against variant 1.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine(); Console.WriteLine();
                // Block until the user presses Enter, then run another round.
                Console.ReadLine();
            }
        }

        /// <summary>
        /// Compares two result grids cell by cell over the textureSize x textureSize range.
        /// </summary>
        /// <param name="lhs">First grid.</param>
        /// <param name="rhs">Second grid.</param>
        /// <returns>false at the first cell whose values are not Equals; true otherwise.</returns>
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    // Vector3 does not override Equals, so this is the default struct comparison.
                    if (!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        /// <summary>
        /// Finds the point in <paramref name="pointCloud"/> with the smallest squared distance
        /// to (x, y, 0) and stores it in result[x, y].
        /// </summary>
        /// <param name="pointCloud">Candidate points; indices 0 .. numPoints - 1 are read.</param>
        /// <param name="result">Grid that receives the winning point at [x, y].</param>
        /// <param name="x">First grid index, also the target's x coordinate.</param>
        /// <param name="y">Second grid index, also the target's y coordinate.</param>
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Start with the first point as the best candidate.
            var nearestV3 = pointCloud[0];
            var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

            for (int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                // Strict '<': on a tie the earlier point is kept.
                if (currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        /// <summary>
        /// Three-float value type used both for the cloud points and for the grid targets.
        /// </summary>
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            /// <summary>Creates a vector from explicit components.</summary>
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            /// <summary>
            /// Creates a random vector: each component is an independent rnd.NextDouble()
            /// value multiplied by <paramref name="randomDistance"/>.
            /// </summary>
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            /// <summary>Component-wise difference a - b.</summary>
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }

            /// <summary>Sum of the squared components (no square root is taken).</summary>
            public float sqrMagnitude()
            {
                return x * x + y * y + z * z;
            }

            /// <summary>
            /// Despite the name, returns the squared distance from this vector to
            /// <paramref name="point"/>: the squared magnitude of (this - point).
            /// </summary>
            public float DistanceToPoint(Vector3 point)
            {
                return (this - point).sqrMagnitude();
            }
        }
    }
}

UPDATE: Thanks to the efforts of Drew Marsh we now have this super optimized version that inlines all the V3 operations.

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace TextureFromPoints
{
    /// <summary>
    /// Revised benchmark: the three timed variants live in their own methods, and all Vector3
    /// arithmetic (subtraction and squared length) is written out by hand at each use site
    /// instead of going through operator or method calls.
    /// </summary>
    class RevisedProgram
    {
        // Number of random points searched for every grid cell.
        const int numPoints = 700;
        // Side length of the square result grid; also passed as the scale for random coordinates.
        const int textureSize = 1024;

        // Created without an explicit seed. Read by the single-argument Vector3 constructor below.
        static Random rnd = new Random();

        /// <summary>
        /// Repeats the benchmark forever: builds a fresh point cloud, runs the three timed
        /// variants, prints their elapsed milliseconds, checks that the result grids agree,
        /// then waits for a line of console input.
        /// </summary>
        static void Main(string[] args)
        {
            while (true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Fill the cloud using the random constructor, scaled by textureSize.
                for (int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One result grid per variant so they can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each helper returns the stopped Stopwatch that timed its run.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);

                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // Variants 2 and 3 are both compared against variant 1.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                // Block until the user presses Enter, then run another round.
                Console.ReadLine();
            }
        }

        /// <summary>
        /// Fills <paramref name="result3"/> by calling Computation for every cell, with the
        /// outer x loop driven by Parallel.For.
        /// </summary>
        /// <returns>The stopped Stopwatch covering the whole Parallel.For call.</returns>
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            Parallel.For(0, textureSize, x =>
            {
                // Each invocation only writes cells result3[x, y] for its own x.
                for (int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        /// <summary>
        /// Fills <paramref name="result2"/> by calling Computation for every cell from two
        /// nested sequential loops.
        /// </summary>
        /// <returns>The stopped Stopwatch covering both loops.</returns>
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        /// <summary>
        /// Fills <paramref name="result1"/> with the nearest-point search written directly in
        /// the loop body (same statements as Computation, no call per cell).
        /// </summary>
        /// <returns>The stopped Stopwatch covering both loops.</returns>
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                {
                    var targetPos = new Vector3(x, y, 0);
                    // Start with the first point as the best candidate.
                    var nearestV3 = pointCloud[0];
                    // Hand-written component-wise difference (nearestV3 - targetPos).
                    Vector3 temp1 = new Vector3(nearestV3.x - targetPos.x, nearestV3.y - targetPos.y, nearestV3.z - targetPos.z);
                    // Hand-written squared length of that difference.
                    var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

                    for (int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        Vector3 temp2 = new Vector3(currentV3.x - targetPos.x, currentV3.y - targetPos.y, currentV3.z - targetPos.z);
                        var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                        // Strict '<': on a tie the earlier point is kept.
                        if (currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        /// <summary>
        /// Compares two result grids cell by cell over the textureSize x textureSize range.
        /// </summary>
        /// <returns>false at the first cell whose values are not Equals; true otherwise.</returns>
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    // Vector3 does not override Equals, so this is the default struct comparison.
                    if (!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        /// <summary>
        /// Finds the point in <paramref name="pointCloud"/> with the smallest squared distance
        /// to (x, y, 0) and stores it in result[x, y]. The distance arithmetic is written out
        /// by hand rather than calling Vector3 members.
        /// </summary>
        /// <param name="pointCloud">Candidate points; indices 0 .. numPoints - 1 are read.</param>
        /// <param name="result">Grid that receives the winning point at [x, y].</param>
        /// <param name="x">First grid index, also the target's x coordinate.</param>
        /// <param name="y">Second grid index, also the target's y coordinate.</param>
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Start with the first point as the best candidate.
            var nearestV3 = pointCloud[0];
            // Hand-written component-wise difference (nearestV3 - targetPos).
            Vector3 temp1 = new Vector3(nearestV3.x - targetPos.x, nearestV3.y - targetPos.y, nearestV3.z - targetPos.z);

            // Hand-written squared length of that difference.
            var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

            for (int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                Vector3 temp2 = new Vector3(currentV3.x - targetPos.x, currentV3.y - targetPos.y, currentV3.z - targetPos.z);
                var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                // Strict '<': on a tie the earlier point is kept.
                if (currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }


        /// <summary>
        /// Three-float value type. This version has only fields and constructors; it defines
        /// no operators or distance methods.
        /// </summary>
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            /// <summary>Creates a vector from explicit components.</summary>
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            /// <summary>
            /// Creates a random vector: each component is an independent rnd.NextDouble()
            /// value multiplied by <paramref name="randomDistance"/>.
            /// </summary>
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }
        }
    }
}

And it gives the following results:

x86

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 3820ms
For Loop: 3962ms
Parallel.For Loop: 1681ms

x64

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 10978ms
For Loop: 10924ms
Parallel.For Loop: 3073ms

So the good news is that we can drastically increase the performance of this code - and get the single threaded version to be operating at a speed somewhat in keeping with its parallel cousin.

The bad news is that this means ditching x64 entirely and manually in-lining all math.

At this stage, I'm very disappointed in the performance of the compilers - I expected them to be much better.

Conclusion

This is fubar and sad... and while we don't really know why, we can make an educated guess that it is caused by a stupid compiler (or compilers). Going from 24s to 3.8s simply by changing the compiler from x64 to x86 and doing some manual in-lining is not what I'd expect. However, I've finished off the proof of concept I was writing, and thanks to a simple spatial hash I can compute a 1024 by 1024 image with 70,000 'points' in 0.7s - ~340000% faster than that of my original x64 scenario and with no threading or in-lining. As such I've accepted an answer - the immediate need is gone, though I'll still be looking into the issue.

The code is available here and here - it generates a nice Voronoi diagram as a side effect :P

like image 524
NPSF3000 Avatar asked Jul 20 '12 03:07

NPSF3000


2 Answers

All data from 8 core i7, Win7, x64

It's surprising that you get 5x for sure. One problem with this test as you've written it is that you've put all three approaches in your Main method. That forces the compiler to create, and keep synced, a lot of gobbledygook to fulfill the needs of the closure used in the Parallel.For, and that gets in the way of the inline method. If you break out the work as follows you will see significantly faster performance in all three implementations... for x86 at least:

Before x86:

For Loop (Inline): 24313ms 
For Loop: 25236ms 
Parallel.For Loop: 3840ms

After x86:

For Loop (Inline): 13007ms
For Loop: 13013ms
Parallel.For Loop: 2208ms

So, looking at my x86 Parallel.For results, you see it scales at about ~5.9x and each version is much quicker when isolated.

Next, it's interesting to note that there's absolutely no gain in x64 after this same change. In fact, it ended just a little higher in each run on 2 of 3 tests consistently.

Before x64

For Loop (Inline): 24222ms
For Loop: 25197ms
Parallel.For Loop: 3810ms

After x64

For Loop (Inline): 25302ms
For Loop: 25209ms
Parallel.For Loop: 3821ms

I don't have a direct answer for why x64 would be so bad, other than the fact that people consistently come up with code like this that makes the x64 JIT look bad, so maybe someone else can chime in on that.

That said, I do have one other thing you might want to consider looking into in such an implementation: cache line invalidation. There is an awesome MSDN article here written by @StephenToub that explains what this is all about. The TL;DR of it is that, because all your data is stored in one array and different cores with different local (L2) caches are going to modify parts of that array, they have to synchronize the data with the other cores with whom they overlap. If the sections the different cores are working on are too close together, you're going to end up with a lot of these synchronizations, which can eat into your parallel gains. The article shows a technique where you actually allocate extra space in your working array, sufficient to separate the actual sections containing the data you're going to work on, so that when those cores work on the data they don't have to invalidate the other cores. That may be part of why the parallel version scales at about 5.9x of the for loop rather than being closer to 8x. I would bet that if you put in the work to address any cache line invalidation you could squeeze another 10%+ out of it. Just remember there's always some overhead in setting up and coordinating the parallel work, so you'll never get 100% perfection.

Here's the revised version of your program with each approach factored into separate methods:

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace TextureFromPoints
{
    /// <summary>
    /// Benchmark with each of the three timed variants factored into its own method. The
    /// distance calculation still goes through Vector3.DistanceToPoint in every variant.
    /// </summary>
    class RevisedProgram
    {
        // Number of random points searched for every grid cell.
        const int numPoints = 700;
        // Side length of the square result grid; also passed as the scale for random coordinates.
        const int textureSize = 1024;

        // Created without an explicit seed. Read by the single-argument Vector3 constructor below.
        static Random rnd = new Random();

        /// <summary>
        /// Repeats the benchmark forever: builds a fresh point cloud, runs the three timed
        /// variants, prints their elapsed milliseconds, checks that the result grids agree,
        /// then waits for a line of console input.
        /// </summary>
        static void Main(string[] args)
        {
            while(true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Fill the cloud using the random constructor, scaled by textureSize.
                for(int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One result grid per variant so they can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each helper returns the stopped Stopwatch that timed its run.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);


                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // Variants 2 and 3 are both compared against variant 1.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                // Block until the user presses Enter, then run another round.
                Console.ReadLine();
            }
        }

        /// <summary>
        /// Fills <paramref name="result3"/> by calling Computation for every cell, with the
        /// outer x loop driven by Parallel.For.
        /// </summary>
        /// <returns>The stopped Stopwatch covering the whole Parallel.For call.</returns>
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            Parallel.For(0, textureSize, x =>
            {
                // Each invocation only writes cells result3[x, y] for its own x.
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        /// <summary>
        /// Fills <paramref name="result2"/> by calling Computation for every cell from two
        /// nested sequential loops.
        /// </summary>
        /// <returns>The stopped Stopwatch covering both loops.</returns>
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        /// <summary>
        /// Fills <paramref name="result1"/> with the nearest-point search written directly in
        /// the loop body (same statements as Computation, no Computation call per cell).
        /// </summary>
        /// <returns>The stopped Stopwatch covering both loops.</returns>
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                {
                    var targetPos = new Vector3(x, y, 0);
                    // Start with the first point as the best candidate.
                    var nearestV3 = pointCloud[0];
                    var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

                    for(int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                        // Strict '<': on a tie the earlier point is kept.
                        if(currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        /// <summary>
        /// Compares two result grids cell by cell over the textureSize x textureSize range.
        /// </summary>
        /// <returns>false at the first cell whose values are not Equals; true otherwise.</returns>
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    // Vector3 does not override Equals, so this is the default struct comparison.
                    if(!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        /// <summary>
        /// Finds the point in <paramref name="pointCloud"/> with the smallest squared distance
        /// to (x, y, 0) and stores it in result[x, y].
        /// </summary>
        /// <param name="pointCloud">Candidate points; indices 0 .. numPoints - 1 are read.</param>
        /// <param name="result">Grid that receives the winning point at [x, y].</param>
        /// <param name="x">First grid index, also the target's x coordinate.</param>
        /// <param name="y">Second grid index, also the target's y coordinate.</param>
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Start with the first point as the best candidate.
            var nearestV3 = pointCloud[0];
            var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

            for(int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                // Strict '<': on a tie the earlier point is kept.
                if(currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        /// <summary>
        /// Three-float value type used both for the cloud points and for the grid targets.
        /// </summary>
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            /// <summary>Creates a vector from explicit components.</summary>
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            /// <summary>
            /// Creates a random vector: each component is an independent rnd.NextDouble()
            /// value multiplied by <paramref name="randomDistance"/>.
            /// </summary>
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            /// <summary>Component-wise difference a - b.</summary>
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }

            /// <summary>Sum of the squared components (no square root is taken).</summary>
            public float sqrMagnitude()
            {
                return x * x + y * y + z * z;
            }

            /// <summary>
            /// Despite the name, returns the squared distance from this vector to
            /// <paramref name="point"/>: the squared magnitude of (this - point).
            /// </summary>
            public float DistanceToPoint(Vector3 point)
            {
                return (this - point).sqrMagnitude();
            }
        }
    }
}

Update:

Based on what Feng Yuan pointed out about the methods not being inlined by the x64 JIT, you can change the program to do the calculations inline instead and get better performance out of the x64 version than the x86 version. This obviously sucks, but this is the kind of thing that I've seen the x64 JIT destroy before. Here's the new numbers:

After inlining x64:

For Loop (Inline): 19032ms
For Loop: 19209ms
Parallel.For Loop: 3015ms

Inlined version of the code:

using System;
using System.Diagnostics;
using System.Threading.Tasks;

namespace TextureFromPoints
{
    /// <summary>
    /// Benchmark variant in which the squared-length arithmetic is written out by hand at each
    /// use site, while the vector subtraction still goes through Vector3's operator -.
    /// </summary>
    class RevisedProgram
    {
        // Number of random points searched for every grid cell.
        const int numPoints = 700;
        // Side length of the square result grid; also passed as the scale for random coordinates.
        const int textureSize = 1024;

        // Created without an explicit seed. Read by the single-argument Vector3 constructor below.
        static Random rnd = new Random();

        /// <summary>
        /// Repeats the benchmark forever: builds a fresh point cloud, runs the three timed
        /// variants, prints their elapsed milliseconds, checks that the result grids agree,
        /// then waits for a line of console input.
        /// </summary>
        static void Main(string[] args)
        {
            while(true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Fill the cloud using the random constructor, scaled by textureSize.
                for(int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One result grid per variant so they can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each helper returns the stopped Stopwatch that timed its run.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);


                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // Variants 2 and 3 are both compared against variant 1.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                // Block until the user presses Enter, then run another round.
                Console.ReadLine();
            }
        }

        /// <summary>
        /// Fills <paramref name="result3"/> by calling Computation for every cell, with the
        /// outer x loop driven by Parallel.For.
        /// </summary>
        /// <returns>The stopped Stopwatch covering the whole Parallel.For call.</returns>
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            Parallel.For(0, textureSize, x =>
            {
                // Each invocation only writes cells result3[x, y] for its own x.
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        /// <summary>
        /// Fills <paramref name="result2"/> by calling Computation for every cell from two
        /// nested sequential loops.
        /// </summary>
        /// <returns>The stopped Stopwatch covering both loops.</returns>
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        /// <summary>
        /// Fills <paramref name="result1"/> with the nearest-point search written directly in
        /// the loop body (same statements as Computation, no Computation call per cell).
        /// </summary>
        /// <returns>The stopped Stopwatch covering both loops.</returns>
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                {
                    var targetPos = new Vector3(x, y, 0);
                    // Start with the first point as the best candidate.
                    var nearestV3 = pointCloud[0];
                    // Subtraction uses operator -; the squared length is written out by hand.
                    Vector3 temp1 = nearestV3 - targetPos;
                    var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

                    for(int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        Vector3 temp2 = currentV3 - targetPos;
                        var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                        // Strict '<': on a tie the earlier point is kept.
                        if(currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        /// <summary>
        /// Compares two result grids cell by cell over the textureSize x textureSize range.
        /// </summary>
        /// <returns>false at the first cell whose values are not Equals; true otherwise.</returns>
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    // Vector3 does not override Equals, so this is the default struct comparison.
                    if(!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        /// <summary>
        /// Finds the point in <paramref name="pointCloud"/> with the smallest squared distance
        /// to (x, y, 0) and stores it in result[x, y].
        /// </summary>
        /// <param name="pointCloud">Candidate points; indices 0 .. numPoints - 1 are read.</param>
        /// <param name="result">Grid that receives the winning point at [x, y].</param>
        /// <param name="x">First grid index, also the target's x coordinate.</param>
        /// <param name="y">Second grid index, also the target's y coordinate.</param>
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Start with the first point as the best candidate.
            var nearestV3 = pointCloud[0];
            // Subtraction uses operator -; the squared length is written out by hand.
            Vector3 temp1 = nearestV3 - targetPos;
            var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

            for(int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                Vector3 temp2 = currentV3 - targetPos;
                var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                // Strict '<': on a tie the earlier point is kept.
                if(currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        /// <summary>
        /// Despite the name, returns the squared distance between <paramref name="vector"/> and
        /// <paramref name="point"/> (squared length of vector - point).
        /// </summary>
        // NOTE(review): nothing in this class calls this helper; Inline and Computation
        // repeat the same arithmetic at their use sites instead.
        private static float DistanceToPoint(Vector3 vector, Vector3 point)
        {
            Vector3 final = vector - point;

            return final.x * final.x + final.y * final.y + final.z * final.z;
        }

        /// <summary>
        /// Three-float value type with constructors and a subtraction operator; it has no
        /// length or distance methods in this version.
        /// </summary>
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            /// <summary>Creates a vector from explicit components.</summary>
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            /// <summary>
            /// Creates a random vector: each component is an independent rnd.NextDouble()
            /// value multiplied by <paramref name="randomDistance"/>.
            /// </summary>
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            /// <summary>Component-wise difference a - b.</summary>
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }
        }
    }
}
like image 57
Drew Marsh Avatar answered Sep 22 '22 23:09

Drew Marsh


The struct is still 12 bytes on 64-bit system.

64-bit is slower due to no inlining for DistanceToPoint

 2     0 [  0] TextureFromPoints.Program+Vector3.DistanceToPoint(Vector3)
23     0 [  0] Texture!TextureFromPoints.Program+Vector3.DistanceToPoint(Vector3)
22     0 [  1]   Texture!TextureFromPoints.Program+Vector3.op_Subtraction(Vector3, Vector3)
30    22 [  0] Texture!TextureFromPoints.Program+Vector3.DistanceToPoint(Vector3)
10     0 [  1]   Texture!TextureFromPoints.Program+Vector3.sqrMagnitude()
33    32 [  0] Texture!TextureFromPoints.Program+Vector3.DistanceToPoint(Vector3)

On a 32-bit system, only sqrMagnitude is a function call; DistanceToPoint and op_Subtraction are inlined.

like image 25
Feng Yuan Avatar answered Sep 20 '22 23:09

Feng Yuan