I'm banging my head against the wall here, so I hope that some of you may be able to educate me. I was doing some performance benchmarks using BenchmarkDotNet and I ran into this weird case where it seems that declaring a member const
degrades performance considerably.
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System;
namespace PerfTest
{
[DisassemblyDiagnoser(printAsm: true, printSource: true)]
public class Test
{
private int[] data;
private int Threshold = 90;
private const int ConstThreshold = 90;
[GlobalSetup]
public void GlobalSetup()
{
data = new int[1000];
var random = new Random(42);
for (var i = 0; i < data.Length; i++)
{
data[i] = random.Next(100);
}
}
static void Main(string[] args)
{
var summary = BenchmarkRunner.Run<Test>();
}
[Benchmark(Baseline = true)]
public void ClampToMemberValue()
{
for (var i = 0; i < data.Length; i++)
{
if (data[i] > Threshold) data[i] = Threshold;
}
}
[Benchmark]
public void ClampToConstValue()
{
for (var i = 0; i < data.Length; i++)
{
if (data[i] > ConstThreshold) data[i] = ConstThreshold;
}
}
}
}
Notice that the only difference between the two test methods is whether they compare against a regular member variable or a const member.
According to BenchmarkDotNet using the const value is significantly slower and I don't understand why.
BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362
Intel Core i7-5820K CPU 3.30GHz (Broadwell), 1 CPU, 12 logical and 6 physical cores
.NET Core SDK=3.0.100
[Host] : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), 64bit RyuJIT
DefaultJob : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), 64bit RyuJIT
| Method | Mean | Error | StdDev | Ratio |
|------------------- |---------:|---------:|---------:|------:|
| ClampToMemberValue | 590.4 ns | 1.980 ns | 1.852 ns | 1.00 |
| ClampToConstValue | 724.6 ns | 4.184 ns | 3.709 ns | 1.23 |
Looking at the JIT compiled code doesn't explain it as far as I can tell. Here's the code for the two methods. The only difference is whether the compare is done against a register or a literal.
00007ff9`7f1b8500 PerfTest.Test.ClampToMemberValue()
for (var i = 0; i < data.Length; i++)
^^^^^^^^^
00007ff9`7f1b8504 33c0 xor eax,eax
for (var i = 0; i < data.Length; i++)
^^^^^^^^^^^^^^^
00007ff9`7f1b8506 488b5108 mov rdx,qword ptr [rcx+8]
00007ff9`7f1b850a 837a0800 cmp dword ptr [rdx+8],0
00007ff9`7f1b850e 7e2e jle 00007ff9`7f1b853e
00007ff9`7f1b8510 8b4910 mov ecx,dword ptr [rcx+10h]
if (data[i] > Threshold) data[i] = Threshold;
^^^^^^^^^^^^^^^^^^^^^^^^
00007ff9`7f1b8513 4c8bc2 mov r8,rdx
00007ff9`7f1b8516 458b4808 mov r9d,dword ptr [r8+8]
00007ff9`7f1b851a 413bc1 cmp eax,r9d
00007ff9`7f1b851d 7324 jae 00007ff9`7f1b8543
00007ff9`7f1b851f 4c63c8 movsxd r9,eax
00007ff9`7f1b8522 43394c8810 cmp dword ptr [r8+r9*4+10h],ecx
00007ff9`7f1b8527 7e0e jle 00007ff9`7f1b8537
if (data[i] > Threshold) data[i] = Threshold;
^^^^^^^^^^^^^^^^^^^^
00007ff9`7f1b8529 4c8bc2 mov r8,rdx
00007ff9`7f1b852c 448bc9 mov r9d,ecx
00007ff9`7f1b852f 4c63d0 movsxd r10,eax
00007ff9`7f1b8532 47894c9010 mov dword ptr [r8+r10*4+10h],r9d
for (var i = 0; i < data.Length; i++)
^^^
00007ff9`7f1b8537 ffc0 inc eax
00007ff9`7f1b8539 394208 cmp dword ptr [rdx+8],eax
00007ff9`7f1b853c 7fd5 jg 00007ff9`7f1b8513
}
^
00007ff9`7f1b853e 4883c428 add rsp,28h
and
00007ff9`7f1a8500 PerfTest.Test.ClampToConstValue()
for (var i = 0; i < data.Length; i++)
^^^^^^^^^
00007ff9`7f1a8504 33c0 xor eax,eax
for (var i = 0; i < data.Length; i++)
^^^^^^^^^^^^^^^
00007ff9`7f1a8506 488b5108 mov rdx,qword ptr [rcx+8]
00007ff9`7f1a850a 837a0800 cmp dword ptr [rdx+8],0
00007ff9`7f1a850e 7e2d jle 00007ff9`7f1a853d
if (data[i] > ConstThreshold) data[i] = ConstThreshold;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
00007ff9`7f1a8510 488bca mov rcx,rdx
00007ff9`7f1a8513 448b4108 mov r8d,dword ptr [rcx+8]
00007ff9`7f1a8517 413bc0 cmp eax,r8d
00007ff9`7f1a851a 7326 jae 00007ff9`7f1a8542
00007ff9`7f1a851c 4c63c0 movsxd r8,eax
00007ff9`7f1a851f 42837c81105a cmp dword ptr [rcx+r8*4+10h],5Ah
00007ff9`7f1a8525 7e0f jle 00007ff9`7f1a8536
if (data[i] > ConstThreshold) data[i] = ConstThreshold;
^^^^^^^^^^^^^^^^^^^^^^^^^
00007ff9`7f1a8527 488bca mov rcx,rdx
00007ff9`7f1a852a 4c63c0 movsxd r8,eax
00007ff9`7f1a852d 42c74481105a000000 mov dword ptr [rcx+r8*4+10h],5Ah
for (var i = 0; i < data.Length; i++)
^^^
00007ff9`7f1a8536 ffc0 inc eax
00007ff9`7f1a8538 394208 cmp dword ptr [rdx+8],eax
00007ff9`7f1a853b 7fd3 jg 00007ff9`7f1a8510
}
^
00007ff9`7f1a853d 4883c428 add rsp,28h
I'm sure there's something I've overlooked, but I can't grok it at this point so I'm looking for input on what can explain this.
Looking at https://benchmarkdotnet.org/articles/features/setup-and-cleanup.html
I believe you should be using [IterationSetup]
instead of [GlobalSetup]
. With global setup, the data
is changed once and then the changed data
is reused across benchmarks.
So, I've changed the code to use proper initialization. Changed variables to make checks more frequent. And added few more variations.
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System;
namespace PerfTest
{
[DisassemblyDiagnoser(printAsm: true, printSource: true)]
public class Test
{
private int[] data;
private int[] data_iteration;
private int Threshold = 50;
private const int ConstThreshold = 50;
[GlobalSetup]
public void GlobalSetup()
{
data = new int[100000];
var random = new Random(42);
for (var i = 0; i < data.Length; i++)
{
data[i] = random.Next(100);
}
}
[IterationSetup]
public void IterationSetup()
{
data_iteration = new int[data.Length];
Array.Copy(data, data_iteration, data.Length);
}
static void Main(string[] args)
{
var summary = BenchmarkRunner.Run<Test>();
}
[Benchmark]
public void ClampToClassConstValue()
{
for (var i = 0; i < data_iteration.Length; i++)
{
if (data_iteration[i] > ConstThreshold) data_iteration[i] = ConstThreshold;
}
}
[Benchmark]
public void ClampToLocalConstValue()
{
const int ConstThresholdLocal = 50;
for (var i = 0; i < data_iteration.Length; i++)
{
if (data_iteration[i] > ConstThresholdLocal) data_iteration[i] = ConstThresholdLocal;
}
}
[Benchmark]
public void ClampToInlineValue()
{
for (var i = 0; i < data_iteration.Length; i++)
{
if (data_iteration[i] > 50) data_iteration[i] = 50;
}
}
[Benchmark]
public void ClampToLocalVariable()
{
var ThresholdLocal = 50;
for (var i = 0; i < data_iteration.Length; i++)
{
if (data_iteration[i] > ThresholdLocal) data_iteration[i] = ThresholdLocal;
}
}
[Benchmark(Baseline = true)]
public void ClampToMemberValue()
{
for (var i = 0; i < data_iteration.Length; i++)
{
if (data_iteration[i] > Threshold) data_iteration[i] = Threshold;
}
}
}
}
Results look more normal :
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.17134.1069 (1803/April2018Update/Redstone4)
Intel Core i7-8850H CPU 2.60GHz (Coffee Lake), 1 CPU, 12 logical and 6 physical cores
Frequency=2531250 Hz, Resolution=395.0617 ns, Timer=TSC
.NET Core SDK=3.0.100
[Host] : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
Job-INSHHX : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
InvocationCount=1 UnrollFactor=1
| Method | Mean | Error | StdDev | Median | Ratio | RatioSD |
|----------------------- |---------:|---------:|---------:|---------:|------:|--------:|
| ClampToClassConstValue | 391.5 us | 17.86 us | 17.54 us | 384.2 us | 1.02 | 0.05 |
| ClampToLocalConstValue | 399.6 us | 9.49 us | 11.66 us | 399.0 us | 1.05 | 0.07 |
| ClampToInlineValue | 384.1 us | 5.99 us | 5.00 us | 383.0 us | 1.00 | 0.06 |
| ClampToLocalVariable | 382.7 us | 3.60 us | 3.00 us | 382.0 us | 1.00 | 0.05 |
| ClampToMemberValue | 379.6 us | 8.48 us | 16.73 us | 371.8 us | 1.00 | 0.00 |
There doesn't seem to be any difference between different variants. Either everything is optimized or const int not optimized in any way in this scenario.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With