I've noticed a very significant (~15x) drop in performance when using Math.Round to convert double to int while targeting x64 compared to x86. I tested it on 64 bit Windows on Core i7 3770K. Can anyone reproduce it? Is there any good reason why this is the case? Maybe some weird boundary conditions?
Just for reference, I compared Math.Round (Test1) with two approximations: a conditional cast (Test2) and the 6755399441055744 magic-number trick (Test3).
Running times are:
---------------------------
|       | x86    | x64    |
|-------+--------+--------|
| Test1 | 0,0662 | 0,9975 |
| Test2 | 0,1517 | 0,1513 |
| Test3 | 0,1966 | 0,0978 |
---------------------------
Here is the benchmark code:
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
namespace MathRoundTester
{
    /// <summary>
    /// Console micro-benchmark that times three ways of converting a
    /// <see cref="double"/> to an <see cref="int"/>: <c>Math.Round</c> (Test1),
    /// a sign-dependent add-half-and-cast (Test2) and a magic-number addition
    /// read back through an overlapping struct (Test3).
    /// </summary>
    class Program
    {
        /// <summary>Number of times each test walks over the whole input array.</summary>
        private const int IterationCount = 1000000;

        /// <summary>
        /// Accumulates every test result and is printed at the end of <see cref="Main"/>,
        /// so the computed values are observably used.
        /// </summary>
        private static int dummy;

        /// <summary>
        /// Fills an array with 100 pseudo-random doubles (fixed seed 0), runs every test
        /// once untimed, then runs and times each test, and finally prints the accumulator
        /// and waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            var rand = new Random(0);
            var data = new double[100];
            for (int index = 0; index < data.Length; index++)
            {
                data[index] = rand.NextDouble() * int.MaxValue * 2 +
                              int.MinValue + rand.NextDouble();
            }

            Func<double[], int>[] candidates = { Test1, Test2, Test3 };

            // Untimed first pass over every candidate (presumably a JIT warm-up).
            foreach (Func<double[], int> candidate in candidates)
            {
                dummy ^= candidate(data);
            }

            // Timed pass, one output line per candidate, in declaration order.
            foreach (Func<double[], int> candidate in candidates)
            {
                RecordTime(data, candidate);
            }

            Console.WriteLine(dummy);
            Console.Read();
        }

        /// <summary>
        /// Forces a full garbage collection, then runs <paramref name="action"/> once on
        /// <paramref name="data"/> and writes the elapsed time in seconds ("F4" format)
        /// to the console.
        /// </summary>
        private static void RecordTime(double[] data, Func<double[], int> action)
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();

            var timer = new Stopwatch();
            timer.Start();
            dummy ^= action(data);
            timer.Stop();

            double seconds = timer.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(seconds.ToString("F4"));
        }

        /// <summary>
        /// XORs together <c>(int)Math.Round(x)</c> for every element of
        /// <paramref name="data"/>, repeated <see cref="IterationCount"/> times.
        /// </summary>
        private static int Test1(double[] data)
        {
            int acc = 0;
            for (int pass = 0; pass < IterationCount; pass++)
            {
                foreach (double x in data)
                {
                    acc ^= (int)Math.Round(x);
                }
            }

            return acc;
        }

        /// <summary>
        /// Same accumulation as <see cref="Test1"/>, but rounds by adding 0.5 (positive
        /// values) or subtracting 0.5 (all other values) and truncating with a cast.
        /// </summary>
        private static int Test2(double[] data)
        {
            int acc = 0;
            for (int pass = 0; pass < IterationCount; pass++)
            {
                foreach (double x in data)
                {
                    if (x > 0)
                    {
                        acc ^= (int)(x + 0.5);
                    }
                    else
                    {
                        acc ^= (int)(x - 0.5);
                    }
                }
            }

            return acc;
        }

        /// <summary>
        /// Explicit-layout struct whose <see cref="Double"/> and <see cref="Int"/> fields
        /// both start at offset 0, so <see cref="Int"/> reads the first four bytes of the
        /// stored double.
        /// </summary>
        [StructLayout(LayoutKind.Explicit)]
        private struct DoubleIntUnion
        {
            [FieldOffset(0)]
            public double Double;

            [FieldOffset(0)]
            public int Int;

            public DoubleIntUnion(double a)
            {
                // Both fields must be assigned in a struct constructor; Double is
                // written last so it determines the final bit pattern.
                Int = 0;
                Double = a;
            }
        }

        /// <summary>
        /// Same accumulation as <see cref="Test1"/>, but rounds by adding the constant
        /// 6755399441055744.0 and reading the overlapping int of the sum.
        /// </summary>
        private static int Test3(double[] data)
        {
            const double Magic = 6755399441055744.0;

            int acc = 0;
            for (int pass = 0; pass < IterationCount; pass++)
            {
                foreach (double x in data)
                {
                    var overlay = new DoubleIntUnion(x + Magic);
                    acc ^= overlay.Int;
                }
            }

            return acc;
        }
    }
}
Update 2016-11-23:
Some time after AndreyAkinshin kindly posted a question on the dotnet/coreclr repo, it was added to the 1.2.0 milestone. So it seems that this issue is just an oversight and will be fixed.
Let's look at the asm of (int)Math.Round(data[j]).
LegacyJIT-x86:
01172EB0 fld qword ptr [eax+edi*8+8]
01172EB4 fistp dword ptr [ebp-14h]
RyuJIT-x64:
`d7350617 c4e17b1044d010 vmovsd xmm0,qword ptr [rax+rdx*8+10h]
`d735061e e83dce605f call clr!COMDouble::Round (`3695d460)
`d7350623 c4e17b2ce8 vcvttsd2si ebp,xmm0
Disassembly of clr!COMDouble::Round:
clr!COMDouble::Round:
`3695d460 4883ec58 sub rsp,58h
`3695d464 0f29742440 movaps xmmword ptr [rsp+40h],xmm6
`3695d469 0f57c9 xorps xmm1,xmm1
`3695d46c f2480f2cc0 cvttsd2si rax,xmm0
`3695d471 0f297c2430 movaps xmmword ptr [rsp+30h],xmm7
`3695d476 0f28f0 movaps xmm6,xmm0
`3695d479 440f29442420 movaps xmmword ptr [rsp+20h],xmm8
`3695d47f f2480f2ac8 cvtsi2sd xmm1,rax
`3695d484 660f2ec1 ucomisd xmm0,xmm1
`3695d488 7a17 jp clr!COMDouble::Round+0x41 (`3695d4a1)
`3695d48a 7515 jne clr!COMDouble::Round+0x41 (`3695d4a1)
`3695d48c 0f28742440 movaps xmm6,xmmword ptr [rsp+40h]
`3695d491 0f287c2430 movaps xmm7,xmmword ptr [rsp+30h]
`3695d496 440f28442420 movaps xmm8,xmmword ptr [rsp+20h]
`3695d49c 4883c458 add rsp,58h
`3695d4a0 c3 ret
`3695d4a1 440f28c0 movaps xmm8,xmm0
`3695d4a5 f2440f5805c23a7100 addsd xmm8,mmword ptr [clr!_real (`37070f70)] ds:`37070f70=3fe0000000000000
`3695d4ae 410f28c0 movaps xmm0,xmm8
`3695d4b2 e821000000 call clr!floor (`3695d4d8)
`3695d4b7 66410f2ec0 ucomisd xmm0,xmm8
`3695d4bc 0f28f8 movaps xmm7,xmm0
`3695d4bf 7a06 jp clr!COMDouble::Round+0x67 (`3695d4c7)
`3695d4c1 0f8465af3c00 je clr! ?? ::FNODOBFM::`string'+0xdd8c4 (`36d2842c)
`3695d4c7 0f28ce movaps xmm1,xmm6
`3695d4ca 0f28c7 movaps xmm0,xmm7
`3695d4cd ff1505067000 call qword ptr [clr!_imp__copysign (`3705dad8)]
`3695d4d3 ebb7 jmp clr!COMDouble::Round+0x2c (`3695d48c)
As you can see, LegacyJIT-x86 uses an extremely fast fld-fistp pair; according to the Instruction tables by Agner Fog, we have the following numbers for Haswell:
Instruction | Latency | Reciprocal throughput
------------|---------|----------------------
FLD m32/64 | 3 | 0.5
FIST(P) m | 7 | 1
RyuJIT-x64 directly calls clr!COMDouble::Round (LegacyJIT-x64 does the same). You can find the source code for this method in the dotnet/coreclr repo. If you are working with release-1.0.0, you need floatnative.cpp:
#if defined(_TARGET_X86_)
// x86-only implementation of COMDouble::Round.
// Declared naked, so the body consists solely of the three x87 instructions
// below: the argument d is read from the stack and the rounded value is left
// on the x87 register stack.
__declspec(naked)
double __fastcall COMDouble::Round(double d)
{
LIMITED_METHOD_CONTRACT;
__asm {
// Push the 8-byte argument d (located just above the return address) onto the x87 stack.
fld QWORD PTR [ESP+4]
// Round ST(0) to an integral value according to the current x87 rounding mode.
frndint
// Return and pop the 8 bytes of stack occupied by d.
ret 8
}
}
#else // !defined(_TARGET_X86_)
// Non-x86 implementation of COMDouble::Round.
// Returns d rounded to an integral value (as a double): integral inputs are
// returned unchanged, exact ties are resolved to the even neighbour, and the
// result carries the sign of d.
FCIMPL1_V(double, COMDouble::Round, double d)
FCALL_CONTRACT;
double tempVal;
double flrTempVal;
// If the number has no fractional part do nothing
// This shortcut is necessary to workaround precision loss in borderline cases on some platforms
if ( d == (double)(__int64)d )
return d;
// Shift by one half so that floor() below yields the nearest integer.
tempVal = (d+0.5);
flrTempVal = floor(tempVal);
// If d + 0.5 is itself integral, we had a number that was equally close to 2 integers.
// We need to return the even one.
if (flrTempVal==tempVal) {
// A candidate that is not a multiple of 2 is odd: step down to the even neighbour.
if (0 != fmod(tempVal, 2.0)) {
flrTempVal -= 1.0;
}
}
// Give the result the sign of the original input d.
flrTempVal = _copysign(flrTempVal, d);
return flrTempVal;
FCIMPLEND
#endif // defined(_TARGET_X86_)
If you are working with the master branch, you can find similar code in floatdouble.cpp:
// COMDouble::Round: returns x rounded to an integral value (as a double).
// Integral inputs are returned unchanged, exact ties are resolved to the even
// neighbour, and the result carries the sign of x.
FCIMPL1_V(double, COMDouble::Round, double x)
FCALL_CONTRACT;
// If the number has no fractional part do nothing
// This shortcut is necessary to workaround precision loss in borderline cases on some platforms
if (x == (double)((INT64)x)) {
return x;
}
// Adding one half and taking floor() yields the nearest integer.
double tempVal = (x + 0.5);
double flrTempVal = floor(tempVal);
// If x + 0.5 is itself integral, we had a number that was equally close to 2 integers.
// We need to return the even one, so an odd candidate is stepped down by one.
if ((flrTempVal == tempVal) && (fmod(tempVal, 2.0) != 0)) {
flrTempVal -= 1.0;
}
// Give the result the sign of the original input x.
return _copysign(flrTempVal, x);
FCIMPLEND
It seems that the full .NET Framework uses the same logic.
Thus, (int)Math.Round really works much faster on x86 than on x64 because of a difference in the internal implementations of different JIT compilers. Note that this behavior may change in the future.
By the way, you can write a small and reliable benchmark with the help of BenchmarkDotNet:
/// <summary>
/// BenchmarkDotNet benchmark measuring <c>(int)Math.Round(double)</c> under the
/// LegacyJIT-x86, LegacyJIT-x64 and RyuJIT-x64 job configurations.
/// </summary>
[LegacyJitX86Job]
[LegacyJitX64Job]
[RyuJitX64Job]
public class MathRoundBenchmarks
{
    /// <summary>Number of input values; also the operations-per-invoke divisor.</summary>
    private const int N = 100;

    private double[] data;

    /// <summary>
    /// Fills <see cref="data"/> with <see cref="N"/> pseudo-random doubles generated
    /// from a fixed seed (0).
    /// </summary>
    [Setup]
    public void Setup()
    {
        var generator = new Random(0);
        var values = new double[N];
        for (int index = 0; index < values.Length; index++)
        {
            values[index] = generator.NextDouble() * int.MaxValue * 2 +
                            int.MinValue + generator.NextDouble();
        }

        data = values;
    }

    /// <summary>
    /// Rounds every input value with <c>Math.Round</c>, casts it to <c>int</c> and
    /// XORs the results together; the combined value is returned so the work is
    /// observably used.
    /// </summary>
    [Benchmark(OperationsPerInvoke = N)]
    public int MathRound()
    {
        int acc = 0;
        foreach (double value in data)
        {
            acc ^= (int)Math.Round(value);
        }

        return acc;
    }
}
Results:
BenchmarkDotNet.Core=v0.9.9.0
OS=Microsoft Windows NT 6.2.9200.0
Processor=Intel(R) Core(TM) i7-4702MQ CPU 2.20GHz, ProcessorCount=8
Frequency=2143475 ticks, Resolution=466.5321 ns, Timer=TSC
CLR=MS.NET 4.0.30319.42000, Arch=64-bit RELEASE [RyuJIT]
GC=Concurrent Workstation
JitModules=clrjit-v4.6.1586.0
Type=MathRoundBenchmarks Mode=Throughput
Method | Platform | Jit | Median | StdDev |
---------- |--------- |---------- |----------- |---------- |
MathRound | X64 | LegacyJit | 12.8640 ns | 0.2796 ns |
MathRound | X64 | RyuJit | 13.4390 ns | 0.4365 ns |
MathRound | X86 | LegacyJit | 1.0278 ns | 0.0373 ns |
Not an answer as such, but some code that others may find useful in performance critical areas on x64 systems depending on exact rounding requirements.
Performance times in ms for 100000000 operations are:
Round(x): 1112
Round(x,y): 2183
FastMath.Round(x): 155
FastMath.Round(x,y): 519
Code:
/// <summary>
/// Faster, approximate replacements for <c>Math.Round</c>.
/// Unlike <c>Math.Round</c>, midpoints are rounded toward positive infinity
/// (2.5 becomes 3, -2.5 becomes -2) instead of to the nearest even value.
/// </summary>
/// <remarks>
/// Known limitation of the add-half-and-floor technique: inputs a hair below a
/// midpoint (for example 0.49999999999999994) may round up because the addition
/// of 0.5 itself rounds.
/// </remarks>
public static class FastMath
{
    // 2^52. From this magnitude on, every double is already an integer and
    // adding 0.5 is no longer exact, so the rounding step must be skipped.
    private const double IntegralThreshold = 4503599627370496.0;

    // RoundLookup[i] == 10^i for i in [0, 14].
    private static readonly double[] RoundLookup = CreateRoundLookup();

    /// <summary>Builds the table of powers of ten used to scale values.</summary>
    private static double[] CreateRoundLookup()
    {
        double[] result = new double[15];
        for (int i = 0; i < result.Length; i++)
        {
            result[i] = Math.Pow(10, i);
        }

        return result;
    }

    /// <summary>
    /// Rounds <paramref name="value"/> to the nearest integral value, with
    /// midpoints rounded toward positive infinity.
    /// </summary>
    /// <param name="value">The number to round.</param>
    /// <returns>
    /// The rounded value. Values whose magnitude is at least 2^52 (and infinities)
    /// are returned unchanged; NaN yields NaN.
    /// </returns>
    public static double Round(double value)
    {
        // Without this guard an odd integer >= 2^52 would be bumped to its even
        // neighbour, because value + 0.5 is not representable and rounds to even.
        if (Math.Abs(value) >= IntegralThreshold)
        {
            return value;
        }

        return Math.Floor(value + 0.5);
    }

    /// <summary>
    /// Rounds <paramref name="value"/> to <paramref name="decimalPlaces"/> fractional
    /// digits, with midpoints rounded toward positive infinity.
    /// </summary>
    /// <param name="value">The number to round.</param>
    /// <param name="decimalPlaces">Number of fractional digits to keep, from 0 to 14.</param>
    /// <returns>
    /// The rounded value. If the scaled value has a magnitude of at least 2^52
    /// (including overflow to infinity), <paramref name="value"/> is returned unchanged.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="decimalPlaces"/> is negative or greater than 14.
    /// </exception>
    public static double Round(double value, int decimalPlaces)
    {
        // The unsigned comparison rejects negative values and values past the table in one test.
        if ((uint)decimalPlaces >= (uint)RoundLookup.Length)
        {
            throw new ArgumentOutOfRangeException(
                "decimalPlaces",
                decimalPlaces,
                "decimalPlaces must be between 0 and " + (RoundLookup.Length - 1) + ".");
        }

        double adjustment = RoundLookup[decimalPlaces];
        double scaled = value * adjustment;

        // At this magnitude the scaled value has no fractional part left to round.
        if (Math.Abs(scaled) >= IntegralThreshold)
        {
            return value;
        }

        return Math.Floor(scaled + 0.5) / adjustment;
    }
}
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With