Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why is the generic implementation of Vector.Log so much slower than the non-generic implementations for me?

I've run some benchmarks on Math.Log, System.Numerics.Vector.Log, System.Runtime.Intrinsics.Vector128.Log, Vector256.Log and Vector512.Log and the results were pretty surprising to me. I was expecting the generic Vector.Log to perform similarly to its non-generic counterparts, however it was not only significantly slower than them, but even slower than Math.Log. From reading this https://devblogs.microsoft.com/dotnet/dotnet-8-hardware-intrinsics/ I had assumed that Vector<T> would internally make calls to one of the non-generic implementations. Can someone explain to me why Vector<T> is so slow?

I've included my benchmarks below for reference:

Intel Xeon Silver 4214R CPU 2.40GHz, 2 CPU, 48 logical and 24 physical cores  
Benchmark Process Environment Information:  
BenchmarkDotNet v0.13.10  
Runtime=.NET 9.0.8 (9.0.825.36511), X64 RyuJIT AVX2  
GC=Concurrent Workstation  
HardwareIntrinsics=AVX2,AES,BMI1,BMI2,FMA,LZCNT,PCLMUL,POPCNT  VectorSize=256
Method Mean Error StdDev Median
MathLog 777.2 us 14.34 us 13.42 us 773.1 us
GenericVectorLog 2,537.3 us 50.44 us 139.77 us 2,496.8 us
Vector128Log 407.8 us 7.16 us 6.70 us 403.8 us
Vector256Log 243.3 us 4.54 us 4.46 us 243.0 us
Vector512Log 429.4 us 8.43 us 10.66 us 427.7 us
/// <summary>
/// BenchmarkDotNet benchmarks that compute the natural logarithm of 100,000 doubles using
/// <c>Math.Log</c>, <c>Vector.Log</c>, <c>Vector128.Log</c>, <c>Vector256.Log</c> and <c>Vector512.Log</c>.
/// Every benchmark reads from the same input array and writes into the same output array.
/// </summary>
public class LogBenchmarks
{
    private readonly double[] inputArray = new double[100000];
    private readonly double[] outputArray = new double[100000];

    /// <summary>
    /// Fills the input array with pseudo-random values from a fixed seed so every run sees the same data.
    /// </summary>
    public LogBenchmarks()
    {
        var generator = new Random(0);
        int slot = 0;
        while (slot < inputArray.Length)
        {
            inputArray[slot] = generator.NextDouble();
            slot++;
        }
    }

    /// <summary>Scalar baseline: one <c>Math.Log</c> call per element.</summary>
    [Benchmark]
    public void MathLog()
    {
        int position = 0;
        while (position < inputArray.Length)
        {
            outputArray[position] = Math.Log(inputArray[position]);
            position++;
        }
    }

    /// <summary>
    /// Reinterprets both arrays as spans of <c>Vector&lt;double&gt;</c> and calls <c>Vector.Log</c> per vector.
    /// </summary>
    [Benchmark]
    public void GenericVectorLog()
    {
        ReadOnlySpan<double> rawInput = inputArray;
        Span<double> rawOutput = outputArray;

        // MemoryMarshal.Cast only exposes whole vectors; a trailing partial vector would not be visited.
        ReadOnlySpan<Vector<double>> source = MemoryMarshal.Cast<double, Vector<double>>(rawInput);
        Span<Vector<double>> destination = MemoryMarshal.Cast<double, Vector<double>>(rawOutput);

        for (int lane = 0; lane < source.Length; lane++)
        {
            destination[lane] = Vector.Log(source[lane]);
        }
    }

    /// <summary>
    /// Reinterprets both arrays as spans of <c>Vector128&lt;double&gt;</c> and calls <c>Vector128.Log</c> per vector.
    /// </summary>
    [Benchmark]
    public void Vector128Log()
    {
        ReadOnlySpan<double> rawInput = inputArray;
        Span<double> rawOutput = outputArray;

        ReadOnlySpan<Vector128<double>> source = MemoryMarshal.Cast<double, Vector128<double>>(rawInput);
        Span<Vector128<double>> destination = MemoryMarshal.Cast<double, Vector128<double>>(rawOutput);

        for (int lane = 0; lane < source.Length; lane++)
        {
            destination[lane] = Vector128.Log(source[lane]);
        }
    }

    /// <summary>
    /// Reinterprets both arrays as spans of <c>Vector256&lt;double&gt;</c> and calls <c>Vector256.Log</c> per vector.
    /// </summary>
    [Benchmark]
    public void Vector256Log()
    {
        ReadOnlySpan<double> rawInput = inputArray;
        Span<double> rawOutput = outputArray;

        ReadOnlySpan<Vector256<double>> source = MemoryMarshal.Cast<double, Vector256<double>>(rawInput);
        Span<Vector256<double>> destination = MemoryMarshal.Cast<double, Vector256<double>>(rawOutput);

        for (int lane = 0; lane < source.Length; lane++)
        {
            destination[lane] = Vector256.Log(source[lane]);
        }
    }

    /// <summary>
    /// Reinterprets both arrays as spans of <c>Vector512&lt;double&gt;</c> and calls <c>Vector512.Log</c> per vector.
    /// </summary>
    [Benchmark]
    public void Vector512Log()
    {
        ReadOnlySpan<double> rawInput = inputArray;
        Span<double> rawOutput = outputArray;

        ReadOnlySpan<Vector512<double>> source = MemoryMarshal.Cast<double, Vector512<double>>(rawInput);
        Span<Vector512<double>> destination = MemoryMarshal.Cast<double, Vector512<double>>(rawOutput);

        for (int lane = 0; lane < source.Length; lane++)
        {
            destination[lane] = Vector512.Log(source[lane]);
        }
    }
}
like image 205
user31260114 Avatar asked Oct 24 '25 15:10

user31260114


1 Answers

I posted a new answer because I finally managed to replicate the question's results. Something weird is going on, and it doesn't appear to have anything to do with SIMD. I managed to confirm the weird Log behavior, but only for .NET 9. It looks as if in .NET 9 the un-accelerated Log<T>(Vector<T> vector) is called instead of the accelerated Log(Vector<double> vector).

The source code for Log<T>(Vector<T> vector) and Log(Vector<double> vector) doesn't seem to have changed between .NET 10 and .NET 9.0.1 but only Log(Vector<double> vector) is accelerated :

    /// <summary>
    /// Computes the logarithm of each element with <c>T.Log</c>, one element at a time.
    /// </summary>
    /// <param name="vector">The vector whose elements are passed to <c>T.Log</c>.</param>
    /// <returns>A vector holding <c>T.Log</c> of the element at each index.</returns>
    internal static Vector<T> Log<T>(Vector<T> vector)
        where T : ILogarithmicFunctions<T>
    {
        // Every index in [0, Vector<T>.Count) is written by the loop below, so the result is left uninitialized here.
        Unsafe.SkipInit(out Vector<T> result);

        for (int index = 0; index < Vector<T>.Count; index++)
        {
            T value = T.Log(vector.GetElementUnsafe(index));
            result.SetElementUnsafe(index, value);
        }

        return result;
    }

    /// <inheritdoc cref="Vector128.Log(Vector128{double})" />
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector<double> Log(Vector<double> vector)
    {
        // When IsHardwareAccelerated is true, delegate to VectorMath.LogDouble, instantiated with
        // Vector<double> plus Vector<long> and Vector<ulong> as its other two type arguments.
        if (IsHardwareAccelerated)
        {
            return VectorMath.LogDouble<Vector<double>, Vector<long>, Vector<ulong>>(vector);
        }
        else
        {
            // Otherwise fall back to the generic Log<T> overload, instantiated for double.
            return Log<double>(vector);
        }
    }

I ran this simplified benchmark with a custom configuration to improve the chances of JIT optimization, but .NET 9 always behaves in a weird way:

/// <summary>
/// Simplified benchmarks that compute the natural logarithm of the values 1..4096 with
/// <c>Math.Log</c>, <c>Vector.Log</c> and <c>Vector256.Log</c>. Each benchmark allocates and
/// returns its own result array.
/// </summary>
[Config(typeof(Config))]
public class VectorLogBench
{
    private readonly double[] data = new double[4096];

    /// <summary>Fills the input with 1, 2, ..., 4096.</summary>
    public VectorLogBench()
    {
        int slot = 0;
        while (slot < data.Length)
        {
            data[slot] = slot + 1;
            slot++;
        }
    }

    /// <summary>Scalar baseline: one <c>Math.Log</c> call per element.</summary>
    /// <returns>A new array holding the logarithm of each input element.</returns>
    [Benchmark]
    public double[] LogMath()
    {
        double[] logs = new double[data.Length];

        int position = 0;
        while (position < data.Length)
        {
            logs[position] = Math.Log(data[position]);
            position++;
        }

        return logs;
    }

    /// <summary>Views the input as <c>Vector&lt;double&gt;</c> values and calls <c>Vector.Log</c> on each.</summary>
    /// <returns>A new array holding one result vector per input vector.</returns>
    [Benchmark]
    public Vector<double>[] LogV()
    {
        Span<Vector<double>> doubleVectors = MemoryMarshal.Cast<double, Vector<double>>(data);
        Vector<double>[] results = new Vector<double>[doubleVectors.Length];

        int i = 0;
        while (i < doubleVectors.Length)
        {
            results[i] = Vector.Log(doubleVectors[i]);
            i++;
        }

        return results;
    }

    /// <summary>Views the input as <c>Vector256&lt;double&gt;</c> values and calls <c>Vector256.Log</c> on each.</summary>
    /// <returns>A new array holding one result vector per input vector.</returns>
    [Benchmark]
    public Vector256<double>[] Log256()
    {
        Span<Vector256<double>> lanes = MemoryMarshal.Cast<double, Vector256<double>>(data);
        Vector256<double>[] logs = new Vector256<double>[lanes.Length];

        int position = 0;
        while (position < lanes.Length)
        {
            logs[position] = Vector256.Log(lanes[position]);
            position++;
        }

        return logs;
    }

}

The configuration runs with DOTNET_TieredPGO both enabled and disabled, and a very long warmup, just in case this triggers dynamic PGO.

/// <summary>
/// Benchmark configuration that registers four jobs: .NET 9 and .NET 10, each with
/// DOTNET_TieredPGO set to "0" and to "1". All jobs use 100 warmup iterations, and the
/// .NET 9 job with DOTNET_TieredPGO=0 is marked as the baseline.
/// </summary>
class Config : ManualConfig
{
    public Config()
    {
        AddJob(CreateJob("Non DPGO 9 ", CoreRuntime.Core90, "0", isBaseline: true));

        AddJob(CreateJob("DPGO 9 ", CoreRuntime.Core90, "1", isBaseline: false));

        AddJob(CreateJob("Non DPGO 10 ", CoreRuntime.Core10_0, "0", isBaseline: false));

        AddJob(CreateJob("DPGO 10 ", CoreRuntime.Core10_0, "1", isBaseline: false));

    }

    /// <summary>
    /// Builds one job from the default job, applying the settings in the same order for every job.
    /// </summary>
    /// <param name="id">Job id shown in the results table.</param>
    /// <param name="runtime">Runtime the job runs on.</param>
    /// <param name="tieredPgo">Value assigned to the DOTNET_TieredPGO environment variable.</param>
    /// <param name="isBaseline">Whether to mark the job as the baseline.</param>
    /// <returns>The configured job.</returns>
    private static Job CreateJob(string id, CoreRuntime runtime, string tieredPgo, bool isBaseline)
    {
        Job job = Job.Default.WithId(id);

        // Only the baseline job gets an explicit baseline setting; the others leave it unset.
        if (isBaseline)
        {
            job = job.WithBaseline(true);
        }

        return job
            .WithRuntime(runtime)
            .WithWarmupCount(100)
            .WithEnvironmentVariables(
                new EnvironmentVariable("DOTNET_TieredPGO", tieredPgo));
    }
}

The results show an unexpected huge delay for Vector.Log in .NET 9:


// * Summary *

BenchmarkDotNet v0.15.2, Windows 11 (10.0.22631.5699/23H2/2023Update/SunValley3)
Intel Core Ultra 9 185H 2.30GHz, 1 CPU, 22 logical and 16 physical cores
.NET SDK 10.0.100-preview.6.25358.103
  [Host]       : .NET 10.0.0 (10.0.25.35903), X64 RyuJIT AVX2
  DPGO 10      : .NET 10.0.0 (10.0.25.35903), X64 RyuJIT AVX2
  DPGO 9       : .NET 9.0.8 (9.0.825.36511), X64 RyuJIT AVX2
  Non DPGO 10  : .NET 10.0.0 (10.0.25.35903), X64 RyuJIT AVX2
  Non DPGO 9   : .NET 9.0.8 (9.0.825.36511), X64 RyuJIT AVX2

WarmupCount=100

| Method  | Job          | EnvironmentVariables | Runtime   | Mean      | Error    | StdDev    | Median    |
|-------- |------------- |--------------------- |---------- |----------:|---------:|----------:|----------:|
| LogMath | DPGO 10      | DOTNET_TieredPGO=1   | .NET 10.0 |  23.99 us | 0.769 us |  2.245 us |  23.83 us |
| LogV    | DPGO 10      | DOTNET_TieredPGO=1   | .NET 10.0 |  13.94 us | 0.691 us |  2.005 us |  13.90 us |
| Log256  | DPGO 10      | DOTNET_TieredPGO=1   | .NET 10.0 |  14.24 us | 0.800 us |  2.360 us |  13.98 us |
|         |              |                      |           |           |          |           |           |
| LogMath | Non DPGO 10  | DOTNET_TieredPGO=0   | .NET 10.0 |  21.83 us | 0.949 us |  2.708 us |  21.60 us |
| LogV    | Non DPGO 10  | DOTNET_TieredPGO=0   | .NET 10.0 |  13.37 us | 0.578 us |  1.631 us |  13.31 us |
| Log256  | Non DPGO 10  | DOTNET_TieredPGO=0   | .NET 10.0 |  12.42 us | 0.418 us |  1.212 us |  12.63 us |
|         |              |                      |           |           |          |           |           |
| LogMath | DPGO 9       | DOTNET_TieredPGO=1   | .NET 9.0  |  27.11 us | 1.156 us |  3.299 us |  27.05 us |
| LogV    | DPGO 9       | DOTNET_TieredPGO=1   | .NET 9.0  | 104.75 us | 3.794 us | 10.824 us | 106.54 us |
| Log256  | DPGO 9       | DOTNET_TieredPGO=1   | .NET 9.0  |  32.27 us | 2.485 us |  7.326 us |  28.94 us |
|         |              |                      |           |           |          |           |           |
| LogMath | Non DPGO 9   | DOTNET_TieredPGO=0   | .NET 9.0  |  20.35 us | 0.402 us |  0.523 us |  20.34 us |
| LogV    | Non DPGO 9   | DOTNET_TieredPGO=0   | .NET 9.0  | 116.76 us | 3.714 us | 10.717 us | 116.71 us |
| Log256  | Non DPGO 9   | DOTNET_TieredPGO=0   | .NET 9.0  |  29.23 us | 1.533 us |  4.447 us |  28.64 us |

The LogV method's IL, as generated by dotPeek, shows that the non-generic Vector.Log(Vector<double>) overload is called, and yet the results are weird:

  IL_0027: call         instance !0/*valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>*/& valuetype [System.Runtime]System.Span`1<valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>>::get_Item(int32)
  IL_002c: ldobj        valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>
  IL_0031: call         valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64> [System.Numerics.Vectors]System.Numerics.Vector::Log(valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>)
  IL_0036: stelem       valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>

The entire method's IL is

  .method public hidebysig instance valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>[]
    LogV() cil managed
  {
    .custom instance void [BenchmarkDotNet.Annotations]BenchmarkDotNet.Attributes.BenchmarkAttribute::.ctor(int32, string)
      = (
      ...
      )
      // int32(34) // 0x00000022

    .param [0]
      .custom instance void [System.Runtime]System.Runtime.CompilerServices.NullableAttribute::.ctor(unsigned int8[])
        = (01 00 02 00 00 00 01 00 00 00 ) // ..........
        // unsigned int8[2]
          /*( unsigned int8(1) // 0x01
          unsigned int8(0) // 0x00
           )*/
    .maxstack 4
    .locals init (
      [0] valuetype [System.Runtime]System.Span`1<valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>> doubleVectors,
      [1] valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>[] results,
      [2] int32 i
    )

    // [37 9 - 37 78]
    IL_0000: ldarg.0      // this
    IL_0001: ldfld        float64[] VectorLogBench::data
    IL_0006: call         valuetype [System.Runtime]System.Span`1<!0/*float64*/> valuetype [System.Runtime]System.Span`1<float64>::op_Implicit(!0/*float64*/[])
    IL_000b: call         valuetype [System.Runtime]System.Span`1<!!1/*valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>*/> [System.Runtime]System.Runtime.InteropServices.MemoryMarshal::Cast<float64, valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>>(valuetype [System.Runtime]System.Span`1<!!0/*float64*/>)
    IL_0010: stloc.0      // doubleVectors

    // [38 9 - 38 64]
    IL_0011: ldloca.s     doubleVectors
    IL_0013: call         instance int32 valuetype [System.Runtime]System.Span`1<valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>>::get_Length()
    IL_0018: newarr       valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>
    IL_001d: stloc.1      // results

    // [39 14 - 39 23]
    IL_001e: ldc.i4.0
    IL_001f: stloc.2      // i

    IL_0020: br.s         IL_003f
    // start of loop, entry point: IL_003f

      // [41 13 - 41 55]
      IL_0022: ldloc.1      // results
      IL_0023: ldloc.2      // i
      IL_0024: ldloca.s     doubleVectors
      IL_0026: ldloc.2      // i
      IL_0027: call         instance !0/*valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>*/& valuetype [System.Runtime]System.Span`1<valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>>::get_Item(int32)
      IL_002c: ldobj        valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>
      IL_0031: call         valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64> [System.Numerics.Vectors]System.Numerics.Vector::Log(valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>)
      IL_0036: stelem       valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>

      // [39 51 - 39 54]
      IL_003b: ldloc.2      // i
      IL_003c: ldc.i4.1
      IL_003d: add
      IL_003e: stloc.2      // i

      // [39 25 - 39 49]
      IL_003f: ldloc.2      // i
      IL_0040: ldloca.s     doubleVectors
      IL_0042: call         instance int32 valuetype [System.Runtime]System.Span`1<valuetype [System.Numerics.Vectors]System.Numerics.Vector`1<float64>>::get_Length()
      IL_0047: blt.s        IL_0022
    // end of loop

    // [44 9 - 44 24]
    IL_0049: ldloc.1      // results
    IL_004a: ret

  } // end of method VectorLogBench::LogV
like image 60
Panagiotis Kanavos Avatar answered Oct 26 '25 05:10

Panagiotis Kanavos



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!