For a dynamic binary translation simulator, I need to generate collectible .NET assemblies with classes that access static fields. However, when static fields are used inside collectible assemblies, execution is slower by a factor of 2-3 compared to non-collectible assemblies. This phenomenon does not occur in collectible assemblies that do not use static fields.
In the code below, the method MyMethod of the abstract class AbstrTest is implemented by collectible and non-collectible dynamic assemblies. With CreateTypeConst, MyMethod multiplies the ulong argument value by a constant value of two, while with CreateTypeField the second factor is taken from a static field MyField that is initialized in the constructor. To obtain realistic results, the MyMethod results are accumulated in a for loop.
Here are the measurement results (.NET CLR 4.5/4.6):
Testing non-collectible const multiply:
Elapsed: 8721.2867 ms
Testing collectible const multiply:
Elapsed: 8696.8124 ms
Testing non-collectible field multiply:
Elapsed: 10151.6921 ms
Testing collectible field multiply:
Elapsed: 33404.4878 ms
Here is my reproducer code:
using System;
using System.Reflection;
using System.Reflection.Emit;
using System.Diagnostics;
/// <summary>
/// Abstract base class for the benchmarked types. The classes emitted by
/// <c>DerivedClassBuilder</c> derive from it, so the benchmark can call the
/// generated code through an ordinary virtual method call.
/// </summary>
public abstract class AbstrTest {
    /// <summary>
    /// Computes a value from <paramref name="x"/>; the computation is supplied
    /// by the derived class.
    /// </summary>
    /// <param name="x">The input operand.</param>
    /// <returns>The value computed by the derived implementation.</returns>
    public abstract ulong MyMethod(ulong x);
}
/// <summary>
/// Benchmark reproducer: emits <see cref="AbstrTest"/> subclasses into dynamic
/// assemblies (collectible and non-collectible) and times a long loop of
/// <c>MyMethod</c> calls for each variant.
/// </summary>
public class DerivedClassBuilder {
    /// <summary>Number of <c>MyMethod</c> calls performed per measurement.</summary>
    private const uint Iterations = 0xffffffff;

    /// <summary>
    /// Creates a dynamic assembly and module called <paramref name="name"/> and
    /// starts the definition of a public class "MyClass" deriving from
    /// <see cref="AbstrTest"/>.
    /// </summary>
    /// <param name="name">Name used for both the assembly and its module.</param>
    /// <param name="collect">
    /// <c>true</c> to use <see cref="AssemblyBuilderAccess.RunAndCollect"/>,
    /// <c>false</c> to use <see cref="AssemblyBuilderAccess.Run"/>.
    /// </param>
    /// <returns>The unfinished type builder for "MyClass".</returns>
    private static TypeBuilder DefineDerivedType(string name, bool collect) {
        AssemblyName assemblyName = new AssemblyName();
        assemblyName.Name = name;
        AssemblyBuilderAccess access =
            collect ? AssemblyBuilderAccess.RunAndCollect : AssemblyBuilderAccess.Run;
        // The static factory is used instead of AppDomain.DefineDynamicAssembly,
        // which is only available on .NET Framework.
        AssemblyBuilder assembly = AssemblyBuilder.DefineDynamicAssembly(assemblyName, access);
        ModuleBuilder module = assembly.DefineDynamicModule(name);
        return module.DefineType("MyClass", TypeAttributes.Public, typeof(AbstrTest));
    }

    /// <summary>
    /// Defines the public virtual <c>ulong MyMethod(ulong)</c> method on
    /// <paramref name="typeBuilder"/>.
    /// </summary>
    /// <param name="typeBuilder">The type that receives the method.</param>
    /// <returns>The IL generator for the (still empty) method body.</returns>
    private static ILGenerator DefineMyMethod(TypeBuilder typeBuilder) {
        MethodBuilder method = typeBuilder.DefineMethod("MyMethod",
            MethodAttributes.Public | MethodAttributes.ReuseSlot | MethodAttributes.Virtual | MethodAttributes.HideBySig,
            typeof(ulong), new Type[] { typeof(ulong) });
        return method.GetILGenerator();
    }

    /// <summary>
    /// Builds a class whose <c>MyMethod</c> multiplies its argument by the
    /// constant two.
    /// </summary>
    /// <param name="name">Name of the dynamic assembly and module.</param>
    /// <param name="collect">Whether the assembly is collectible.</param>
    /// <returns>The finished runtime type.</returns>
    private static Type CreateTypeConst(string name, bool collect) {
        TypeBuilder typeBuilder = DefineDerivedType(name, collect);

        // return x * (ulong)2;
        ILGenerator methodIL = DefineMyMethod(typeBuilder);
        methodIL.Emit(OpCodes.Ldarg_1);
        methodIL.Emit(OpCodes.Ldc_I4_2);
        methodIL.Emit(OpCodes.Conv_U8);
        methodIL.Emit(OpCodes.Mul);
        methodIL.Emit(OpCodes.Ret);

        return typeBuilder.CreateType();
    }

    /// <summary>
    /// Builds a class whose <c>MyMethod</c> multiplies its argument by the
    /// private static field "MyField". The field is set to two by the
    /// parameterless instance constructor.
    /// </summary>
    /// <param name="name">Name of the dynamic assembly and module.</param>
    /// <param name="collect">Whether the assembly is collectible.</param>
    /// <returns>The finished runtime type.</returns>
    private static Type CreateTypeField(string name, bool collect) {
        TypeBuilder typeBuilder = DefineDerivedType(name, collect);

        // Define a private static ulong field named "MyField" in the type.
        FieldBuilder fieldBuilder = typeBuilder.DefineField("MyField",
            typeof(ulong), FieldAttributes.Private | FieldAttributes.Static);

        // Create the constructor: call the base constructor, then MyField = 2.
        ConstructorBuilder constructor = typeBuilder.DefineConstructor(
            MethodAttributes.Public | MethodAttributes.SpecialName | MethodAttributes.RTSpecialName | MethodAttributes.HideBySig,
            CallingConventions.Standard, Type.EmptyTypes);
        ConstructorInfo superConstructor = typeof(AbstrTest).GetConstructor(
            BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance,
            null, Type.EmptyTypes, null);
        ILGenerator constructorIL = constructor.GetILGenerator();
        constructorIL.Emit(OpCodes.Ldarg_0);
        constructorIL.Emit(OpCodes.Call, superConstructor);
        constructorIL.Emit(OpCodes.Ldc_I4_2);
        constructorIL.Emit(OpCodes.Conv_U8);
        constructorIL.Emit(OpCodes.Stsfld, fieldBuilder);
        constructorIL.Emit(OpCodes.Ret);

        // return x * MyField;
        ILGenerator methodIL = DefineMyMethod(typeBuilder);
        methodIL.Emit(OpCodes.Ldarg_1);
        methodIL.Emit(OpCodes.Ldsfld, fieldBuilder);
        methodIL.Emit(OpCodes.Mul);
        methodIL.Emit(OpCodes.Ret);

        return typeBuilder.CreateType();
    }

    /// <summary>
    /// Prints <paramref name="label"/>, creates the type and an instance of it,
    /// then times <see cref="Iterations"/> calls to <c>MyMethod</c> and prints
    /// the elapsed time in milliseconds.
    /// </summary>
    /// <param name="label">Heading printed before the measurement.</param>
    /// <param name="createType">
    /// Factory for the type under test. It is invoked after the heading is
    /// printed and before the stopwatch starts, so type generation is not timed.
    /// </param>
    private static void RunBenchmark(string label, Func<Type> createType) {
        Console.WriteLine(label);
        AbstrTest instance = (AbstrTest)Activator.CreateInstance(createType());

        Stopwatch stopwatch = Stopwatch.StartNew();
        // The results are accumulated so that each call's return value is consumed.
        ulong accu = 0;
        for (uint i = 0; i < Iterations; i++) {
            accu += instance.MyMethod(i);
        }
        stopwatch.Stop();

        Console.WriteLine("Elapsed: " + stopwatch.Elapsed.TotalMilliseconds + " ms");
    }

    /// <summary>
    /// Runs the four measurements: constant and static-field multiply, each in
    /// a non-collectible and a collectible assembly.
    /// </summary>
    public static void Main() {
        try {
            RunBenchmark("Testing non-collectible const multiply:",
                () => CreateTypeConst("MyClassModule0", false));
            RunBenchmark("Testing collectible const multiply:",
                () => CreateTypeConst("MyClassModule1", true));
            RunBenchmark("Testing non-collectible field multiply:",
                () => CreateTypeField("MyClassModule2", false));
            RunBenchmark("Testing collectible field multiply:",
                () => CreateTypeField("MyClassModule3", true));
        }
        catch (Exception e) {
            Console.WriteLine("Exception Caught " + e.Message);
        }
    }
}
So my question is: Why is it slower?
Yes, this is a pretty inevitable consequence of the way static variables are allocated. I'll first describe how to put the "visual" back into Visual Studio: you only have a shot at diagnosing perf problems like this when you can look at the machine code that the jitter generates.
That's tricky to do for Reflection.Emit code, you can't step through the delegate call nor do you have any way to find exactly where the code is generated. What you want to do is inject a call to Debugger.Break() so the debugger stops at the exact right spot. So:
ILGenerator methodIL = myMethodBuilder.GetILGenerator();
// Look up Debugger.Break() and emit a call to it as the first instruction of
// the generated method, so an attached debugger stops inside the jitted code.
var brk = typeof(Debugger).GetMethod("Break");
methodIL.Emit(OpCodes.Call, brk);
methodIL.Emit(OpCodes.Ldarg_1);
// ... the rest of the method body is emitted exactly as before.
Change the loop repeats to 1. Tools > Options > Debugging > General. Untick "Just My Code" and "Suppress JIT optimization". Debug tab > tick "Enable native code debugging". Switch to the Release build. I'll post the 32-bit code, it is more fun since the x64 jitter can do a much better job.
The machine code for the "Testing non-collectible field multiply" test looks like:
01410E70 push dword ptr [ebp+0Ch] ; Ldarg_1, high 32-bits
01410E73 push dword ptr [ebp+8] ; Ldarg_1, low 32-bits
01410E76 push dword ptr ds:[13A6528h] ; myFieldBuilder, high 32-bits
01410E7C push dword ptr ds:[13A6524h] ; myFieldBuilder, low 32-bits
01410E82 call @JIT_LMul@16 (73AE1C20h) ; 64 bit multiply
Nothing very drastic is going on: it calls into a CLR helper method to perform a 64-bit multiply. The x64 jitter can do it with a single IMUL instruction. Note the access to the static variable defined through myFieldBuilder: it has a hard-coded address, 0x13A6524. It will be different on your machine. This is very efficient.
Now the disappointing one:
059F0480 push dword ptr [ebp+0Ch] ; Ldarg_1, high 32-bits
059F0483 push dword ptr [ebp+8] ; Ldarg_1, low 32-bits
059F0486 mov ecx,59FC8A0h ; arg2 = DynamicClassDomainId
059F048B xor edx,edx ; arg1 = DomainId
059F048D call JIT_GetSharedNonGCStaticBaseDynamicClass (73E0A6C7h)
059F0492 push dword ptr [eax+8] ; @myFieldBuilder, high 32-bits
059F0495 push dword ptr [eax+4] ; @myFieldBuilder, low 32-bits
059F0498 call @JIT_LMul@16 (73AE1C20h) ; 64-bit multiply
You can tell why it is slower from half a mile away: there's an extra call to JIT_GetSharedNonGCStaticBaseDynamicClass. It is a helper function inside the CLR that was specifically designed to deal with static variables used in Reflection.Emit code that was built with AssemblyBuilderAccess.RunAndCollect. You can see the source today, it is here. It makes everybody's eyes bleed, but it is a function that maps an AppDomain identifier and a dynamic class identifier (aka type handle) to an allocated piece of memory that stores static variables.
In the "non-collectible" version the jitter knows the specific address where the static variable is stored. It allocated the variable when it jitted the code from an internal structure called the "loader heap", associated with the AppDomain. Knowing the exact address of the variable, it can directly emit the address of the variable in the machine code. Very efficient of course, there is no possible way to do this faster.
But that cannot work in the "collectible" version, it doesn't just have to garbage collect the machine code but also the static variables. That can only work when the storage is allocated dynamically. So it can dynamically be released. The extra indirection, compare it to a Dictionary, is what makes the code slower.
You'll perhaps now appreciate the reason why .NET assemblies (and code) cannot be unloaded unless the AppDomain is unloaded. It is a very, very important perf optimization.
I'm not sure what kind of recommendation you'd like going forward. One option is to take care of static variable storage yourself, using a class with instance fields. There is no problem getting those collected. It still won't be as fast, since it takes an extra indirection, but it is definitely faster than letting the CLR take care of it.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With