I have recently been running benchmarks of Java vs C# in which 1000 tasks are scheduled over a thread pool. The server has 4 physical processors, each with 8 cores. It runs Windows Server 2008 and has 32 GB of memory, and each CPU is a Xeon X7550 (Westmere/Nehalem-C).
In short, the Java implementation is much faster than C# at 4 threads but much slower as the number of threads increases. It also seems that C# becomes quicker per iteration as the thread count increases. Graphs are included in this post:
The Java implementation was run on a 64-bit HotSpot JVM with Java 7, using an ExecutorService thread pool I found online (see below). I also set the JVM to use the concurrent GC.
The C# implementation was written on .NET 3.5, and the thread pool came from CodeProject: http://www.codeproject.com/Articles/7933/Smart-Thread-Pool
(I have included the code below).
My questions:
1) Why is Java getting slower but C# is getting quicker?
2) Why do the execution times of C# fluctuate greatly? (This is our main question)
We did wonder whether the C# fluctuation was caused by the memory bus being maxed out....
Code (please do not point out errors with locking; they are irrelevant to my aims):
Java
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Thread-pool benchmark: submits a fixed number of memory-bound tasks to a
 * fixed-size {@link ExecutorService} and records how long each task takes.
 *
 * <p>Usage: {@code java PoolDemo <iterations> <threads>}. The duration of every
 * task, in nanoseconds, is written to {@code server_testing.csv}, one per line.
 */
public class PoolDemo {
    /** Shortest task duration seen so far, in nanoseconds. */
    static long FastestMemory = Long.MAX_VALUE;
    /** Longest task duration seen so far, in nanoseconds. */
    static long SlowestMemory = 0;
    /** Sum of all recorded task durations, in nanoseconds. */
    static long TotalTime;
    /** Duration of each task in nanoseconds, indexed by task id. */
    static long[] FileArray;
    /** Not used by this class; kept so existing references to the field still compile. */
    static DataOutputStream outs;
    /** Underlying stream of the CSV result file. */
    static FileOutputStream fout;

    /**
     * Runs the benchmark.
     *
     * @param args {@code args[0]} is the number of tasks, {@code args[1]} the pool size
     * @throws IllegalArgumentException if an argument is missing or smaller than 1
     * @throws InterruptedException if the wait for the pool to finish is interrupted
     * @throws FileNotFoundException if {@code server_testing.csv} cannot be opened
     */
    public static void main(String[] args) throws InterruptedException, FileNotFoundException {
        if (args.length < 2) {
            throw new IllegalArgumentException("Usage: PoolDemo <iterations> <threads>");
        }
        int iterations = Integer.parseInt(args[0]);
        int threadSize = Integer.parseInt(args[1]);
        if (iterations < 1 || threadSize < 1) {
            // Zero iterations would otherwise end in a division by zero when averaging.
            throw new IllegalArgumentException("iterations and threads must both be at least 1");
        }

        // Start from a clean slate so that main can be invoked more than once per JVM.
        synchronized (PoolDemo.class) {
            FastestMemory = Long.MAX_VALUE;
            SlowestMemory = 0;
            TotalTime = 0;
            FileArray = new long[iterations];
        }

        // Open the result file first so an unwritable location fails before the
        // benchmark runs; try-with-resources closes it on every exit path.
        fout = new FileOutputStream("server_testing.csv");
        try (PrintStream csv = new PrintStream(fout)) {
            // fixed pool, unlimited queue
            ExecutorService service = Executors.newFixedThreadPool(threadSize);
            for (int i = 0; i < iterations; i++) {
                service.execute(new Task(i));
            }
            service.shutdown();
            if (!service.awaitTermination(90, TimeUnit.SECONDS)) {
                System.err.println("Warning: tasks were still running after 90 seconds; results are incomplete");
            }

            long fastest;
            long slowest;
            long total;
            long[] durations;
            // Take a consistent snapshot; workers may still be recording after a timeout.
            synchronized (PoolDemo.class) {
                fastest = FastestMemory;
                slowest = SlowestMemory;
                total = TotalTime;
                durations = FileArray.clone();
            }

            System.out.println("Fastest: " + fastest);
            System.out.println("Slowest: " + slowest);
            System.out.println("Average: " + total / iterations);
            for (long duration : durations) {
                csv.println(duration + ",");
            }
        }
    }

    /**
     * Folds one finished task into the shared statistics. Synchronized on the
     * class because worker threads call it concurrently and the updates are
     * read-modify-write operations.
     */
    private static synchronized void record(int id, long duration) {
        FileArray[id] = duration;
        TotalTime += duration;
        if (duration < FastestMemory) {
            FastestMemory = duration;
        }
        if (duration > SlowestMemory) {
            SlowestMemory = duration;
        }
    }

    /** One benchmark task: allocates three byte arrays, fills them and shuffles bytes between them. */
    private static class Task implements Runnable {
        private static final int SIZE_1 = 10000000;
        private static final int SIZE_2 = 2 * SIZE_1;
        private static final int SIZE_3 = SIZE_1;

        /** Value written into the arrays; a primitive so the loops do no unboxing. */
        static byte myByte = 0;

        private final int ID;

        public Task(int index) {
            this.ID = index;
        }

        @Override
        public void run() {
            long start = System.nanoTime();

            byte[] list1 = new byte[SIZE_1];
            byte[] list2 = new byte[SIZE_2];
            byte[] list3 = new byte[SIZE_3];

            for (int i = 0; i < SIZE_1; i++) {
                list1[i] = myByte;
            }
            // Only every second element of list2 is written.
            for (int i = 0; i < SIZE_2; i = i + 2) {
                list2[i] = myByte;
            }
            for (int i = 0; i < SIZE_3; i++) {
                byte temp = list1[i];
                byte temp2 = list2[i];
                list3[i] = temp;
                list2[i] = temp;
                list1[i] = temp2;
            }

            long duration = System.nanoTime() - start;

            // Bookkeeping and console output happen after the clock has stopped,
            // so the lock does not distort the measurement.
            record(this.ID, duration);
            System.out.println("Individual Time " + this.ID + " \t: " + (duration) + " nanoseconds");
        }
    }
}
C#:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using Amib.Threading;
using System.Diagnostics;
using System.IO;
using System.Runtime;
namespace ServerTesting
{
    /// <summary>
    /// Thread-pool benchmark: queues a number of memory-bound work items on a
    /// SmartThreadPool capped at a given thread count and records how long each
    /// item took, in milliseconds.
    /// </summary>
    /// <remarks>
    /// Usage: <c>ServerTesting &lt;iterations&gt; &lt;threads&gt;</c>. Per-item
    /// durations are written to <c>{threads}_{iterations}_time.csv</c>.
    /// </remarks>
    class Program
    {
        /// <summary>Guards the shared statistics, which worker threads update concurrently.</summary>
        private static readonly object StatsLock = new object();

        /// <summary>Shortest work-item duration seen so far, in milliseconds.</summary>
        static long FastestMemory = 2000000000;

        /// <summary>Longest work-item duration seen so far, in milliseconds.</summary>
        static long SlowestMemory = 0;

        /// <summary>Sum of all work-item durations, in milliseconds.</summary>
        static long TotalTime = 0;

        /// <summary>Duration of each work item in milliseconds, indexed by work-item number.</summary>
        static int[] FileOutput;

        /// <summary>Value written into the arrays by every work item.</summary>
        static byte myByte = 56;

        /// <summary>
        /// Parses the command line, runs the benchmark and writes the CSV file.
        /// </summary>
        /// <param name="args"><c>args[0]</c> is the iteration count, <c>args[1]</c> the thread count.</param>
        static void Main(string[] args)
        {
            int iterations;
            int threads;
            if (args.Length < 2
                || !Int32.TryParse(args[0], out iterations)
                || !Int32.TryParse(args[1], out threads)
                || iterations < 1
                || threads < 1)
            {
                Console.Error.WriteLine("Usage: ServerTesting <iterations> <threads> (both positive integers)");
                return;
            }

            // IsServerGC reports the server/workstation GC flavour, not whether
            // concurrent collection is enabled, so label it as what it is.
            Console.WriteLine("Server GC enabled: " + GCSettings.IsServerGC);

            FileOutput = new int[iterations];

            // Open the file before the run so an unwritable location fails early;
            // the using block closes it on every exit path.
            using (StreamWriter timeFile = new StreamWriter(threads + "_" + iterations + "_" + "time.csv"))
            {
                TestMemory(threads, iterations);
                for (int j = 0; j < iterations; j++)
                {
                    timeFile.WriteLine(FileOutput[j] + ",");
                }
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        /// <summary>
        /// Queues <paramref name="iterations"/> work items on a pool limited to
        /// <paramref name="threads"/> threads, waits for the pool to go idle and
        /// prints the aggregate timings.
        /// </summary>
        private static void TestMemory(int threads, int iterations)
        {
            SmartThreadPool pool = new SmartThreadPool();
            pool.MaxThreads = threads;
            Console.WriteLine("Launching " + iterations + " calculators with " + pool.MaxThreads + " threads");
            for (int i = 0; i < iterations; i++)
            {
                pool.QueueWorkItem(new WorkItemCallback(MemoryIntensiveTask), i);
            }
            pool.WaitForIdle();

            long total;
            long fastest;
            long slowest;
            lock (StatsLock)
            {
                total = TotalTime;
                fastest = FastestMemory;
                slowest = SlowestMemory;
            }

            // Cast before dividing; long / int would truncate the average.
            double avg = iterations > 0 ? (double)total / iterations : 0.0;
            Console.WriteLine("Avg Memory Time : " + avg);
            Console.WriteLine("Fastest: " + fastest + " ms");
            Console.WriteLine("Slowest: " + slowest + " ms");
        }

        /// <summary>
        /// Work item body: allocates three byte arrays, fills them, shuffles bytes
        /// between them, and records the elapsed wall-clock time.
        /// </summary>
        /// <param name="args">Boxed <see cref="int"/> index of this work item.</param>
        /// <returns>Always <c>null</c>; the result is stored in the static fields.</returns>
        private static object MemoryIntensiveTask(object args)
        {
            const int Size1 = 10000000;
            const int Size2 = 2 * Size1;
            const int Size3 = Size1;

            // Stopwatch is a monotonic, high-resolution timer; DateTime.Now is
            // neither, and is not meant for measuring intervals.
            Stopwatch timer = Stopwatch.StartNew();

            byte[] list1 = new byte[Size1];
            byte[] list2 = new byte[Size2];
            byte[] list3 = new byte[Size3];

            for (int i = 0; i < Size1; i++)
            {
                list1[i] = myByte;
            }
            // Only every second element of list2 is written.
            for (int i = 0; i < Size2; i = i + 2)
            {
                list2[i] = myByte;
            }
            for (int i = 0; i < Size3; i++)
            {
                byte temp = list1[i];
                byte temp2 = list2[i];
                list3[i] = temp;
                list2[i] = temp;
                list1[i] = temp2;
            }

            timer.Stop();

            // Total elapsed milliseconds. TimeSpan.Milliseconds is only the 0-999
            // component, so it wraps around for any run of one second or longer.
            long duration = timer.ElapsedMilliseconds;

            Console.WriteLine("Individual Time " + args + " \t: " + duration);
            FileOutput[(int)args] = (int)duration;

            // The statistics are read-modify-write updates shared by all workers.
            // The lock is taken after the timer has stopped, so it does not
            // distort the measurement.
            lock (StatsLock)
            {
                TotalTime += duration;
                if (duration < FastestMemory)
                {
                    FastestMemory = duration;
                }
                if (duration > SlowestMemory)
                {
                    SlowestMemory = duration;
                }
            }
            return null;
        }
    }
}
You don't appear to be testing the threading framework so much as testing how each language optimises unoptimised code.
Java is particularly good at optimising pointless code, which I believe would explain the difference between the languages. As the number of threads grows, I suspect the bottleneck moves to how the GC performs or to something else incidental to your test.
Java could also be slowing down because it's not NUMA-aware by default. Try running with -XX:+UseNUMA
However, I suggest that for maximum performance you try to keep each process within a single NUMA region to avoid cross-NUMA overhead.
You can also try this slightly optimised code, which was 40% faster on my machine:
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Thread-pool benchmark that reuses per-thread buffers: submits a fixed number
 * of memory-bound tasks to a fixed-size {@link ExecutorService} and records how
 * long each task takes.
 *
 * <p>Usage: {@code java PoolDemo <iterations> <threads>}. The duration of every
 * task, in nanoseconds, is written to {@code server_testing.csv}, one per line.
 */
public class PoolDemo {
    /** Shortest task duration seen so far, in nanoseconds. */
    static long FastestMemory = Long.MAX_VALUE;
    /** Longest task duration seen so far, in nanoseconds. */
    static long SlowestMemory = 0;
    /** Sum of all recorded task durations, in nanoseconds. */
    static long TotalTime;
    /** Duration of each task in nanoseconds, indexed by task id. */
    static long[] FileArray;
    /** Underlying stream of the CSV result file. */
    static FileOutputStream fout;

    /**
     * Runs the benchmark.
     *
     * @param args {@code args[0]} is the number of tasks, {@code args[1]} the pool size
     * @throws IllegalArgumentException if an argument is missing or smaller than 1
     * @throws InterruptedException if the wait for the pool to finish is interrupted
     * @throws FileNotFoundException if {@code server_testing.csv} cannot be opened
     */
    public static void main(String[] args) throws InterruptedException, FileNotFoundException {
        if (args.length < 2) {
            throw new IllegalArgumentException("Usage: PoolDemo <iterations> <threads>");
        }
        int iterations = Integer.parseInt(args[0]);
        int threadSize = Integer.parseInt(args[1]);
        if (iterations < 1 || threadSize < 1) {
            // Zero iterations would otherwise end in a division by zero when averaging.
            throw new IllegalArgumentException("iterations and threads must both be at least 1");
        }

        // Start from a clean slate so that main can be invoked more than once per JVM.
        synchronized (PoolDemo.class) {
            FastestMemory = Long.MAX_VALUE;
            SlowestMemory = 0;
            TotalTime = 0;
            FileArray = new long[iterations];
        }

        // try-with-resources closes the result file on every exit path,
        // including an interrupted wait.
        fout = new FileOutputStream("server_testing.csv");
        try (PrintStream ps = new PrintStream(fout)) {
            // fixed pool, unlimited queue
            ExecutorService service = Executors.newFixedThreadPool(threadSize);
            for (int i = 0; i < iterations; i++) {
                service.execute(new Task(i));
            }
            service.shutdown();
            boolean finished = service.awaitTermination(90, TimeUnit.SECONDS);
            if (!finished) {
                System.err.println("Warning: tasks were still running after 90 seconds; results are incomplete");
            }

            long fastest;
            long total;
            long[] durations;
            // Take a consistent snapshot; workers may still be recording after a timeout.
            synchronized (PoolDemo.class) {
                fastest = FastestMemory;
                total = TotalTime;
                durations = FileArray.clone();
            }

            System.out.println("Fastest: " + fastest);
            System.out.println("Average: " + total / iterations);
            for (long aFileArray : durations) {
                ps.println(aFileArray + ",");
            }
        }
    }

    /**
     * Folds one finished task into the shared statistics. Synchronized on the
     * class because worker threads call it concurrently and the updates are
     * read-modify-write operations.
     */
    private static synchronized void record(int id, long duration) {
        FileArray[id] = duration;
        TotalTime += duration;
        if (duration < FastestMemory) {
            FastestMemory = duration;
        }
        if (duration > SlowestMemory) {
            SlowestMemory = duration;
        }
    }

    /** A thread-local byte array of a fixed size, created on a thread's first {@code get()}. */
    static class ThreadLocalBytes extends ThreadLocal<byte[]> {
        private final int bytes;

        ThreadLocalBytes(int bytes) {
            this.bytes = bytes;
        }

        @Override
        protected byte[] initialValue() {
            return new byte[bytes];
        }
    }

    /** One benchmark task: fills and shuffles the calling thread's three reusable buffers. */
    private static class Task implements Runnable {
        static final int Size1 = 10000000;
        static final int Size2 = 2 * Size1;
        static final int Size3 = Size1;

        // Buffers are held per thread and reused across tasks, so a task does
        // not allocate once its thread has run one before.
        private static final ThreadLocalBytes list1b = new ThreadLocalBytes(Size1);
        private static final ThreadLocalBytes list2b = new ThreadLocalBytes(Size2);
        private static final ThreadLocalBytes list3b = new ThreadLocalBytes(Size3);

        /** Value written into the buffers by every task. */
        static byte myByte = 0;

        private final int ID;

        public Task(int index) {
            this.ID = index;
        }

        @Override
        public void run() {
            long start = System.nanoTime();

            byte[] list1 = list1b.get();
            byte[] list2 = list2b.get();
            byte[] list3 = list3b.get();

            for (int i = 0; i < Size1; i++) {
                list1[i] = myByte;
            }
            // Only every second element of list2 is written.
            for (int i = 0; i < Size2; i = i + 2) {
                list2[i] = myByte;
            }
            for (int i = 0; i < Size3; i++) {
                byte temp = list1[i];
                byte temp2 = list2[i];
                list3[i] = temp;
                list2[i] = temp;
                list1[i] = temp2;
            }

            long duration = System.nanoTime() - start;

            // Bookkeeping and console output happen after the clock has stopped,
            // so the lock does not distort the measurement.
            record(this.ID, duration);
            System.out.println("Individual Time " + this.ID + " \t: " + (duration) + " nanoseconds");
        }
    }
}
If you like this site, you can donate via PayPal or buy us a coffee so that we can maintain and grow it. Thank you!
Donate Us With