Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Improve Binary Serialization Performance for large List of structs

I have a structure holding 3d co-ordinates in 3 ints. In a test I've put together a List<> of 1 million random points and then used Binary serialization to a memory stream.

The memory stream is coming in at ~21 MB, which seems very inefficient, as 1,000,000 points * 3 coords * 4 bytes should come out at about 11 MB minimum.

It's also taking ~3 seconds on my test rig.

Any ideas for improving performance and/or size?

(I don't have to keep the ISerializable interface if it helps; I could write out directly to a memory stream.)

EDIT - From answers below I've put together a serialization showdown comparing BinaryFormatter, 'Raw' BinaryWriter and Protobuf

using System;
using System.Text;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.IO;
using ProtoBuf;

namespace asp_heatmap.test
{
    /// <summary>
    /// Container for a list of 3D integer points, instrumented so the same data
    /// can be serialized three ways: .NET <c>BinaryFormatter</c> (via
    /// <see cref="ISerializable"/>), protobuf-net (via the Proto* attributes) and
    /// a hand-written <see cref="BinaryWriter"/>/<see cref="BinaryReader"/> format.
    /// </summary>
    [Serializable()] // For .NET BinaryFormatter
    [ProtoContract] // For Protobuf
    public class Coordinates : ISerializable
    {
        /// <summary>A single point made of three 32-bit integer coordinates.</summary>
        [Serializable()]
        [ProtoContract]
        public struct CoOrd
        {
            /// <summary>Creates a point from its three coordinates.</summary>
            public CoOrd(int x, int y, int z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            [ProtoMember(1)]
            public int x;
            [ProtoMember(2)]
            public int y;
            [ProtoMember(3)]
            public int z;
        }

        /// <summary>Number of random points generated by <see cref="SetupTestArray"/>.</summary>
        private const int TestPointCount = 1000000;

        /// <summary>Creates an instance with an empty point list.</summary>
        internal Coordinates()
        {
        }

        /// <summary>The points held by this instance.</summary>
        [ProtoMember(1)]
        public List<CoOrd> Coords = new List<CoOrd>();

        /// <summary>
        /// Appends one million points with random non-negative coordinates to
        /// <see cref="Coords"/>.
        /// </summary>
        public void SetupTestArray()
        {
            Random r = new Random();
            for (int i = 0; i < TestPointCount; i++)
            {
                Coords.Add(new CoOrd(r.Next(), r.Next(), r.Next()));
            }
        }

        #region Using Framework Binary Formatter Serialization

        /// <summary>Stores the point list under the key "Coords" for the formatter.</summary>
        void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
        {
            info.AddValue("Coords", this.Coords);
        }

        /// <summary>Deserialization constructor: restores the list stored under "Coords".</summary>
        internal Coordinates(SerializationInfo info, StreamingContext context)
        {
            this.Coords = (List<CoOrd>)info.GetValue("Coords", typeof(List<CoOrd>));
        }

        #endregion

        # region 'Raw' Binary Writer serialization

        /// <summary>
        /// Writes the point count followed by x, y, z of every point as raw
        /// 32-bit integers into a new <see cref="MemoryStream"/>.
        /// </summary>
        /// <returns>
        /// The stream, positioned at its end; callers must reset
        /// <c>Position</c> before reading from it.
        /// </returns>
        public MemoryStream RawSerializeToStream()
        {
            // Pre-size the buffer: 3 ints (4 bytes each) per point plus the 4-byte count header.
            MemoryStream stream = new MemoryStream(Coords.Count * 3 * 4 + 4);

            // The writer is deliberately not disposed: disposing it would close the
            // stream that is being handed back to the caller.
            BinaryWriter writer = new BinaryWriter(stream);
            writer.Write(Coords.Count);
            foreach (CoOrd point in Coords)
            {
                writer.Write(point.x);
                writer.Write(point.y);
                writer.Write(point.z);
            }

            // Make sure nothing is left in the writer before the stream escapes.
            writer.Flush();
            return stream;
        }

        /// <summary>
        /// Rebuilds the point list from a stream produced by
        /// <see cref="RawSerializeToStream"/>, reading from the stream's current position.
        /// </summary>
        /// <param name="stream">
        /// Stream to read from. NOTE: it is closed when this constructor returns,
        /// because disposing the <see cref="BinaryReader"/> also disposes the stream.
        /// </param>
        public Coordinates(MemoryStream stream)
        {
            using (BinaryReader reader = new BinaryReader(stream))
            {
                int count = reader.ReadInt32();
                Coords = new List<CoOrd>(count);
                for (int i = 0; i < count; i++)
                {
                    // Arguments are evaluated left to right, so the ints are consumed in x, y, z order.
                    Coords.Add(new CoOrd(reader.ReadInt32(), reader.ReadInt32(), reader.ReadInt32()));
                }
            }
        }
        #endregion
    }

    /// <summary>
    /// Round-trips a <see cref="Coordinates"/> instance through each serializer,
    /// prints the serialized size, and verifies that the number of points
    /// survives the round trip.
    /// </summary>
    [TestClass]
    public class SerializationTest
    {
        /// <summary>Round trip through the framework <c>BinaryFormatter</c>.</summary>
        [TestMethod]
        public void TestBinaryFormatter()
        {
            Coordinates c = new Coordinates();
            c.SetupTestArray();

            // 'using' guarantees the stream is released even if (de)serialization throws.
            using (MemoryStream mStream = new MemoryStream())
            {
                // Serialize to memory stream
                BinaryFormatter bformatter = new BinaryFormatter();
                bformatter.Serialize(mStream, c);
                Console.WriteLine("Length : {0}", mStream.Length);

                // Now Deserialize
                mStream.Position = 0;
                Coordinates c2 = (Coordinates)bformatter.Deserialize(mStream);
                Console.Write(c2.Coords.Count);

                Assert.AreEqual(c.Coords.Count, c2.Coords.Count);
            }
        }

        /// <summary>Round trip through the hand-written BinaryWriter/BinaryReader format.</summary>
        [TestMethod]
        public void TestBinaryWriter()
        {
            Coordinates c = new Coordinates();
            c.SetupTestArray();

            using (MemoryStream mStream = c.RawSerializeToStream())
            {
                Console.WriteLine("Length : {0}", mStream.Length);

                // Now Deserialize
                mStream.Position = 0;
                Coordinates c2 = new Coordinates(mStream);
                Console.Write(c2.Coords.Count);

                Assert.AreEqual(c.Coords.Count, c2.Coords.Count);
            }
        }

        /// <summary>Round trip through protobuf-net v2.</summary>
        [TestMethod]
        public void TestProtoBufV2()
        {
            Coordinates c = new Coordinates();
            c.SetupTestArray();

            using (MemoryStream mStream = new MemoryStream())
            {
                ProtoBuf.Serializer.Serialize(mStream, c);
                Console.WriteLine("Length : {0}", mStream.Length);

                mStream.Position = 0;
                Coordinates c2 = ProtoBuf.Serializer.Deserialize<Coordinates>(mStream);
                Console.Write(c2.Coords.Count);

                Assert.AreEqual(c.Coords.Count, c2.Coords.Count);
            }
        }
    }
}

Results (Note PB v2.0.0.423 beta)

                Serialize | Ser + Deserialize    | Size
-----------------------------------------------------------          
BinaryFormatter    2.89s  |      26.00s !!!      | 21.0 MB
ProtoBuf v2        0.52s  |       0.83s          | 18.7 MB
Raw BinaryWriter   0.27s  |       0.36s          | 11.4 MB

Obviously this is just looking at speed/size and doesn't take into account anything else.

like image 359
Ryan Avatar asked Jun 25 '11 15:06

Ryan


2 Answers

Binary serialisation using BinaryFormatter includes type information in the bytes it generates. This takes up additional space. It's useful in cases where you don't know what structure of data to expect at the other end, for example.

In your case, you know what format the data has at both ends, and that doesn't sound like it'd change. So you can write a simple encode and decode method. Your CoOrd struct no longer needs to be serializable either.

I would use System.IO.BinaryReader and System.IO.BinaryWriter, then loop through each of your CoOrd instances and read/write the X, Y, Z property values to the stream. Note that BinaryWriter.Write(int) always emits exactly four bytes per int, so this gives the fixed ~11 MB you calculated; getting below that when many of your numbers are small (e.g. under 0x7F or 0x7FFF) would need a variable-length encoding on top.

Something like this:

// Layout: a 4-byte point count, then X, Y, Z for each point in list order.
// Leaving the using block disposes the writer, which also closes `stream`.
using (var output = new BinaryWriter(stream)) {
    int total = points.Count;
    // length prefix so the reader knows how many points follow
    output.Write(total);
    for (int i = 0; i < total; i++) {
        var current = points[i];
        output.Write(current.X);
        output.Write(current.Y);
        output.Write(current.Z);
    }
}

To read from the stream:

List<CoOrd> points;
// Leaving the using block disposes the reader, which also closes `stream`.
using (var input = new BinaryReader(stream)) {
    // The stream starts with the number of points that follow.
    int remaining = input.ReadInt32();
    points = new List<CoOrd>(remaining);
    while (remaining-- > 0) {
        // Arguments are evaluated left to right, so the ints are consumed in X, Y, Z order.
        points.Add(new CoOrd(input.ReadInt32(), input.ReadInt32(), input.ReadInt32()));
    }
}
like image 55
Drew Noakes Avatar answered Nov 01 '22 18:11

Drew Noakes


For the simplicity of using a pre-built serializer, I recommend protobuf-net; here is protobuf-net v2, with just some attributes added:

/// <summary>
/// Container for a list of 3D integer points, annotated with data-contract
/// attributes whose <c>Order</c> values give each member its position.
/// </summary>
[DataContract]
public class Coordinates
{
    /// <summary>A single point made of three 32-bit integer coordinates.</summary>
    [DataContract]
    public struct CoOrd
    {
        /// <summary>Creates a point from its three coordinates.</summary>
        public CoOrd(int x, int y, int z)
        {
            this.x = x;
            this.y = y;
            this.z = z;
        }
        [DataMember(Order = 1)]
        int x;
        [DataMember(Order = 2)]
        int y;
        [DataMember(Order = 3)]
        int z;
    }

    /// <summary>Number of points generated by <see cref="SetupTestArray"/>.</summary>
    private const int TestPointCount = 1000000;

    /// <summary>Exclusive upper bound for every generated coordinate.</summary>
    private const int MaxCoordinate = 10000;

    /// <summary>The points held by this instance.</summary>
    [DataMember(Order = 1)]
    public List<CoOrd> Coords = new List<CoOrd>();

    /// <summary>
    /// Appends one million pseudo-random points to <see cref="Coords"/>, each
    /// coordinate in the range [0, 10000). The generator uses a fixed seed, so
    /// repeated runs produce the same sequence.
    /// </summary>
    public void SetupTestArray()
    {
        Random r = new Random(123456);
        for (int i = 0; i < TestPointCount; i++)
        {
            Coords.Add(new CoOrd(r.Next(MaxCoordinate), r.Next(MaxCoordinate), r.Next(MaxCoordinate)));
        }
    }
}

using:

// Serialize the Coordinates instance `c` into the stream `mStream`.
ProtoBuf.Serializer.Serialize(mStream, c);

to serialize. This takes 10,960,823 bytes, but note that I tweaked SetupTestArray to limit the size to 10,000 since by default it uses "varint" encoding on the integers, which depends on the size. 10k isn't important here (in fact I didn't check what the "steps" are). If you prefer a fixed size (which will allow any range):

        // Replacement attributes for the three CoOrd fields: FixedSize selects a
        // fixed-width encoding instead of the default value-dependent "varint" encoding.
        [ProtoMember(1, DataFormat = DataFormat.FixedSize)]
        int x;
        [ProtoMember(2, DataFormat = DataFormat.FixedSize)]
        int y;
        [ProtoMember(3, DataFormat = DataFormat.FixedSize)]
        int z;

Which takes 16,998,640 bytes

like image 27
Marc Gravell Avatar answered Nov 01 '22 18:11

Marc Gravell