 

Windows fsync (FlushFileBuffers) performance with large files

From information on ensuring data is on disk (http://winntfs.com/2012/11/29/windows-write-caching-part-2-an-overview-for-application-developers/), it appears that on Windows platforms you need to rely on its "fsync" equivalent, FlushFileBuffers, to get the best guarantee that buffers are actually flushed from the disk device's cache onto the storage medium itself, even in the case of e.g. a power outage. If that information is correct, the combination of FILE_FLAG_NO_BUFFERING with FILE_FLAG_WRITE_THROUGH does not ensure the device cache is flushed; it merely affects the file system cache.
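
For reference, here is a minimal sketch of the two approaches (the file name and buffer sizes are just placeholders): FileOptions.WriteThrough corresponds to FILE_FLAG_WRITE_THROUGH, while FileStream.Flush(true), available since .NET 4, issues FlushFileBuffers on the underlying handle.

using System.IO;

class DurabilitySketch
{
    static void WriteDurably()
    {
        var page = new byte[4096];

        // Write-through: bypasses the file system cache, but (per the article above)
        // does not necessarily force the disk device's own cache onto the medium.
        using (var f = new FileStream("data.bin", FileMode.OpenOrCreate, FileAccess.Write,
                                      FileShare.None, 4096, FileOptions.WriteThrough))
        {
            f.Write(page, 0, page.Length);
        }

        // Explicit "fsync": Flush(true) calls FlushFileBuffers on the underlying handle,
        // asking the OS to flush the device cache as well.
        using (var f = new FileStream("data.bin", FileMode.OpenOrCreate, FileAccess.Write))
        {
            f.Write(page, 0, page.Length);
            f.Flush(true);
        }
    }
}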

Given that I will be working with rather large files that need to be updated "transactionally", this means performing an "fsync" at the end of a transaction commit. So I created a tiny app to test the performance of doing so. It performs sequential writes of a batch of 8 memory-page-sized blocks of random bytes, using 8 writes followed by a flush. The batch is repeated in a loop, and after every so many written pages it records the performance. Additionally, it has two configurable options: whether to fsync on a flush, and whether to write a byte to the last position of the file before beginning the page writes.

// Code updated to reflect new results as discussed in answer below.
// 26/Aug/2013: Code updated again to reflect results as discussed in follow up question.
// 28/Aug/2013: Increased file stream buffer to ensure 8 page flushes.
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.MemoryMappedFiles;
using System.Runtime.InteropServices;
using Microsoft.Win32.SafeHandles;

class Program
{
    static void Main(string[] args)
    {
        BenchSequentialWrites(reuseExistingFile:false);
    }
    public static void BenchSequentialWrites(bool reuseExistingFile = false)
    {
        Tuple<string, bool, bool, bool, bool>[] scenarios = new Tuple<string, bool, bool, bool, bool>[]
        {   // output csv, fsync?, fill end?, write through?, mem map?
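            // File name codes: FS/NS = fsync or no sync, E/LB = end left empty or last byte written,
            // B/WT = buffered or write-through, F/MM = FileStream or memory-mapped writes.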
            Tuple.Create("timing FS-E-B-F.csv", true, false, false, false),
            Tuple.Create("timing NS-E-B-F.csv", false, false, false, false),
            Tuple.Create("timing FS-LB-B-F.csv", true, true, false, false),
            Tuple.Create("timing NS-LB-B-F.csv", false, true, false, false),
            Tuple.Create("timing FS-E-WT-F.csv", true, false, true, false),
            Tuple.Create("timing NS-E-WT-F.csv", false, false, true, false),
            Tuple.Create("timing FS-LB-WT-F.csv", true, true, true, false),
            Tuple.Create("timing NS-LB-WT-F.csv", false, true, true, false),
            Tuple.Create("timing FS-E-B-MM.csv", true, false, false, true),
            Tuple.Create("timing NS-E-B-MM.csv", false, false, false, true),
            Tuple.Create("timing FS-LB-B-MM.csv", true, true, false, true),
            Tuple.Create("timing NS-LB-B-MM.csv", false, true, false, true),
            Tuple.Create("timing FS-E-WT-MM.csv", true, false, true, true),
            Tuple.Create("timing NS-E-WT-MM.csv", false, false, true, true),
            Tuple.Create("timing FS-LB-WT-MM.csv", true, true, true, true),
            Tuple.Create("timing NS-LB-WT-MM.csv", false, true, true, true),
        };
        foreach (var scenario in scenarios)
        {
            Console.WriteLine("{0,-12} {1,-16} {2,-16} {3,-16} {4:F2}", "Total pages", "Interval pages", "Total time", "Interval time", "MB/s");
            CollectGarbage();
            var timingResults = SequentialWriteTest("test.data", !reuseExistingFile, fillEnd: scenario.Item3, nPages: 200 * 1000, fSync: scenario.Item2, writeThrough: scenario.Item4, writeToMemMap: scenario.Item5);
            using (var report = File.CreateText(scenario.Item1))
            {
                report.WriteLine("Total pages,Interval pages,Total bytes,Interval bytes,Total time,Interval time,MB/s");
                foreach (var entry in timingResults)
                {
                    Console.WriteLine("{0,-12} {1,-16} {2,-16} {3,-16} {4:F2}", entry.Item1, entry.Item2, entry.Item5, entry.Item6, entry.Item7);
                    report.WriteLine("{0},{1},{2},{3},{4},{5},{6}", entry.Item1, entry.Item2, entry.Item3, entry.Item4, entry.Item5.TotalSeconds, entry.Item6.TotalSeconds, entry.Item7);
                }
            }
        }
    }

    public unsafe static IEnumerable<Tuple<long, long, long, long, TimeSpan, TimeSpan, double>> SequentialWriteTest(
        string fileName,
        bool createNewFile,
        bool fillEnd,
        long nPages,
        bool fSync = true,
        bool writeThrough = false,
        bool writeToMemMap = false,
        long pageSize = 4096)
    {
        // create or open file and if requested fill in its last byte.
        var fileMode = createNewFile ? FileMode.Create : FileMode.OpenOrCreate;
        using (var tmpFile = new FileStream(fileName, fileMode, FileAccess.ReadWrite, FileShare.ReadWrite, (int)pageSize))
        {
            Console.WriteLine("Opening temp file with mode {0}{1}", fileMode, fillEnd ? " and writing last byte." : ".");
            tmpFile.SetLength(nPages * pageSize);
            if (fillEnd)
            {
                tmpFile.Position = tmpFile.Length - 1;
                tmpFile.WriteByte(1);
                tmpFile.Position = 0;
                tmpFile.Flush(true);
            }
        }
        // Make sure any flushing / activity has completed
        System.Threading.Thread.Sleep(TimeSpan.FromMinutes(1));
        System.Threading.Thread.SpinWait(50); // warm up.

        var buf = new byte[pageSize];
        new Random().NextBytes(buf);
        var ms = new System.IO.MemoryStream(buf);

        var stopwatch = new System.Diagnostics.Stopwatch();
        var timings = new List<Tuple<long, long, long, long, TimeSpan, TimeSpan, double>>();
        var pageTimingInterval = 8 * 2000;
        var prevPages = 0L;
        var prevElapsed = TimeSpan.FromMilliseconds(0);

        // Open file
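        // FILE_FLAG_NO_BUFFERING (0x20000000) is not exposed by the FileOptions enum, so it is cast from its raw value.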
        const FileOptions NoBuffering = ((FileOptions)0x20000000);
        var options = writeThrough ? (FileOptions.WriteThrough | NoBuffering) : FileOptions.None;
        using (var file = new FileStream(fileName, FileMode.Open, FileAccess.ReadWrite, FileShare.ReadWrite, (int)(16 *pageSize), options))
        {
            stopwatch.Start();
            if (writeToMemMap)
            {
                // write pages through memory map.
                using (var mmf = MemoryMappedFile.CreateFromFile(file, Guid.NewGuid().ToString(), file.Length, MemoryMappedFileAccess.ReadWrite, null, HandleInheritability.None, true))
                using (var accessor = mmf.CreateViewAccessor(0, file.Length, MemoryMappedFileAccess.ReadWrite))
                {
                    byte* base_ptr = null;
                    accessor.SafeMemoryMappedViewHandle.AcquirePointer(ref base_ptr);
                    var offset = 0L;
                    for (long i = 0; i < nPages / 8; i++)
                    {
                        using (var memStream = new UnmanagedMemoryStream(base_ptr + offset, 8 * pageSize, 8 * pageSize, FileAccess.ReadWrite))
                        {
                            for (int j = 0; j < 8; j++)
                            {
                                ms.CopyTo(memStream);
                                ms.Position = 0;
                            }
                        }
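                        // Flush the modified mapped pages to the file, then (if fSync) call FlushFileBuffers
                        // to push them from the device cache onto the medium.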
                        FlushViewOfFile((IntPtr)(base_ptr + offset), (int)(8 * pageSize));
                        offset += 8 * pageSize;
                        if (fSync)
                            FlushFileBuffers(file.SafeFileHandle);

                        if (((i + 1) * 8) % pageTimingInterval == 0)
                            timings.Add(Report(stopwatch.Elapsed, ref prevElapsed, (i + 1) * 8, ref prevPages, pageSize));
                    }
                    accessor.SafeMemoryMappedViewHandle.ReleasePointer();
                }
            }
            else
            {
                for (long i = 0; i < nPages / 8; i++)
                {
                    for (int j = 0; j < 8; j++)
                    {
                        ms.CopyTo(file);
                        ms.Position = 0;
                    }
                    file.Flush(fSync);
                    if (((i + 1) * 8) % pageTimingInterval == 0)
                        timings.Add(Report(stopwatch.Elapsed, ref prevElapsed, (i + 1) * 8, ref prevPages, pageSize));
                }
            }
        }
        timings.Add(Report(stopwatch.Elapsed, ref prevElapsed, nPages, ref prevPages, pageSize));
        return timings;
    }

    private static Tuple<long, long, long, long, TimeSpan, TimeSpan, double> Report(TimeSpan elapsed, ref TimeSpan prevElapsed, long curPages, ref long prevPages, long pageSize)
    {
        var intervalPages = curPages - prevPages;
        var intervalElapsed = elapsed - prevElapsed;
        var intervalPageSize = intervalPages * pageSize;
        var mbps = (intervalPageSize / (1024.0 * 1024.0)) / intervalElapsed.TotalSeconds;
        prevElapsed = elapsed;
        prevPages = curPages;
        return Tuple.Create(curPages, intervalPages, curPages * pageSize, intervalPageSize, elapsed, intervalElapsed, mbps);
    }

    private static void CollectGarbage()
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
        System.Threading.Thread.Sleep(200);
        GC.Collect();
        GC.WaitForPendingFinalizers();
        System.Threading.Thread.SpinWait(10);
    }

    [DllImport("kernel32.dll", SetLastError = true)]
    static extern bool FlushViewOfFile(
        IntPtr lpBaseAddress, int dwNumBytesToFlush);

    [DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Auto)]
    static extern bool FlushFileBuffers(SafeFileHandle hFile);
}

The performance results I am obtaining (64-bit Win 7, slow spindle disk) are not very encouraging. It appears that "fsync" performance depends very much on the size of the file being flushed, so that the file size, and not the amount of "dirty" data to be flushed, dominates the time. The graph below shows the results for the 4 different settings of the little benchmark app.

[Graph: benchmark timing for the 4 scenarios]

As you can see, the performance of "fsync" decreases exponentially as the file grows (until, at a few GB, it really grinds to a halt). Also, the disk itself does not seem to be doing a whole lot (i.e. resource monitor shows its active time at just a few percent, and its disk queue as mostly empty).

I had obviously expected "fsync" performance to be quite a bit worse than normal buffered flushes, but I had expected it to be more or less constant and independent of file size. As it is, it seems "fsync" is not usable in combination with a single large file.

Does somebody have an explanation, different experiences, or a different solution that allows ensuring data is on disk with more or less constant, predictable performance?

UPDATE: See new information in the answer below.

Asked Aug 16 '13 by Alex

1 Answer

Your test shows an exponential decrease in speed on the sync runs because you're recreating the file each time. In that case the write is no longer purely sequential - every write also grows the file, which requires multiple seeks to update the file metadata in the filesystem. If you ran all of these jobs using a pre-existing, fully allocated file, you'd see much faster results, because none of those metadata updates would be interfering.
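
For instance, here is a minimal sketch of pre-allocating the test file once (a hypothetical helper, reusing the file name and sizes from the question's benchmark), so that subsequent runs no longer pay for file-growth metadata updates:

using System.IO;

static class Prealloc
{
    public static void PreallocateTestFile(string fileName = "test.data", long nPages = 200 * 1000, long pageSize = 4096)
    {
        var zeros = new byte[pageSize];
        using (var f = new FileStream(fileName, FileMode.Create, FileAccess.Write, FileShare.None, (int)pageSize))
        {
            // Actually writing every page (rather than only calling SetLength) ensures the
            // blocks are allocated and the whole file has been written once.
            for (long i = 0; i < nPages; i++)
                f.Write(zeros, 0, zeros.Length);
            f.Flush(true);
        }
    }
}

After running this once, the benchmark above can be invoked with BenchSequentialWrites(reuseExistingFile: true).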

I ran a similar test on my Linux box. The results while recreating the file each time:

mmap    direct  last    sync    time
0   0   0   0    0.882293s
0   0   0   1    27.050636s
0   0   1   0    0.832495s
0   0   1   1    26.966625s
0   1   0   0    5.775266s
0   1   0   1    22.063392s
0   1   1   0    5.265739s
0   1   1   1    24.203251s
1   0   0   0    1.031684s
1   0   0   1    28.244678s
1   0   1   0    1.031888s
1   0   1   1    29.540660s
1   1   0   0    1.032883s
1   1   0   1    29.408005s
1   1   1   0    1.035110s
1   1   1   1    28.948555s

The results using the pre-existing file (obviously the last_byte case is irrelevant here; also, the very first result still had to create the file):

mmap    direct  last    sync    time
0   0   0   0    1.199310s
0   0   0   1    7.858803s
0   0   1   0    0.184925s
0   0   1   1    8.320572s
0   1   0   0    4.047780s
0   1   0   1    4.066993s
0   1   1   0    4.042564s
0   1   1   1    4.307159s
1   0   0   0    3.596712s
1   0   0   1    8.284428s
1   0   1   0    0.242584s
1   0   1   1    8.070947s
1   1   0   0    0.240500s
1   1   0   1    8.213450s
1   1   1   0    0.240922s
1   1   1   1    8.265024s

(Note that I only used 10,000 chunks, not 25,000, so this only writes 320 MB, on an ext2 filesystem. I didn't have a larger ext2fs handy; my larger fs is XFS, and it refused to allow mmap + direct I/O.)

Here's the code, if you're interested:

#define _GNU_SOURCE 1

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <malloc.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/stat.h>

#define USE_MMAP    8
#define USE_DIRECT  4
#define USE_LAST    2
#define USE_SYNC    1

#define PAGE    4096
#define CHUNK   (8*PAGE)
#define NCHUNKS 10000
#define STATI   1000

#define FSIZE   (NCHUNKS*CHUNK)

int main(void)
{
    int i, j, fd, rc, stc;
    char *data = valloc(CHUNK);
    char *map, *dst;
    char sfname[8];
    struct timeval start, end, stats[NCHUNKS/STATI+1];
    FILE *sfile;

    printf("mmap\tdirect\tlast\tsync\ttime\n");
    for (i=0; i<16; i++) {
        int oflag = O_CREAT|O_RDWR|O_TRUNC;

        if (i & USE_DIRECT)
            oflag |= O_DIRECT;
        fd = open("dummy", oflag, 0666);
        ftruncate(fd, FSIZE);
        if (i & USE_LAST) {
            lseek(fd, 0, SEEK_END);
            write(fd, data, 1);
            lseek(fd, 0, SEEK_SET);
        }
        if (i & USE_MMAP) {
            map = mmap(NULL, FSIZE, PROT_WRITE, MAP_SHARED, fd, 0);
            if (map == (char *)-1L) {
                perror("mmap");
                exit(1);
            }
            dst = map;
        }
        sprintf(sfname, "%x.csv", i);
        sfile = fopen(sfname, "w");
        stc = 1;
        printf("%d\t%d\t%d\t%d\t",
            (i&USE_MMAP)!=0, (i&USE_DIRECT)!=0, (i&USE_LAST)!=0, i&USE_SYNC);
        fflush(stdout);
        gettimeofday(&start, NULL);
        stats[0] = start;
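        /* Write NCHUNKS chunks of CHUNK (8 pages = 32 KiB) each, optionally syncing after every chunk. */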
        for (j = 1; j<=NCHUNKS; j++) {
            if (i & USE_MMAP) {
                memcpy(dst, data, CHUNK);
                if (i & USE_SYNC)
                    msync(dst, CHUNK, MS_SYNC);
                dst += CHUNK;
            } else {
                write(fd, data, CHUNK);
                if (i & USE_SYNC)
                    fdatasync(fd);
            }
            if (!(j % STATI)) {
                gettimeofday(&end, NULL);
                stats[stc++] = end;
            }
        }
        end.tv_usec -= start.tv_usec;
        if (end.tv_usec < 0) {
            end.tv_sec--;
            end.tv_usec += 1000000;
        }
        end.tv_sec -= start.tv_sec;
        printf(" %d.%06ds\n", (int)end.tv_sec, (int)end.tv_usec);
        if (i & USE_MMAP)
            munmap(map, FSIZE);
        close(fd);
        for (j=NCHUNKS/STATI; j>0; j--) {
            stats[j].tv_usec -= stats[j-1].tv_usec;
            if (stats[j].tv_usec < 0) {
                stats[j].tv_sec--;
                stats[j].tv_usec+= 1000000;
            }
            stats[j].tv_sec -= stats[j-1].tv_sec;
        }
        for (j=1; j<=NCHUNKS/STATI; j++)
            fprintf(sfile, "%d\t%d.%06d\n", j*STATI*CHUNK,
                (int)stats[j].tv_sec, (int)stats[j].tv_usec);
        fclose(sfile);
    }
    return 0;
}
Answered by hyc