Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Fastest way to get directory data in .NET [duplicate]

I'm working on a file synchronization service to synchronize files between two folders on different machines. I need to find a very fast way to enumerate a directory and pull the following information from it:

  • A data structure or structures of all file paths and sub-directory paths within this directory, which includes the last write time for each file or sub-directory.
  • For each sub-directory that is found at any level below the current directory, the same as above.

So far, I have come up with this:

static void Main(string[] args)
{
    List<Tuple<string, DateTime>> files = new List<Tuple<string, DateTime>>();
    List<Tuple<string, DateTime>> directories = new List<Tuple<string, DateTime>>();
    Stopwatch watch = new Stopwatch();
    while (true)
    {
        watch.Start();
        while (!CheckFolderRecursiveSingleThreaded("C:\\", out files, out directories))
        {
            // You can assume for all intents and purposes that drive C does exist and that you have access to it, which will cause this sleep to not get called.
            Thread.Sleep(1000);
        }
        watch.Stop();
        Console.WriteLine(watch.ElapsedMilliseconds);
        watch.Reset();
        // Do something with the information.
        Thread.Sleep(1000);
    }
}

static bool CheckFolderRecursiveSingleThreaded(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        DirectoryInfo directoryInformation = new DirectoryInfo(path);
        List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
        foreach (FileInfo file in directoryInformation.GetFiles())
        {
            fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
        }
        List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
        foreach (DirectoryInfo directory in directoryInformation.GetDirectories())
        {
            // Check for the ReparsePoint flag, which will indicate a symbolic link.
            if (!directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
            {
                directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
                List<Tuple<string, DateTime>> directoryFiles;
                List<Tuple<string, DateTime>> directoryFolders;
                if (CheckFolderRecursiveSingleThreaded(directory.FullName, out directoryFiles, out directoryFolders))
                {
                    fileList.AddRange(directoryFiles);
                    directoryList.AddRange(directoryFolders);
                }
            }
        }
        files = fileList;
        directories = directoryList;
        return true;
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }
}

Performance-wise, it takes about 22 seconds (regardless of running in release or debug mode without the debugger attached) to enumerate through my C:\ drive and produce a list of about 549,254 files and 83,235 folders it has access to, but can it be faster? I am open to any suggestions, even MSVC++ suggestions.

Edit: 12 seconds with LINQ's AsParallel because of multithreading (must be tested in Release mode). Note that this parallelizes for all C:\ sub-folders, but recursive calls will be made to the single-threaded implementation I have above, otherwise it would take a REALLY long time to parallelize for all folders all the time!

static bool CheckFolderParallelled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        DirectoryInfo directoryInformation = new DirectoryInfo(path);
        List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
        foreach (FileInfo file in directoryInformation.GetFiles())
        {
            fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
        }
        List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
        directoryInformation.GetDirectories().AsParallel().ForAll(directory =>
        {
            // Check for the ReparsePoint flag, which will indicate a symbolic link.
            if (!directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
            {
                directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
                List<Tuple<string, DateTime>> directoryFiles;
                List<Tuple<string, DateTime>> directoryFolders;
                if (CheckFolderRecursiveSingleThreaded(directory.FullName, out directoryFiles, out directoryFolders))
                {
                    fileList.AddRange(directoryFiles);
                    directoryList.AddRange(directoryFolders);
                }
            }
        });
        files = fileList;
        directories = directoryList;
        return true;
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }
}

Edit: Still about 21 seconds using Alexei's linked solution's accepted answer from Mark Gravell. This non-recursive technique is not the fastest (probably the cost of keeping this Queue datatype alive is just as expensive as the cost of pushing and popping calls to this method on the stack):

static bool CheckFolderNonRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
        List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
        ConcurrentQueue<DirectoryInfo> pendingSearches = new ConcurrentQueue<DirectoryInfo>();
        pendingSearches.Enqueue(new DirectoryInfo(path));
        DirectoryInfo pendingDirectory;
        while (pendingSearches.Count > 0)
        {
            if (pendingSearches.TryDequeue(out pendingDirectory))
            {
                try
                {
                    foreach (FileInfo file in pendingDirectory.GetFiles())
                    {
                        fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
                    }
                    foreach (DirectoryInfo directory in pendingDirectory.GetDirectories())
                    {
                        // Check for the ReparsePoint flag, which will indicate a symbolic link.
                        if (!directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
                        {
                            directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
                            pendingSearches.Enqueue(directory);
                        }
                    }
                }
                catch { } // Ignore directories with no access rights.
            }
        }
        files = fileList;
        directories = directoryList;
        return true;
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }
}

Edit: This question is open-ended towards .NET, because there may be a faster way with MSVC++ libraries like boost, but I have yet to come across a faster method. If anyone can beat my C# method with a faster C drive enumerator in C++ that pulls the same data, first of all kudos to you for doing it faster, second of all I would be really interested to see it, third of all it would help a lot of people out (not just myself). I got this far into boost until I realized the following method took around 200,000 ms, much, much longer than any code I've posted above:

#include "stdafx.h"
#include <iostream>
#include <Windows.h>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/timer.hpp>

namespace fs = boost::filesystem;

bool IterateDirectory(const wchar_t *directory);

int _tmain(int argc, _TCHAR* argv[])
{
    boost::timer timer = boost::timer();
    while (true)
    {
        timer.restart();
        // L makes it wide, since IterateDirectory takes wchar_t.
        // R makes it a raw string literal, which tells the compiler to parse the string as-is, not escape characters and fancy tricks.
        IterateDirectory(LR"(C:\)");
        std::cout << "Elapsed time: " << timer.elapsed() * 1000 << " ms" << std::endl;
        Sleep(1000);
    }
    return 0;
}

// IterateDirectory takes wchar_t because path.c_str() always returns wchar_t whether you are using unicode or multibyte.
bool IterateDirectory(const wchar_t *directory)
{
    if (boost::filesystem::exists(directory))
    {
        fs::directory_iterator it(directory), eod;
        BOOST_FOREACH(fs::path path, std::make_pair(it, eod))
        {
            try
            {
                if (is_regular_file(path))
                {
                    //std::cout << path << ", last write time: " << last_write_time(path) << '.' << std::endl;
                }
                if (is_directory(path))
                {
                    //std::cout << path << ", last write time: " << last_write_time(path) << '.' << std::endl;
                    // path.c_str() always returns wchar_t, whether you are using unicode or multibyte. This is probably because of multi-language support inside of the Windows operating system and file structure.
                    IterateDirectory(path.c_str());
                }
            }
            catch (...) { } // Ignore directories we don't have access to.
        }
        return true;
    }
    return false;
}

Edit: Using PInvoke to FindFirstFile and FindNextFile took about 6 seconds to iterate my entire C drive (thanks to the duplicate link and Sam Saffron's answer). But...can it be faster?

[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);

[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);

[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW {
    public FileAttributes dwFileAttributes;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
    public int nFileSizeHigh;
    public int nFileSizeLow;
    public int dwReserved0;
    public int dwReserved1;
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
    public string cFileName;
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
    public string cAlternateFileName;
}

static IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

static bool FindNextFilePInvokeRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
    List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
    WIN32_FIND_DATAW findData;
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    List<Tuple<string, DateTime>> info = new List<Tuple<string,DateTime>>();
    try
    {
        findHandle = FindFirstFileW(path + @"\*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                if (findData.cFileName == "." || findData.cFileName == "..") continue;
                string fullPath = path + (path.EndsWith("\\") ? String.Empty : "\\") + findData.cFileName;
                // Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
                if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                {
                    directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                    List<Tuple<string, DateTime>> subDirectoryFileList = new List<Tuple<string, DateTime>>();
                    List<Tuple<string, DateTime>> subDirectoryDirectoryList = new List<Tuple<string, DateTime>>();
                    if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
                else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                {
                    fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
            }
            while (FindNextFile(findHandle, out findData));
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
        files = null;
        directories = null;
        return false;
    }
    if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
    files = fileList;
    directories = directoryList;
    return true;
}

public static class FILETIMEExtensions
{
    public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME filetime)
    {
        long highBits = filetime.dwHighDateTime;
        highBits = highBits << 32;
        return DateTime.FromFileTimeUtc(highBits + (long)filetime.dwLowDateTime);
    }
}

Edit: Yes, it can be faster. Using techniques to parallelize the target folder's subdirectory recursions, I can get it to 4 seconds using the above FindNextFilePInvokeRecursive method. That's 4 seconds to iterate my entire C drive with the data I need. I can see in process monitor, I eat up about 30% CPU and only 1% disk at most, which is a little odd to me, not sure why that is at the moment, maybe just this linked list traversal style causes it to be pretty negligible. Ideally it should eat 100% CPU at least, but that might depend on the number and depth of subfolders you parallelize on. But can it be faster?!

static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
    List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
    WIN32_FIND_DATAW findData;
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    List<Tuple<string, DateTime>> info = new List<Tuple<string, DateTime>>();
    try
    {
        findHandle = FindFirstFileW(path + @"\*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                if (findData.cFileName == "." || findData.cFileName == "..") continue;
                string fullPath = path + (path.EndsWith("\\") ? String.Empty : "\\") + findData.cFileName;
                // Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
                if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                {
                    directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
                else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                {
                    fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
            }
            while (FindNextFile(findHandle, out findData));
            directoryList.AsParallel().ForAll(x =>
            {
                List<Tuple<string, DateTime>> subDirectoryFileList = new List<Tuple<string, DateTime>>();
                List<Tuple<string, DateTime>> subDirectoryDirectoryList = new List<Tuple<string, DateTime>>();
                if (FindNextFilePInvokeRecursive(x.Item1, out subDirectoryFileList, out subDirectoryDirectoryList))
                {
                    fileList.AddRange(subDirectoryFileList);
                    directoryList.AddRange(subDirectoryDirectoryList);
                }
            });
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
        files = null;
        directories = null;
        return false;
    }
    if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
    files = fileList;
    directories = directoryList;
    return true;
}

Edit: Forgot to add concurrency locks when using parallels, otherwise you might catch an exception. Also removed tuples and went with a FileInformation/DirectoryInformation class for my purposes. This shaved off .5 seconds. Now 3.5 seconds to enumerate my C: drive.

[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);

[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);

[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW {
    public FileAttributes dwFileAttributes;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
    public int nFileSizeHigh;
    public int nFileSizeLow;
    public int dwReserved0;
    public int dwReserved1;
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
    public string cFileName;
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
    public string cAlternateFileName;
}

static IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

static bool FindNextFilePInvokeRecursive(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
    List<FileInformation> fileList = new List<FileInformation>();
    List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
    WIN32_FIND_DATAW findData;
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    List<Tuple<string, DateTime>> info = new List<Tuple<string, DateTime>>();
    try
    {
        findHandle = FindFirstFileW(path + @"\*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip current directory and parent directory symbols that are returned.
                if (findData.cFileName != "." && findData.cFileName != "..")
                {
                    string fullPath = path + @"\" + findData.cFileName;
                    // Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
                    if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                    {
                        directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                        List<FileInformation> subDirectoryFileList = new List<FileInformation>();
                        List<DirectoryInformation> subDirectoryDirectoryList = new List<DirectoryInformation>();
                        if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                        {
                            fileList.AddRange(subDirectoryFileList);
                            directoryList.AddRange(subDirectoryDirectoryList);
                        }
                    }
                    else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                    {
                        fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                    }
                }
            }
            while (FindNextFile(findHandle, out findData));
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
        files = null;
        directories = null;
        return false;
    }
    if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
    files = fileList;
    directories = directoryList;
    return true;
}

static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
    List<FileInformation> fileList = new List<FileInformation>();
    object fileListLock = new object();
    List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
    object directoryListLock = new object();
    WIN32_FIND_DATAW findData;
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    List<Tuple<string, DateTime>> info = new List<Tuple<string, DateTime>>();
    try
    {
        path = path.EndsWith(@"\") ? path : path + @"\";
        findHandle = FindFirstFileW(path + @"*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip current directory and parent directory symbols that are returned.
                if (findData.cFileName != "." && findData.cFileName != "..")
                {
                    string fullPath = path + findData.cFileName;
                    // Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
                    if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                    {
                        directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                    }
                    else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                    {
                        fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                    }
                }
            }
            while (FindNextFile(findHandle, out findData));
            directoryList.AsParallel().ForAll(x =>
            {
                List<FileInformation> subDirectoryFileList = new List<FileInformation>();
                List<DirectoryInformation> subDirectoryDirectoryList = new List<DirectoryInformation>();
                if (FindNextFilePInvokeRecursive(x.FullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                {
                    lock (fileListLock)
                    {
                        fileList.AddRange(subDirectoryFileList);
                    }
                    lock (directoryListLock)
                    {
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            });
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
        files = null;
        directories = null;
        return false;
    }
    if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
    files = fileList;
    directories = directoryList;
    return true;
}

public class FileInformation
{
    public string FullPath;
    public DateTime LastWriteTime;
}

public class DirectoryInformation
{
    public string FullPath;
    public DateTime LastWriteTime;
}

Edit: B.K. was asking about the conversion to DateTime from FILETIME:

public static class FILETIMEExtensions
{
    public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME time)
    {
        ulong high = (ulong)time.dwHighDateTime;
        ulong low = (ulong)time.dwLowDateTime;
        long fileTime = (long)((high << 32) + low);
        return DateTime.FromFileTimeUtc(fileTime);
    }
}
like image 565
Alexandru Avatar asked Oct 12 '14 02:10

Alexandru


1 Answers

use LINQ and Parallel Tasks

var stuff = dir.GetFiles("*.*", System.IO.SearchOption.AllDirectories);  
Parallel.ForEach(stuff, p=>{ //do things in parallel..  });

//or this 
var q = stuff.AsParallel().Where(x => p(x)).Orderby(x => k(x)).Select(x => f(x));       
foreach (var e in q) a(e);
like image 95
JWP Avatar answered Oct 13 '22 18:10

JWP