I am making a tool using C# that iterates through a large file directory and extracts certain information. The directory is organised by language (LCID), so I want to use multithreading to go through the directory, with one thread per language folder.
My code currently scans through a small number of the files and extracts the required data without multithreading, but on a large scale it will take too long.
I set up a thread within my loop that gets the LCID folders, but got the following error: "No overload for 'HBscan' matches delegate 'System.Threading.ThreadStart'". From what I read online, I then put my method within a class so I could have parameters, and now there are no errors, but the code is not iterating through the files properly. It is leaving files out of its scan.
I was wondering if anyone could see where I was going wrong with my code that's making it not perform properly? Thanks.
/// <summary>
/// Entry point: starts one scanner thread for every folder found directly under the root
/// directory (one folder per LCID) and waits for all of them to finish.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    //change rootDirectory variable to point to directory which you wish to scan through
    string rootDirectory = @"C:\sample";

    //every immediate sub-folder of the root is treated as one LCID folder
    string[] lcidFolders = Directory.GetDirectories(rootDirectory);
    Thread[] workers = new Thread[lcidFolders.Length];

    for (int i = 0; i < lcidFolders.Length; i++)
    {
        //the last path segment is the folder name, i.e. the LCID
        string lcid = Path.GetFileName(lcidFolders[i]);
        Console.WriteLine(lcid);

        //the scanner instance carries the folder, so the parameterless HBscan fits ThreadStart
        HBScanner scanner = new HBScanner(new DirectoryInfo(lcidFolders[i]));
        workers[i] = new Thread(new ThreadStart(scanner.HBscan));
        workers[i].Start();
    }

    Console.WriteLine("Scanning through files...");

    //block until every folder has been scanned so Main only returns once all work is done
    foreach (Thread worker in workers)
    {
        worker.Join();
    }
}
/// <summary>
/// Recursively walks a directory tree and writes one <c>Item</c> to the console for every
/// file it finds. An instance remembers its start directory so that the parameterless
/// <see cref="HBscan()"/> can be used as a thread entry point.
/// </summary>
public class HBScanner
{
    //asset id in a file name: two capital letters followed by 8-10 digits, directly before a dot
    private static readonly Regex AssetIdPattern = new Regex(@"([A-Z][A-Z][0-9]{8,10})\.");

    //LCID in a full path: the 4-5 digit run that directly follows "sample\"
    private static readonly Regex LcidPattern = new Regex(@"sample\\(\d{4,5})");

    private DirectoryInfo DirectoryToScan { get; set; }

    /// <summary>Creates a scanner that will start at <paramref name="startDir"/>.</summary>
    /// <param name="startDir">Directory at which the scan begins.</param>
    public HBScanner(DirectoryInfo startDir)
    {
        DirectoryToScan = startDir;
    }

    /// <summary>Scans the directory given to the constructor, including all sub-folders.</summary>
    public void HBscan()
    {
        HBscan(DirectoryToScan);
    }

    /// <summary>
    /// Writes an <c>Item</c> to the console for every file in <paramref name="directoryToScan"/>,
    /// then recurses into each of its sub-folders.
    /// </summary>
    /// <param name="directoryToScan">Directory whose files and sub-folders are processed.</param>
    public static void HBscan(DirectoryInfo directoryToScan)
    {
        //iterate through the directory and get file details
        foreach (FileInfo file in directoryToScan.GetFiles("*.*"))
        {
            //first check the file name for asset id using regular expression
            string asset = AssetIdPattern.Match(file.Name).Groups[1].Value;

            //get LCID from the file path using regular expression
            string lcid = LcidPattern.Match(file.FullName).Groups[1].Value;

            //if it can't find it from filename, it looks into xml
            //(extension compared case-insensitively so ".XML" files are inspected too)
            if (asset == "" && string.Equals(file.Extension, ".xml", StringComparison.OrdinalIgnoreCase))
            {
                System.Diagnostics.Debug.WriteLine("File is an .XML");
                System.Diagnostics.Debug.WriteLine("file.FullName is: " + file.FullName);
                asset = ReadAssetIdFromXml(file.FullName);
            }

            Item newFile = new Item
            {
                AssetID = asset,
                LCID = lcid,
                LastModifiedDate = file.LastWriteTime,
                Path = file.FullName,
                FileName = file.Name
            };
            Console.WriteLine(newFile);
        }

        //get sub-folders for the current directory
        foreach (DirectoryInfo dir in directoryToScan.GetDirectories("*.*"))
        {
            HBscan(dir);
        }
    }

    /// <summary>
    /// Loads the XML file at <paramref name="path"/> and returns the text of its first
    /// &lt;assetid&gt; element, or else the AssetId attribute of its first &lt;Asset&gt; element.
    /// </summary>
    /// <param name="path">Full path of the XML file to load.</param>
    /// <returns>The asset id, or an empty string when the document contains neither.</returns>
    private static string ReadAssetIdFromXml(string path)
    {
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.Load(path);

        //check for <assetid> element; the indexer yields null when there is no match
        XmlNode assetIdNode = xmlDoc.GetElementsByTagName("assetid")[0];
        if (assetIdNode != null)
        {
            return assetIdNode.InnerText;
        }

        //check for <Asset> element and, if present, its AssetId attribute
        XmlNode assetNode = xmlDoc.GetElementsByTagName("Asset")[0];
        if (assetNode != null && assetNode.Attributes != null)
        {
            //only read Value after confirming the attribute exists; dereferencing it
            //unconditionally throws NullReferenceException and aborts the whole scan thread
            XmlAttribute assetIdAttribute = assetNode.Attributes["AssetId"];
            if (assetIdAttribute != null)
            {
                return assetIdAttribute.Value;
            }
        }

        return "";
    }
}
I haven't checked, but I think this could work.
The code will create one scanner per thread and perform the HBscan method.
/// <summary>
/// Entry point: starts one thread for every folder found directly under the root directory
/// (one folder per LCID); each thread creates its own scanner and runs it. Waits for all
/// threads to finish before returning.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    //change rootDirectory variable to point to directory which you wish to scan through
    string rootDirectory = @"C:\sample";

    //every immediate sub-folder of the root is treated as one LCID folder
    string[] lcidFolders = Directory.GetDirectories(rootDirectory);
    Thread[] workers = new Thread[lcidFolders.Length];

    for (int i = 0; i < lcidFolders.Length; i++)
    {
        //copy the path into a per-iteration local: the lambda below runs later on another
        //thread, and capturing the loop counter itself would let it read a different
        //(or out-of-range) index once the loop has moved on
        string folder = lcidFolders[i];

        //the last path segment is the folder name, i.e. the LCID
        string lcid = Path.GetFileName(folder);
        Console.WriteLine(lcid);

        workers[i] = new Thread(() => new HBScanner(new DirectoryInfo(folder)).HBscan());
        workers[i].Start();
    }

    Console.WriteLine("Scanning through files...");

    //block until every folder has been scanned so Main only returns once all work is done
    foreach (Thread worker in workers)
    {
        worker.Join();
    }
}
/// <summary>
/// Recursively walks a directory tree and writes one <c>Item</c> to the console for every
/// file it finds. An instance remembers its start directory so that the parameterless
/// <see cref="HBscan()"/> can be run on a worker thread.
/// </summary>
public class HBScanner
{
    //asset id in a file name: two capital letters followed by 8-10 digits, directly before a dot
    private static readonly Regex AssetIdPattern = new Regex(@"([A-Z][A-Z][0-9]{8,10})\.");

    //LCID in a full path: the 4-5 digit run that directly follows "sample\"
    private static readonly Regex LcidPattern = new Regex(@"sample\\(\d{4,5})");

    private DirectoryInfo DirectoryToScan { get; set; }

    /// <summary>Creates a scanner that will start at <paramref name="startDir"/>.</summary>
    /// <param name="startDir">Directory at which the scan begins.</param>
    public HBScanner(DirectoryInfo startDir)
    {
        DirectoryToScan = startDir;
    }

    /// <summary>Scans the directory given to the constructor, including all sub-folders.</summary>
    public void HBscan()
    {
        HBscan(DirectoryToScan);
    }

    /// <summary>
    /// Writes an <c>Item</c> to the console for every file in <paramref name="directoryToScan"/>,
    /// then recurses into each of its sub-folders.
    /// </summary>
    /// <param name="directoryToScan">Directory whose files and sub-folders are processed.</param>
    public static void HBscan(DirectoryInfo directoryToScan)
    {
        //iterate through the directory and get file details
        foreach (FileInfo file in directoryToScan.GetFiles("*.*"))
        {
            //first check the file name for asset id using regular expression
            string asset = AssetIdPattern.Match(file.Name).Groups[1].Value;

            //get LCID from the file path using regular expression
            string lcid = LcidPattern.Match(file.FullName).Groups[1].Value;

            //if it can't find it from filename, it looks into xml
            //(extension compared case-insensitively so ".XML" files are inspected too)
            if (asset == "" && string.Equals(file.Extension, ".xml", StringComparison.OrdinalIgnoreCase))
            {
                System.Diagnostics.Debug.WriteLine("File is an .XML");
                System.Diagnostics.Debug.WriteLine("file.FullName is: " + file.FullName);
                asset = ReadAssetIdFromXml(file.FullName);
            }

            Item newFile = new Item
            {
                AssetID = asset,
                LCID = lcid,
                LastModifiedDate = file.LastWriteTime,
                Path = file.FullName,
                FileName = file.Name
            };
            Console.WriteLine(newFile);
        }

        //get sub-folders for the current directory
        foreach (DirectoryInfo dir in directoryToScan.GetDirectories("*.*"))
        {
            HBscan(dir);
        }
    }

    /// <summary>
    /// Loads the XML file at <paramref name="path"/> and returns the text of its first
    /// &lt;assetid&gt; element, or else the AssetId attribute of its first &lt;Asset&gt; element.
    /// </summary>
    /// <param name="path">Full path of the XML file to load.</param>
    /// <returns>The asset id, or an empty string when the document contains neither.</returns>
    private static string ReadAssetIdFromXml(string path)
    {
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.Load(path);

        //check for <assetid> element; the indexer yields null when there is no match
        XmlNode assetIdNode = xmlDoc.GetElementsByTagName("assetid")[0];
        if (assetIdNode != null)
        {
            return assetIdNode.InnerText;
        }

        //check for <Asset> element and, if present, its AssetId attribute
        XmlNode assetNode = xmlDoc.GetElementsByTagName("Asset")[0];
        if (assetNode != null && assetNode.Attributes != null)
        {
            //only read Value after confirming the attribute exists; dereferencing it
            //unconditionally throws NullReferenceException and aborts the whole scan thread
            XmlAttribute assetIdAttribute = assetNode.Attributes["AssetId"];
            if (assetIdAttribute != null)
            {
                return assetIdAttribute.Value;
            }
        }

        return "";
    }
}
If you are using .NET 4.0, you could use the Task Parallel Library (TPL): Parallel.For/Parallel.ForEach make it fairly easy to work on multiple items at the same time.
I only started using it a few days ago, and it's very interesting. It can give you great performance by using multiple threads on different cores to speed up your work. Of course, the gain might be limited in your case due to excessive I/O access.
But it may be worth a try! (And altering your current source to check it out is fairly easily done.)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With