I noticed a few questions about finding the nth occurrence of a character in a string. Since I was curious (and have several uses for this in an application, but mainly out of curiosity), I coded and benchmarked two of these methods in Visual Studio 2010, and I'm wondering why method 1 (FindNthOccurrence
) is much slower than method 2 (IndexOfNth
). The only reasons I could think of were:
indexOf
is a in-built .NET method and is therefore already optimisedI'm leaning toward #2, but I'd still like to know. This is the relevant code.
class Program
{
static void Main(string[] args)
{
char searchChar = 'a';
Random r = new Random(UnixTimestamp());
// Generate sample data
int numSearches = 100000, inputLength = 100;
List<String> inputs = new List<String>(numSearches);
List<int> nth = new List<int>(numSearches);
List<int> occurrences = new List<int>(numSearches);
for (int i = 0; i < numSearches; i++)
{
inputs.Add(GenerateRandomString(inputLength, "abcdefghijklmnopqrstuvwxyz"));
nth.Add(r.Next(1, 4));
}
// Timing of FindNthOccurrence
Stopwatch timeFindNth = Stopwatch.StartNew();
for (int i = 0; i < numSearches; i++)
occurrences.Add(FindNthOccurrence(inputs[i], searchChar, nth[i]));
timeFindNth.Stop();
Console.WriteLine(String.Format("FindNthOccurrence: {0} / {1}",
timeFindNth.ElapsedMilliseconds, timeFindNth.ElapsedTicks));
// Cleanup
occurrences.Clear();
// Timing of IndexOfNth
Stopwatch timeIndexOf = Stopwatch.StartNew();
for (int i = 0; i < numSearches; i++)
occurrences.Add(IndexOfNth(inputs[i], searchChar, nth[i]));
timeIndexOf.Stop();
Console.WriteLine(String.Format("IndexOfNth: {0} / {1}",
timeIndexOf.ElapsedMilliseconds, timeIndexOf.ElapsedTicks));
Console.Read();
}
static int FindNthOccurrence(String input, char c, int n)
{
int len = input.Length;
int occurrences = 0;
for (int i = 0; i < len; i++)
{
if (input[i] == c)
{
occurrences++;
if (occurrences == n)
return i;
}
}
return -1;
}
static int IndexOfNth(String input, char c, int n)
{
int occurrence = 0;
int pos = input.IndexOf(c, 0);
while (pos != -1)
{
occurrence++;
if (occurrence == n)
return pos;
pos = input.IndexOf(c, pos + 1);
}
return -1;
}
// Helper methods
static String GenerateRandomString(int length, String legalCharacters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
{
if (length < 0) throw new ArgumentOutOfRangeException("length", "length cannot be less than zero.");
if (string.IsNullOrEmpty(legalCharacters))
throw new ArgumentException("allowedChars may not be empty.");
const int byteSize = 0x100;
var legalCharSet = new HashSet<char>(legalCharacters).ToArray();
if (byteSize < legalCharSet.Length)
throw new ArgumentException(String.Format("allowedChars may contain no more than {0} characters.", byteSize));
// Guid.NewGuid and System.Random are not particularly random. By using a
// cryptographically-secure random number generator, the caller is always
// protected, regardless of use.
using (var rng = new System.Security.Cryptography.RNGCryptoServiceProvider())
{
StringBuilder result = new StringBuilder();
var buf = new byte[128];
while (result.Length < length)
{
rng.GetBytes(buf);
for (var i = 0; i < buf.Length && result.Length < length; ++i)
{
// Divide the byte into legalCharSet-sized groups. If the
// random value falls into the last group and the last group is
// too small to choose from the entire legalCharSet, ignore
// the value in order to avoid biasing the result.
var outOfRangeStart = byteSize - (byteSize % legalCharSet.Length);
if (outOfRangeStart <= buf[i]) continue;
result.Append(legalCharSet[buf[i] % legalCharSet.Length]);
}
}
return result.ToString();
}
}
static int UnixTimestamp()
{
TimeSpan ts = (System.DateTime.UtcNow - new System.DateTime(1970, 1, 1, 0, 0, 0));
return (int)ts.TotalSeconds;
}
}
Every result outputs times similar to this (milliseconds / elapsed ticks):
FindNthOccurrence: 27 / 79716
IndexOfNth: 12 / 36492
Browsing through the source code of System.String
with Reflector, it appears an IndexOf
method is called which is defined as:
public extern int IndexOf(char value, int startIndex, int count);
So it is calling out to some internal unmanaged code, which probably provides the observed speed boost. It is unlikely you will be able to get any faster with managed code.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With