I have written two functions that convert a string of whitespace-separated integers into an int array. The first function uses Substring
and then applies System.Int32.Parse
to convert the substring into an int
value:
let intsOfString (s: string) =
let ints = ResizeArray()
let rec inside i j =
if j = s.Length then
ints.Add(s.Substring(i, j-i) |> System.Int32.Parse)
else
let c = s.[j]
if '0' <= c && c <= '9' then
inside i (j+1)
else
ints.Add(s.Substring(i, j-i) |> System.Int32.Parse)
outside (j+1)
and outside i =
if i < s.Length then
let c = s.[i]
if '0' <= c && c <= '9' then
inside i (i+1)
else
outside (i+1)
outside 0
ints.ToArray()
The second function traverses the characters of the string in-place accumulating the integer without creating a temporary substring:
let intsOfString (s: string) =
let ints = ResizeArray()
let rec inside n i =
if i = s.Length then
ints.Add n
else
let c = s.[i]
if '0' <= c && c <= '9' then
inside (10*n + int c - 48) (i+1)
else
ints.Add n
outside(i+1)
and outside i =
if i < s.Length then
let c = s.[i]
if '0' <= c && c <= '9' then
inside (int c - 48) (i+1)
else
outside (i+1)
outside 0
ints.ToArray()
Benchmarking on space-separated integers 1 to 1,000,000, the first version takes 1.5s whereas the second version takes 0.3s.
Parsing such values can be performance critical so leaving 5x performance on the table by using temporary substrings can be undesirable. Parsing integers is easy but parsing other values such as floating point numbers, decimals and dates is considerably harder.
So, are there built-in functions to parse directly from a substring within a string (i.e. using the given start and length of a string) in order to avoid generating a temporary string? If not, are there any libraries that provide efficient functions to do this?
System.Int32.Parse
is slowlest, because it used CultureInfo
, FormatInfo
and etc; and performance reason is not in the temporary strings.
Code from reflection:
private unsafe static bool ParseNumber(ref char* str, NumberStyles options, ref Number.NumberBuffer number, NumberFormatInfo numfmt, bool parseDecimal)
{
number.scale = 0;
number.sign = false;
string text = null;
string text2 = null;
string str2 = null;
string str3 = null;
bool flag = false;
string str4;
string str5;
if ((options & NumberStyles.AllowCurrencySymbol) != NumberStyles.None)
{
text = numfmt.CurrencySymbol;
if (numfmt.ansiCurrencySymbol != null)
{
text2 = numfmt.ansiCurrencySymbol;
}
str2 = numfmt.NumberDecimalSeparator;
str3 = numfmt.NumberGroupSeparator;
str4 = numfmt.CurrencyDecimalSeparator;
str5 = numfmt.CurrencyGroupSeparator;
flag = true;
}
else
{
str4 = numfmt.NumberDecimalSeparator;
str5 = numfmt.NumberGroupSeparator;
}
int num = 0;
char* ptr = str;
char c = *ptr;
while (true)
{
if (!Number.IsWhite(c) || (options & NumberStyles.AllowLeadingWhite) == NumberStyles.None || ((num & 1) != 0 && ((num & 1) == 0 || ((num & 32) == 0 && numfmt.numberNegativePattern != 2))))
{
bool flag2;
char* ptr2;
if ((flag2 = (((options & NumberStyles.AllowLeadingSign) == NumberStyles.None) ? false : ((num & 1) == 0))) && (ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
{
num |= 1;
ptr = ptr2 - (IntPtr)2 / 2;
}
else
{
if (flag2 && (ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
{
num |= 1;
number.sign = true;
ptr = ptr2 - (IntPtr)2 / 2;
}
else
{
if (c == '(' && (options & NumberStyles.AllowParentheses) != NumberStyles.None && (num & 1) == 0)
{
num |= 3;
number.sign = true;
}
else
{
if ((text == null || (ptr2 = Number.MatchChars(ptr, text)) == null) && (text2 == null || (ptr2 = Number.MatchChars(ptr, text2)) == null))
{
break;
}
num |= 32;
text = null;
text2 = null;
ptr = ptr2 - (IntPtr)2 / 2;
}
}
}
}
c = *(ptr += (IntPtr)2 / 2);
}
int num2 = 0;
int num3 = 0;
while (true)
{
if ((c >= '0' && c <= '9') || ((options & NumberStyles.AllowHexSpecifier) != NumberStyles.None && ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))))
{
num |= 4;
if (c != '0' || (num & 8) != 0)
{
if (num2 < 50)
{
number.digits[(IntPtr)(num2++)] = c;
if (c != '0' || parseDecimal)
{
num3 = num2;
}
}
if ((num & 16) == 0)
{
number.scale++;
}
num |= 8;
}
else
{
if ((num & 16) != 0)
{
number.scale--;
}
}
}
else
{
char* ptr2;
if ((options & NumberStyles.AllowDecimalPoint) != NumberStyles.None && (num & 16) == 0 && ((ptr2 = Number.MatchChars(ptr, str4)) != null || (flag && (num & 32) == 0 && (ptr2 = Number.MatchChars(ptr, str2)) != null)))
{
num |= 16;
ptr = ptr2 - (IntPtr)2 / 2;
}
else
{
if ((options & NumberStyles.AllowThousands) == NumberStyles.None || (num & 4) == 0 || (num & 16) != 0 || ((ptr2 = Number.MatchChars(ptr, str5)) == null && (!flag || (num & 32) != 0 || (ptr2 = Number.MatchChars(ptr, str3)) == null)))
{
break;
}
ptr = ptr2 - (IntPtr)2 / 2;
}
}
c = *(ptr += (IntPtr)2 / 2);
}
bool flag3 = false;
number.precision = num3;
number.digits[(IntPtr)num3] = '\0';
if ((num & 4) != 0)
{
if ((c == 'E' || c == 'e') && (options & NumberStyles.AllowExponent) != NumberStyles.None)
{
char* ptr3 = ptr;
c = *(ptr += (IntPtr)2 / 2);
char* ptr2;
if ((ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
{
c = *(ptr = ptr2);
}
else
{
if ((ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
{
c = *(ptr = ptr2);
flag3 = true;
}
}
if (c >= '0' && c <= '9')
{
int num4 = 0;
do
{
num4 = num4 * 10 + (int)(c - '0');
c = *(ptr += (IntPtr)2 / 2);
if (num4 > 1000)
{
num4 = 9999;
while (c >= '0' && c <= '9')
{
c = *(ptr += (IntPtr)2 / 2);
}
}
}
while (c >= '0' && c <= '9');
if (flag3)
{
num4 = -num4;
}
number.scale += num4;
}
else
{
ptr = ptr3;
c = *ptr;
}
}
while (true)
{
if (!Number.IsWhite(c) || (options & NumberStyles.AllowTrailingWhite) == NumberStyles.None)
{
bool flag2;
char* ptr2;
if ((flag2 = (((options & NumberStyles.AllowTrailingSign) == NumberStyles.None) ? false : ((num & 1) == 0))) && (ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
{
num |= 1;
ptr = ptr2 - (IntPtr)2 / 2;
}
else
{
if (flag2 && (ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
{
num |= 1;
number.sign = true;
ptr = ptr2 - (IntPtr)2 / 2;
}
else
{
if (c == ')' && (num & 2) != 0)
{
num &= -3;
}
else
{
if ((text == null || (ptr2 = Number.MatchChars(ptr, text)) == null) && (text2 == null || (ptr2 = Number.MatchChars(ptr, text2)) == null))
{
break;
}
text = null;
text2 = null;
ptr = ptr2 - (IntPtr)2 / 2;
}
}
}
}
c = *(ptr += (IntPtr)2 / 2);
}
if ((num & 2) == 0)
{
if ((num & 8) == 0)
{
if (!parseDecimal)
{
number.scale = 0;
}
if ((num & 16) == 0)
{
number.sign = false;
}
}
str = ptr;
return true;
}
}
str = ptr;
return false;
}
public static int Parse(string s)
{
return Number.ParseInt32(s, NumberStyles.Integer, NumberFormatInfo.CurrentInfo);
}
internal unsafe static int ParseInt32(string s, NumberStyles style, NumberFormatInfo info)
{
byte* stackBuffer = stackalloc byte[1 * 114 / 1];
Number.NumberBuffer numberBuffer = new Number.NumberBuffer(stackBuffer);
int result = 0;
Number.StringToNumber(s, style, ref numberBuffer, info, false);
if ((style & NumberStyles.AllowHexSpecifier) != NumberStyles.None)
{
if (!Number.HexNumberToInt32(ref numberBuffer, ref result))
{
throw new OverflowException(Environment.GetResourceString("Overflow_Int32"));
}
}
else
{
if (!Number.NumberToInt32(ref numberBuffer, ref result))
{
throw new OverflowException(Environment.GetResourceString("Overflow_Int32"));
}
}
return result;
}
private unsafe static void StringToNumber(string str, NumberStyles options, ref Number.NumberBuffer number, NumberFormatInfo info, bool parseDecimal)
{
if (str == null)
{
throw new ArgumentNullException("String");
}
fixed (char* ptr = str)
{
char* ptr2 = ptr;
if (!Number.ParseNumber(ref ptr2, options, ref number, info, parseDecimal) || ((ptr2 - ptr / 2) / 2 < str.Length && !Number.TrailingZeros(str, (ptr2 - ptr / 2) / 2)))
{
throw new FormatException(Environment.GetResourceString("Format_InvalidString"));
}
}
}
I've written this one for doubles, that doesn't create a temporary substring. It's meant to be used inside a JSON parser so it limits itself to how doubles can be represented in JSON according to http://www.json.org/.
It's not optimal yet because it requires you to know where the number begins and ends (begin
and end
parameters), so you'll have to traverse the length of the number twice to find out where it ends. It's still around 10-15x faster than double.Parse
and it could be fairly easily modified that it finds the end
inside the function which is then returned as an out
parameter to know where you have to resume parsing the main string.
Used like so:
Parsers.TryParseDoubleFastStream("1", 0, 1, out j);
Parsers.TryParseDoubleFastStream("2.0", 0, 3, out j);
Parsers.TryParseDoubleFastStream("3.5", 0, 3, out j);
Parsers.TryParseDoubleFastStream("-4.5", 0, 4, out j);
Parsers.TryParseDoubleFastStream("50.06", 0, 5, out j);
Parsers.TryParseDoubleFastStream("1000.65", 0, 7, out j);
Parsers.TryParseDoubleFastStream("-10000.8600", 0, 11, out j);
Code can be found here:
https://gist.github.com/3010984 (would be too lengthy to post here).
And StandardFunctions.IgnoreChar
is for my purpose as simple as:
public static bool IgnoreChar(char c)
{
return c < 33;
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With