Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Faster parsing of numbers on .NET

I have written two functions that convert a string of whitespace-separated integers into an int array. The first function uses Substring and then applies System.Int32.Parse to convert the substring into an int value:

let intsOfString (s: string) =
  let ints = ResizeArray()
  let rec inside i j =
    if j = s.Length then
      ints.Add(s.Substring(i, j-i) |> System.Int32.Parse)
    else
      let c = s.[j]
      if '0' <= c && c <= '9' then
        inside i (j+1)
      else
        ints.Add(s.Substring(i, j-i) |> System.Int32.Parse)
        outside (j+1)
  and outside i =
    if i < s.Length then
      let c = s.[i]
      if '0' <= c && c <= '9' then
        inside i (i+1)
      else
        outside (i+1)
  outside 0
  ints.ToArray()

The second function traverses the characters of the string in-place accumulating the integer without creating a temporary substring:

let intsOfString (s: string) =
  let ints = ResizeArray()
  let rec inside n i =
    if i = s.Length then
      ints.Add n
    else
      let c = s.[i]
      if '0' <= c && c <= '9' then
        inside (10*n + int c - 48) (i+1)
      else
        ints.Add n
        outside(i+1)
  and outside i =
    if i < s.Length then
      let c = s.[i]
      if '0' <= c && c <= '9' then
        inside (int c - 48) (i+1)
      else
        outside (i+1)
  outside 0
  ints.ToArray()

Benchmarking on space-separated integers 1 to 1,000,000, the first version takes 1.5s whereas the second version takes 0.3s.

Parsing such values can be performance critical so leaving 5x performance on the table by using temporary substrings can be undesirable. Parsing integers is easy but parsing other values such as floating point numbers, decimals and dates is considerably harder.

So, are there built-in functions to parse directly from a substring within a string (i.e. using the given start and length of a string) in order to avoid generating a temporary string? If not, are there any libraries that provide efficient functions to do this?

like image 846
J D Avatar asked Jun 28 '12 09:06

J D


2 Answers

System.Int32.Parse is slowlest, because it used CultureInfo, FormatInfo and etc; and performance reason is not in the temporary strings.

Code from reflection:

private unsafe static bool ParseNumber(ref char* str, NumberStyles options, ref Number.NumberBuffer number, NumberFormatInfo numfmt, bool parseDecimal)
{
    number.scale = 0;
    number.sign = false;
    string text = null;
    string text2 = null;
    string str2 = null;
    string str3 = null;
    bool flag = false;
    string str4;
    string str5;
    if ((options & NumberStyles.AllowCurrencySymbol) != NumberStyles.None)
    {
        text = numfmt.CurrencySymbol;
        if (numfmt.ansiCurrencySymbol != null)
        {
            text2 = numfmt.ansiCurrencySymbol;
        }
        str2 = numfmt.NumberDecimalSeparator;
        str3 = numfmt.NumberGroupSeparator;
        str4 = numfmt.CurrencyDecimalSeparator;
        str5 = numfmt.CurrencyGroupSeparator;
        flag = true;
    }
    else
    {
        str4 = numfmt.NumberDecimalSeparator;
        str5 = numfmt.NumberGroupSeparator;
    }
    int num = 0;
    char* ptr = str;
    char c = *ptr;
    while (true)
    {
        if (!Number.IsWhite(c) || (options & NumberStyles.AllowLeadingWhite) == NumberStyles.None || ((num & 1) != 0 && ((num & 1) == 0 || ((num & 32) == 0 && numfmt.numberNegativePattern != 2))))
        {
            bool flag2;
            char* ptr2;
            if ((flag2 = (((options & NumberStyles.AllowLeadingSign) == NumberStyles.None) ? false : ((num & 1) == 0))) && (ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
            {
                num |= 1;
                ptr = ptr2 - (IntPtr)2 / 2;
            }
            else
            {
                if (flag2 && (ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
                {
                    num |= 1;
                    number.sign = true;
                    ptr = ptr2 - (IntPtr)2 / 2;
                }
                else
                {
                    if (c == '(' && (options & NumberStyles.AllowParentheses) != NumberStyles.None && (num & 1) == 0)
                    {
                        num |= 3;
                        number.sign = true;
                    }
                    else
                    {
                        if ((text == null || (ptr2 = Number.MatchChars(ptr, text)) == null) && (text2 == null || (ptr2 = Number.MatchChars(ptr, text2)) == null))
                        {
                            break;
                        }
                        num |= 32;
                        text = null;
                        text2 = null;
                        ptr = ptr2 - (IntPtr)2 / 2;
                    }
                }
            }
        }
        c = *(ptr += (IntPtr)2 / 2);
    }
    int num2 = 0;
    int num3 = 0;
    while (true)
    {
        if ((c >= '0' && c <= '9') || ((options & NumberStyles.AllowHexSpecifier) != NumberStyles.None && ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))))
        {
            num |= 4;
            if (c != '0' || (num & 8) != 0)
            {
                if (num2 < 50)
                {
                    number.digits[(IntPtr)(num2++)] = c;
                    if (c != '0' || parseDecimal)
                    {
                        num3 = num2;
                    }
                }
                if ((num & 16) == 0)
                {
                    number.scale++;
                }
                num |= 8;
            }
            else
            {
                if ((num & 16) != 0)
                {
                    number.scale--;
                }
            }
        }
        else
        {
            char* ptr2;
            if ((options & NumberStyles.AllowDecimalPoint) != NumberStyles.None && (num & 16) == 0 && ((ptr2 = Number.MatchChars(ptr, str4)) != null || (flag && (num & 32) == 0 && (ptr2 = Number.MatchChars(ptr, str2)) != null)))
            {
                num |= 16;
                ptr = ptr2 - (IntPtr)2 / 2;
            }
            else
            {
                if ((options & NumberStyles.AllowThousands) == NumberStyles.None || (num & 4) == 0 || (num & 16) != 0 || ((ptr2 = Number.MatchChars(ptr, str5)) == null && (!flag || (num & 32) != 0 || (ptr2 = Number.MatchChars(ptr, str3)) == null)))
                {
                    break;
                }
                ptr = ptr2 - (IntPtr)2 / 2;
            }
        }
        c = *(ptr += (IntPtr)2 / 2);
    }
    bool flag3 = false;
    number.precision = num3;
    number.digits[(IntPtr)num3] = '\0';
    if ((num & 4) != 0)
    {
        if ((c == 'E' || c == 'e') && (options & NumberStyles.AllowExponent) != NumberStyles.None)
        {
            char* ptr3 = ptr;
            c = *(ptr += (IntPtr)2 / 2);
            char* ptr2;
            if ((ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
            {
                c = *(ptr = ptr2);
            }
            else
            {
                if ((ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
                {
                    c = *(ptr = ptr2);
                    flag3 = true;
                }
            }
            if (c >= '0' && c <= '9')
            {
                int num4 = 0;
                do
                {
                    num4 = num4 * 10 + (int)(c - '0');
                    c = *(ptr += (IntPtr)2 / 2);
                    if (num4 > 1000)
                    {
                        num4 = 9999;
                        while (c >= '0' && c <= '9')
                        {
                            c = *(ptr += (IntPtr)2 / 2);
                        }
                    }
                }
                while (c >= '0' && c <= '9');
                if (flag3)
                {
                    num4 = -num4;
                }
                number.scale += num4;
            }
            else
            {
                ptr = ptr3;
                c = *ptr;
            }
        }
        while (true)
        {
            if (!Number.IsWhite(c) || (options & NumberStyles.AllowTrailingWhite) == NumberStyles.None)
            {
                bool flag2;
                char* ptr2;
                if ((flag2 = (((options & NumberStyles.AllowTrailingSign) == NumberStyles.None) ? false : ((num & 1) == 0))) && (ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
                {
                    num |= 1;
                    ptr = ptr2 - (IntPtr)2 / 2;
                }
                else
                {
                    if (flag2 && (ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
                    {
                        num |= 1;
                        number.sign = true;
                        ptr = ptr2 - (IntPtr)2 / 2;
                    }
                    else
                    {
                        if (c == ')' && (num & 2) != 0)
                        {
                            num &= -3;
                        }
                        else
                        {
                            if ((text == null || (ptr2 = Number.MatchChars(ptr, text)) == null) && (text2 == null || (ptr2 = Number.MatchChars(ptr, text2)) == null))
                            {
                                break;
                            }
                            text = null;
                            text2 = null;
                            ptr = ptr2 - (IntPtr)2 / 2;
                        }
                    }
                }
            }
            c = *(ptr += (IntPtr)2 / 2);
        }
        if ((num & 2) == 0)
        {
            if ((num & 8) == 0)
            {
                if (!parseDecimal)
                {
                    number.scale = 0;
                }
                if ((num & 16) == 0)
                {
                    number.sign = false;
                }
            }
            str = ptr;
            return true;
        }
    }
    str = ptr;
    return false;
}
public static int Parse(string s)
{
    return Number.ParseInt32(s, NumberStyles.Integer, NumberFormatInfo.CurrentInfo);
}

internal unsafe static int ParseInt32(string s, NumberStyles style, NumberFormatInfo info)
{
    byte* stackBuffer = stackalloc byte[1 * 114 / 1];
    Number.NumberBuffer numberBuffer = new Number.NumberBuffer(stackBuffer);
    int result = 0;
    Number.StringToNumber(s, style, ref numberBuffer, info, false);
    if ((style & NumberStyles.AllowHexSpecifier) != NumberStyles.None)
    {
        if (!Number.HexNumberToInt32(ref numberBuffer, ref result))
        {
            throw new OverflowException(Environment.GetResourceString("Overflow_Int32"));
        }
    }
    else
    {
        if (!Number.NumberToInt32(ref numberBuffer, ref result))
        {
            throw new OverflowException(Environment.GetResourceString("Overflow_Int32"));
        }
    }
    return result;
}

private unsafe static void StringToNumber(string str, NumberStyles options, ref Number.NumberBuffer number, NumberFormatInfo info, bool parseDecimal)
{
    if (str == null)
    {
        throw new ArgumentNullException("String");
    }
    fixed (char* ptr = str)
    {
        char* ptr2 = ptr;
        if (!Number.ParseNumber(ref ptr2, options, ref number, info, parseDecimal) || ((ptr2 - ptr / 2) / 2 < str.Length && !Number.TrailingZeros(str, (ptr2 - ptr / 2) / 2)))
        {
            throw new FormatException(Environment.GetResourceString("Format_InvalidString"));
        }
    }
}
like image 101
Serj-Tm Avatar answered Nov 05 '22 07:11

Serj-Tm


I've written this one for doubles, that doesn't create a temporary substring. It's meant to be used inside a JSON parser so it limits itself to how doubles can be represented in JSON according to http://www.json.org/.

It's not optimal yet because it requires you to know where the number begins and ends (begin and end parameters), so you'll have to traverse the length of the number twice to find out where it ends. It's still around 10-15x faster than double.Parse and it could be fairly easily modified that it finds the end inside the function which is then returned as an out parameter to know where you have to resume parsing the main string.

Used like so:

Parsers.TryParseDoubleFastStream("1", 0, 1, out j);
Parsers.TryParseDoubleFastStream("2.0", 0, 3, out j);
Parsers.TryParseDoubleFastStream("3.5", 0, 3, out j);
Parsers.TryParseDoubleFastStream("-4.5", 0, 4, out j);
Parsers.TryParseDoubleFastStream("50.06", 0, 5, out j);
Parsers.TryParseDoubleFastStream("1000.65", 0, 7, out j);
Parsers.TryParseDoubleFastStream("-10000.8600", 0, 11, out j);

Code can be found here:

https://gist.github.com/3010984 (would be too lengthy to post here).

And StandardFunctions.IgnoreChar is for my purpose as simple as:

public static bool IgnoreChar(char c)
{
  return c < 33;
}
like image 5
JulianR Avatar answered Nov 05 '22 08:11

JulianR