23

我编写了两个函数,它们将一串以空格分隔的整数转换为一个 int 数组。第一个函数先用 Substring 提取子字符串,然后用 System.Int32.Parse 将该子字符串转换为 int 值:

/// Extracts every maximal run of ASCII digits from `s` and parses each run with
/// System.Int32.Parse, allocating one temporary substring per number.
/// Any non-digit character (including '-') acts as a separator.
let intsOfString (s: string) =
  let isDigit (c: char) = c >= '0' && c <= '9'
  let result = ResizeArray<int>()
  let mutable pos = 0
  while pos < s.Length do
    if isDigit s.[pos] then
      // Remember where the run starts, then advance to the first non-digit (or the end).
      let start = pos
      while pos < s.Length && isDigit s.[pos] do
        pos <- pos + 1
      result.Add(System.Int32.Parse(s.Substring(start, pos - start)))
    else
      pos <- pos + 1
  result.ToArray()

第二个函数遍历字符串的字符就地累加整数而不创建临时子字符串:

/// Extracts every maximal run of ASCII digits from `s`, accumulating each value
/// digit by digit so that no temporary substring is created.
/// Any non-digit character (including '-') acts as a separator.
let intsOfString (s: string) =
  let result = ResizeArray<int>()
  let mutable acc = 0
  let mutable inNumber = false
  for i in 0 .. s.Length - 1 do
    let c = s.[i]
    if c >= '0' && c <= '9' then
      // The first digit of a run starts a fresh accumulator; later digits shift it by ten.
      acc <- (if inNumber then 10 * acc else 0) + int c - int '0'
      inNumber <- true
    elif inNumber then
      // A non-digit terminates the current run.
      result.Add acc
      inNumber <- false
  // Flush a run that extends to the end of the string.
  if inNumber then result.Add acc
  result.ToArray()

以空格分隔的整数 1 到 1,000,000 为基准,第一个版本需要 1.5 秒,而第二个版本需要 0.3 秒。

解析此类值可能对性能至关重要,因此仅仅因为使用临时子字符串就白白损失 5 倍的性能可能是不可取的。解析整数很容易,但解析浮点数、小数和日期等其他值要困难得多。

那么,是否有内置函数可以直接从字符串中的子字符串解析(即使用给定的字符串开头和长度)以避免生成临时字符串?如果没有,是否有任何库提供有效的功能来做到这一点?

4

5 回答 5

8

System.Int32.Parse 之所以最慢,是因为它使用了 CultureInfo、FormatInfo 等;性能瓶颈并不在临时字符串上。

以下是通过反编译工具(Reflector)得到的代码:

// Decompiled number scanner. Walks the characters starting at `str`, records the sign,
// digits, scale and exponent into `number`, and writes the final read position back to `str`.
// Returns true only when at least one digit was consumed (flag 4) and no opening
// parenthesis is left unmatched (flag 2); otherwise returns false.
//
// The local `num` is a bit set describing what has been seen so far:
//    1 = a sign (or opening parenthesis) was matched
//    2 = an opening parenthesis is awaiting its ')'
//    4 = at least one digit was seen
//    8 = a digit other than a leading '0' was seen
//   16 = a decimal separator was matched
//   32 = a currency symbol was matched
//
// NOTE(review): expressions such as `(IntPtr)2 / 2` look like decompiler output for
// "advance by one char"; confirm against the real framework source before relying on them.
private unsafe static bool ParseNumber(ref char* str, NumberStyles options, ref Number.NumberBuffer number, NumberFormatInfo numfmt, bool parseDecimal)
{
    number.scale = 0;
    number.sign = false;
    // text/text2: currency symbols to match; str2/str3: number separators used as an
    // alternative when currency is allowed; str4/str5: primary decimal/group separators.
    string text = null;
    string text2 = null;
    string str2 = null;
    string str3 = null;
    bool flag = false;
    string str4;
    string str5;
    if ((options & NumberStyles.AllowCurrencySymbol) != NumberStyles.None)
    {
        text = numfmt.CurrencySymbol;
        if (numfmt.ansiCurrencySymbol != null)
        {
            text2 = numfmt.ansiCurrencySymbol;
        }
        str2 = numfmt.NumberDecimalSeparator;
        str3 = numfmt.NumberGroupSeparator;
        str4 = numfmt.CurrencyDecimalSeparator;
        str5 = numfmt.CurrencyGroupSeparator;
        flag = true;
    }
    else
    {
        str4 = numfmt.NumberDecimalSeparator;
        str5 = numfmt.NumberGroupSeparator;
    }
    int num = 0;
    char* ptr = str;
    char c = *ptr;
    // Phase 1: leading white space, sign, opening parenthesis and currency symbol.
    // The loop exits (break) at the first character that is none of those.
    while (true)
    {
        if (!Number.IsWhite(c) || (options & NumberStyles.AllowLeadingWhite) == NumberStyles.None || ((num & 1) != 0 && ((num & 1) == 0 || ((num & 32) == 0 && numfmt.numberNegativePattern != 2))))
        {
            bool flag2;
            char* ptr2;
            if ((flag2 = (((options & NumberStyles.AllowLeadingSign) == NumberStyles.None) ? false : ((num & 1) == 0))) && (ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
            {
                num |= 1;
                ptr = ptr2 - (IntPtr)2 / 2;
            }
            else
            {
                if (flag2 && (ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
                {
                    num |= 1;
                    number.sign = true;
                    ptr = ptr2 - (IntPtr)2 / 2;
                }
                else
                {
                    if (c == '(' && (options & NumberStyles.AllowParentheses) != NumberStyles.None && (num & 1) == 0)
                    {
                        // 3 = sign seen (1) + parenthesis pending (2); parentheses mark a negative value.
                        num |= 3;
                        number.sign = true;
                    }
                    else
                    {
                        if ((text == null || (ptr2 = Number.MatchChars(ptr, text)) == null) && (text2 == null || (ptr2 = Number.MatchChars(ptr, text2)) == null))
                        {
                            break;
                        }
                        // Currency symbol matched: clear both candidates so it cannot match a second time.
                        num |= 32;
                        text = null;
                        text2 = null;
                        ptr = ptr2 - (IntPtr)2 / 2;
                    }
                }
            }
        }
        c = *(ptr += (IntPtr)2 / 2);
    }
    // num2 counts digits stored in number.digits; num3 later becomes number.precision.
    int num2 = 0;
    int num3 = 0;
    // Phase 2: digits (hex letters too when AllowHexSpecifier is set), decimal and group separators.
    while (true)
    {
        if ((c >= '0' && c <= '9') || ((options & NumberStyles.AllowHexSpecifier) != NumberStyles.None && ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))))
        {
            num |= 4;
            if (c != '0' || (num & 8) != 0)
            {
                // At most 50 digits are stored; further digits still adjust the scale below.
                if (num2 < 50)
                {
                    number.digits[(IntPtr)(num2++)] = c;
                    if (c != '0' || parseDecimal)
                    {
                        num3 = num2;
                    }
                }
                // Digits before the decimal separator increase the scale.
                if ((num & 16) == 0)
                {
                    number.scale++;
                }
                num |= 8;
            }
            else
            {
                // A '0' seen before any other digit: only affects the scale when it follows the decimal separator.
                if ((num & 16) != 0)
                {
                    number.scale--;
                }
            }
        }
        else
        {
            char* ptr2;
            if ((options & NumberStyles.AllowDecimalPoint) != NumberStyles.None && (num & 16) == 0 && ((ptr2 = Number.MatchChars(ptr, str4)) != null || (flag && (num & 32) == 0 && (ptr2 = Number.MatchChars(ptr, str2)) != null)))
            {
                num |= 16;
                ptr = ptr2 - (IntPtr)2 / 2;
            }
            else
            {
                // Group separators are accepted only after a digit and before the decimal separator.
                if ((options & NumberStyles.AllowThousands) == NumberStyles.None || (num & 4) == 0 || (num & 16) != 0 || ((ptr2 = Number.MatchChars(ptr, str5)) == null && (!flag || (num & 32) != 0 || (ptr2 = Number.MatchChars(ptr, str3)) == null)))
                {
                    break;
                }
                ptr = ptr2 - (IntPtr)2 / 2;
            }
        }
        c = *(ptr += (IntPtr)2 / 2);
    }
    // flag3 records a negative exponent sign.
    bool flag3 = false;
    number.precision = num3;
    number.digits[(IntPtr)num3] = '\0';
    if ((num & 4) != 0)
    {
        // Phase 3: optional exponent.
        if ((c == 'E' || c == 'e') && (options & NumberStyles.AllowExponent) != NumberStyles.None)
        {
            // Remember the position of the 'E' so it can be restored if no exponent digits follow.
            char* ptr3 = ptr;
            c = *(ptr += (IntPtr)2 / 2);
            char* ptr2;
            if ((ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
            {
                c = *(ptr = ptr2);
            }
            else
            {
                if ((ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
                {
                    c = *(ptr = ptr2);
                    flag3 = true;
                }
            }
            if (c >= '0' && c <= '9')
            {
                int num4 = 0;
                do
                {
                    num4 = num4 * 10 + (int)(c - '0');
                    c = *(ptr += (IntPtr)2 / 2);
                    // Once the exponent exceeds 1000 it is pinned to 9999 and the remaining digits are skipped.
                    if (num4 > 1000)
                    {
                        num4 = 9999;
                        while (c >= '0' && c <= '9')
                        {
                            c = *(ptr += (IntPtr)2 / 2);
                        }
                    }
                }
                while (c >= '0' && c <= '9');
                if (flag3)
                {
                    num4 = -num4;
                }
                number.scale += num4;
            }
            else
            {
                ptr = ptr3;
                c = *ptr;
            }
        }
        // Phase 4: trailing white space, sign, closing parenthesis and currency symbol.
        while (true)
        {
            if (!Number.IsWhite(c) || (options & NumberStyles.AllowTrailingWhite) == NumberStyles.None)
            {
                bool flag2;
                char* ptr2;
                if ((flag2 = (((options & NumberStyles.AllowTrailingSign) == NumberStyles.None) ? false : ((num & 1) == 0))) && (ptr2 = Number.MatchChars(ptr, numfmt.positiveSign)) != null)
                {
                    num |= 1;
                    ptr = ptr2 - (IntPtr)2 / 2;
                }
                else
                {
                    if (flag2 && (ptr2 = Number.MatchChars(ptr, numfmt.negativeSign)) != null)
                    {
                        num |= 1;
                        number.sign = true;
                        ptr = ptr2 - (IntPtr)2 / 2;
                    }
                    else
                    {
                        if (c == ')' && (num & 2) != 0)
                        {
                            // -3 is ~2: clears the "parenthesis pending" flag.
                            num &= -3;
                        }
                        else
                        {
                            if ((text == null || (ptr2 = Number.MatchChars(ptr, text)) == null) && (text2 == null || (ptr2 = Number.MatchChars(ptr, text2)) == null))
                            {
                                break;
                            }
                            text = null;
                            text2 = null;
                            ptr = ptr2 - (IntPtr)2 / 2;
                        }
                    }
                }
            }
            c = *(ptr += (IntPtr)2 / 2);
        }
        // Success requires that any opening parenthesis was closed.
        if ((num & 2) == 0)
        {
            // Only zeros were seen: reset the scale (unless parsing a decimal) and drop the
            // sign when no decimal separator was present.
            if ((num & 8) == 0)
            {
                if (!parseDecimal)
                {
                    number.scale = 0;
                }
                if ((num & 16) == 0)
                {
                    number.sign = false;
                }
            }
            str = ptr;
            return true;
        }
    }
    str = ptr;
    return false;
}
/// <summary>
/// Parses <paramref name="s"/> as a 32-bit integer by delegating to Number.ParseInt32
/// with NumberStyles.Integer and the current culture's NumberFormatInfo.
/// </summary>
/// <param name="s">The text to parse.</param>
/// <returns>The value returned by Number.ParseInt32.</returns>
public static int Parse(string s)
{
    NumberFormatInfo currentFormat = NumberFormatInfo.CurrentInfo;
    return Number.ParseInt32(s, NumberStyles.Integer, currentFormat);
}

/// <summary>
/// Scans <paramref name="s"/> into a stack-allocated number buffer via Number.StringToNumber
/// and converts that buffer to an Int32, using the hexadecimal conversion when
/// <paramref name="style"/> includes NumberStyles.AllowHexSpecifier.
/// </summary>
/// <param name="s">The text to parse.</param>
/// <param name="style">The permitted number styles.</param>
/// <param name="info">The culture-specific format information passed to the scanner.</param>
/// <returns>The parsed value.</returns>
/// <exception cref="OverflowException">The buffer-to-Int32 conversion reported failure.</exception>
internal unsafe static int ParseInt32(string s, NumberStyles style, NumberFormatInfo info)
{
    // 114-byte scratch area backing the NumberBuffer.
    byte* scratch = stackalloc byte[114];
    Number.NumberBuffer buffer = new Number.NumberBuffer(scratch);
    int value = 0;

    Number.StringToNumber(s, style, ref buffer, info, false);

    bool isHex = (style & NumberStyles.AllowHexSpecifier) != NumberStyles.None;
    bool converted = isHex
        ? Number.HexNumberToInt32(ref buffer, ref value)
        : Number.NumberToInt32(ref buffer, ref value);
    if (!converted)
    {
        throw new OverflowException(Environment.GetResourceString("Overflow_Int32"));
    }

    return value;
}

// Pins `str`, runs Number.ParseNumber over its characters and throws FormatException
// when the scan fails or stops before the end of the string on something that
// Number.TrailingZeros does not accept. Throws ArgumentNullException when `str` is null.
private unsafe static void StringToNumber(string str, NumberStyles options, ref Number.NumberBuffer number, NumberFormatInfo info, bool parseDecimal)
{
    if (str == null)
    {
        throw new ArgumentNullException("String");
    }
    fixed (char* ptr = str)
    {
        // ptr2 is advanced by ParseNumber to the position where scanning stopped.
        char* ptr2 = ptr;
        // NOTE(review): `(ptr2 - ptr / 2) / 2` is decompiler output; presumably it denotes the
        // number of characters consumed (ptr2 - ptr) — confirm against the real framework source.
        if (!Number.ParseNumber(ref ptr2, options, ref number, info, parseDecimal) || ((ptr2 - ptr / 2) / 2 < str.Length && !Number.TrailingZeros(str, (ptr2 - ptr / 2) / 2)))
        {
            throw new FormatException(Environment.GetResourceString("Format_InvalidString"));
        }
    }
}
于 2012-06-28T12:31:08.153 回答
5

我为 double(双精度浮点数)编写了这个解析器,它不会创建临时子字符串。它是为 JSON 解析器准备的,因此只支持 http://www.json.org/ 所规定的 JSON 双精度数表示形式。

它还不是最优的,因为它需要您事先知道数字的开始和结束位置(begin 和 end 参数),因此您必须遍历数字两次才能找出它的结束位置。即便如此,它仍然比 double.Parse 快 10-15 倍左右,并且可以相当容易地修改为在函数内部找出 end,然后将其作为 out 参数返回,以便知道应从何处继续解析主字符串。

像这样使用:

// Usage examples. Each call passes the text, two integer offsets (0 and the text length in
// every example here) and an out variable `j`. The method is defined in the linked gist,
// not in this snippet, and `j` must be declared by the caller.
Parsers.TryParseDoubleFastStream("1", 0, 1, out j);
Parsers.TryParseDoubleFastStream("2.0", 0, 3, out j);
Parsers.TryParseDoubleFastStream("3.5", 0, 3, out j);
Parsers.TryParseDoubleFastStream("-4.5", 0, 4, out j);
Parsers.TryParseDoubleFastStream("50.06", 0, 5, out j);
Parsers.TryParseDoubleFastStream("1000.65", 0, 7, out j);
Parsers.TryParseDoubleFastStream("-10000.8600", 0, 11, out j);

代码可以在这里找到:

https://gist.github.com/3010984(在这里发布太长了)。

并且StandardFunctions.IgnoreChar对于我的目的来说很简单:

/// <summary>
/// Returns true for characters the parser should skip: the space character and
/// everything with a lower code (control characters, tab, line breaks).
/// </summary>
/// <param name="c">The character to classify.</param>
public static bool IgnoreChar(char c)
{
  const char lastIgnored = ' ';
  return c <= lastIgnored;
}
于 2012-06-28T12:13:52.033 回答
5

将所有这些代码粘贴到 C# 中并调用 Test()。这是在 C# 中最接近"直接在字符数组上操作"来解析数字的做法。它追求的是速度而不是优雅。ParseInt 和 ParseFloat 函数是为一个 OpenGL 图形引擎编写的,用于从基于文本的 3D 模型中导入向量。解析浮点数是该过程中的一个重要瓶颈。这是我能做到的最快实现。

using System;
using System.Diagnostics;
using System.Text;

    /// <summary>
    /// Micro-benchmark: builds a space-separated string of the integers
    /// 0 .. iterations-1, parses it with StringToInts and prints the elapsed time.
    /// </summary>
    private void Test()
    {
        const int iterations = 1000;

        // Build the input text "0 1 2 ... (iterations - 1)".
        StringBuilder text = new StringBuilder();
        for (int value = 0; value < iterations; value++)
        {
            if (value != 0)
            {
                text.Append(' ');
            }

            text.Append(value.ToString());
        }

        string numberString = text.ToString();

        // Time only the parsing step.
        // To benchmark the float parser instead, call StringToFloats(numberString, iterations).
        Stopwatch timer = Stopwatch.StartNew();
        StringToInts(numberString, iterations);
        timer.Stop();

        Console.WriteLine("iterations: {0} \t {1}ms", iterations, timer.ElapsedMilliseconds);
    }

    /// <summary>
    /// Splits <paramref name="s"/> on single space characters and parses every token
    /// with ParseInt, without allocating a substring per token.
    /// </summary>
    /// <param name="s">Space-separated integers.</param>
    /// <param name="length">Size of the returned array; must be at least the number of tokens in <paramref name="s"/>.</param>
    /// <returns>An array of <paramref name="length"/> elements; slots beyond the last token stay 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="s"/> holds more tokens than <paramref name="length"/>.</exception>
    private unsafe int[] StringToInts(string s, int length)
    {
        if (s == null)
            throw new ArgumentNullException("s");

        int[] ints = new int[length];
        int index = 0;
        int startpos = 0;
        int last = s.Length - 1;

        fixed (char* pStringBuffer = s)
        {
            for (int n = 0; n <= last; n++)
            {
                bool atEnd = n == last;
                if (!atEnd && s[n] != ' ')
                    continue;

                // The final token always runs to the end of the string, even when the last
                // character is itself a space.
                int tokenEnd = atEnd ? s.Length : n;

                // Writing through a raw int* here would silently corrupt memory when the
                // text holds more tokens than the array, so check the capacity explicitly.
                if (index >= ints.Length)
                    throw new ArgumentException("The string contains more values than 'length' allows.", "length");

                ints[index++] = ParseInt(pStringBuffer + startpos, tokenEnd - startpos);
                startpos = tokenEnd + 1;
            }
        }

        return ints;
    }

    /// <summary>
    /// Splits <paramref name="s"/> on single space characters and parses every token
    /// with ParseFloat, without allocating a substring per token.
    /// </summary>
    /// <param name="s">Space-separated floating-point numbers.</param>
    /// <param name="length">Size of the returned array; must be at least the number of tokens in <paramref name="s"/>.</param>
    /// <returns>An array of <paramref name="length"/> elements; slots beyond the last token stay 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="s"/> holds more tokens than <paramref name="length"/>.</exception>
    private unsafe float[] StringToFloats(string s, int length)
    {
        if (s == null)
            throw new ArgumentNullException("s");

        float[] floats = new float[length];
        int index = 0;
        int startpos = 0;
        int last = s.Length - 1;

        fixed (char* pStringBuffer = s)
        {
            for (int n = 0; n <= last; n++)
            {
                bool atEnd = n == last;
                if (!atEnd && s[n] != ' ')
                    continue;

                // The final token always runs to the end of the string, even when the last
                // character is itself a space.
                int tokenEnd = atEnd ? s.Length : n;

                // Writing through a raw float* here would silently corrupt memory when the
                // text holds more tokens than the array, so check the capacity explicitly.
                if (index >= floats.Length)
                    throw new ArgumentException("The string contains more values than 'length' allows.", "length");

                floats[index++] = ParseFloat(pStringBuffer + startpos, tokenEnd - startpos);
                startpos = tokenEnd + 1;
            }
        }

        return floats;
    }

    /// <summary>
    /// Parses a decimal integer from the first <paramref name="len"/> characters at
    /// <paramref name="input"/>. Leading characters that are neither digits nor '-' are
    /// skipped; parsing stops at the first non-digit after the number.
    /// </summary>
    /// <param name="input">Pointer to the first character of the token.</param>
    /// <param name="len">Number of characters that may be read from <paramref name="input"/>.</param>
    /// <returns>
    /// The parsed value, or 0 when the token contains no digits. No overflow check is
    /// performed: with the default unchecked arithmetic, out-of-range values wrap.
    /// </returns>
    public static unsafe int ParseInt(char* input, int len)
    {
        int pos = 0;           // read position within the token

        // skip anything before the number
        while (pos < len && (input[pos] > '9' || input[pos] < '0') && input[pos] != '-')
            pos++;

        // sign (bounds-checked so an empty or digit-free token never reads past len)
        bool neg = false;
        if (pos < len && input[pos] == '-')
        {
            neg = true;
            pos++;
        }

        // digits
        int part = 0;
        while (pos < len && input[pos] >= '0' && input[pos] <= '9')
            part = part * 10 + (input[pos++] - '0');

        return neg ? -part : part;
    }

    /// <summary>
    /// Parses a floating-point number of the form [-]digits[.digits][(e|E)[+|-]digits] from
    /// the first <paramref name="len"/> characters at <paramref name="input"/>. Leading
    /// characters that cannot start a number are skipped; parsing stops at the first
    /// character that does not fit the form.
    /// </summary>
    /// <param name="input">Pointer to the first character of the token.</param>
    /// <param name="len">Number of characters that may be read from <paramref name="input"/>.</param>
    /// <returns>The parsed value, or 0 when the token contains no digits.</returns>
    public static unsafe float ParseFloat(char* input, int len)
    {
        int pos = 0;           // read position within the token

        // find start
        while (pos < len && (input[pos] < '0' || input[pos] > '9') && input[pos] != '-' && input[pos] != '.')
            pos++;

        // sign (bounds-checked so a digit-free token never reads past len)
        bool neg = false;
        if (pos < len && input[pos] == '-')
        {
            neg = true;
            pos++;
        }

        // integer part, accumulated in double so long digit runs cannot wrap an int
        double whole = 0d;
        while (pos < len && input[pos] >= '0' && input[pos] <= '9')
            whole = whole * 10d + (input[pos++] - '0');

        float ret = (float)whole;
        // "-0" keeps yielding +0, as it did when the sign was applied to an int.
        if (neg && whole != 0d)
            ret = -ret;

        // fractional part
        if (pos < len && input[pos] == '.')
        {
            pos++;
            double fraction = 0d;
            double mul = 1d;

            while (pos < len && input[pos] >= '0' && input[pos] <= '9')
            {
                // Digits beyond the 300th decimal place are far below float resolution;
                // they are consumed but ignored so that mul never overflows.
                if (mul < 1e300)
                {
                    fraction = fraction * 10d + (input[pos] - '0');
                    mul *= 10d;
                }
                pos++;
            }

            float frac = (float)(fraction / mul);
            if (neg)
                ret -= frac;
            else
                ret += frac;
        }

        // scientific part
        if (pos < len && (input[pos] == 'e' || input[pos] == 'E'))
        {
            pos++;

            // The exponent sign is optional: consume it only when it is actually present,
            // otherwise the first exponent digit of e.g. "1e5" would be skipped.
            bool expNeg = false;
            if (pos < len && (input[pos] == '-' || input[pos] == '+'))
            {
                expNeg = input[pos] == '-';
                pos++;
            }

            int exponent = 0;
            while (pos < len && input[pos] >= '0' && input[pos] <= '9')
            {
                // Cap the exponent so absurdly long digit runs cannot wrap the int.
                if (exponent < 10000)
                    exponent = exponent * 10 + (input[pos] - '0');
                pos++;
            }

            // Scale in double precision so the power of ten does not overflow at float range.
            double scale = Math.Pow(10d, (double)exponent);
            ret = (float)(expNeg ? ret / scale : ret * scale);
        }

        return ret;
    }
于 2013-12-06T22:46:39.147 回答
1

那么,是否有内置函数可以直接从字符串中的子字符串解析(即使用给定的字符串开头和长度)以避免生成临时字符串?如果没有,是否有任何库提供有效的功能来做到这一点?

似乎您想使用词法分析缓冲区和词法分析器,类似于 OCaml 中 ocamllex 与 Lexbuf 缓冲区所提供的功能。(我无法为 F# 提供参考。)

如果像您的基准测试那样、由其他标记分隔的大量整数就是您的典型情况,那么这种方法会运行良好。但在其他情况下,这可能是不切实际的。

于 2014-01-30T11:53:43.000 回答
0

不确定这是否有好处,但是您是否尝试过类似的方法:

// Split on single spaces; the method is String.Split (capital S), and the char overload
// is available on every framework version.
var stringValues = input.Split(' ');

// Convert each token with int.Parse (throws FormatException on an empty or malformed token).
var intValues = Array.ConvertAll(stringValues, s => int.Parse(s));
于 2012-06-28T09:24:05.817 回答