0

我有一个需要解析的 2.6GB 的“半 csv”文件。我所说的“半 csv”是指它的格式为 (data,data2,data3,...),(moredata,moredata2,moredata3,...),(...)。也就是说,行与行之间由“),(”而不是换行符分隔(这意味着整个文件基本上只有一行)。

我的计划是读入文件并用“),(”分割,然后根据需要解析每个元素。显然,这样做在 C# 中会出现“内存不足”的问题,但我又不能简单地把文件切分成若干块,因为无法保证切分点不会把数据错误地截断。关于如何做到这一点,有什么想法吗?

4

1 回答 1

0

完全未经测试。字符串必须写成 "mystring" 的形式。不支持字符串中的转义,也不支持在字符串中转义 "。所以这些都是无效的:"my""quote" 和 "my\"quote"。该文件必须是完美的:末尾没有 EOF 字符,末尾没有换行,除字符串内部以外任何地方都没有空格,除字符串内部以外任何地方都没有换行。在字符串内部,除了 "(它标记字符串的结尾)之外,其他字符都按原样保留。不能有元素过多的行,不能有元素过少的行,也没有 null 处理(从技术上讲,字符串列遇到 ,, 时会返回一个空字符串,而不会引发错误)。Convert.ChangeType 支持的所有类型都受支持。

用法:

// Open the input as a TextReader; the using block disposes the reader once the loop is done.
using (var sr = new StreamReader("myfile.txt"))
{
    // The Type[] holds one entry per column, in column order. Rows are produced one at a
    // time while the reader is consumed, so the whole file is never held in memory.
    foreach (var objs in ParseStream(sr, new Type[] { typeof(int), typeof(double), typeof(string) }, CultureInfo.InvariantCulture))
    {
        // objs is an object[] where each member is of the type asked 
        // when ParseStream was called
    }
}

代码

/// <summary>
/// Lazily parses rows written as <c>(a,b,c),(d,e,f),...</c> from <paramref name="tr"/> and
/// converts every field to the type found at the same index of <paramref name="types"/>.
/// </summary>
/// <remarks>
/// A field is either bare text or text enclosed in double quotes. There is no escape syntax,
/// so a quoted field cannot contain a double quote. Whitespace and new lines are never
/// skipped. A row with fewer fields than <paramref name="types"/> is not rejected: it yields
/// a shorter array. The method relies on a <c>State</c> enum declared outside of it with the
/// members <c>WaitingForOpenBracket</c>, <c>WaitingForData</c>, <c>WaitingForColumnSeparator</c>,
/// <c>WaitingForEndQuotes</c> and <c>WaitingForRowSeparator</c>.
/// </remarks>
/// <param name="tr">Reader whose first character must be the <c>(</c> that opens the first row.</param>
/// <param name="types">Target type of each column, in column order.</param>
/// <param name="culture">
/// Format provider handed to <see cref="Convert.ChangeType(object, Type, IFormatProvider)"/>;
/// <see cref="CultureInfo.InvariantCulture"/> is used when this is <c>null</c>.
/// </param>
/// <returns>One <c>object[]</c> per row, produced on demand while the reader is consumed.</returns>
/// <exception cref="FormatException">
/// The input does not follow the expected layout. The message carries the zero-based row and
/// column of the offending character. Conversion failures raised by
/// <see cref="Convert.ChangeType(object, Type, IFormatProvider)"/> propagate unchanged.
/// </exception>
public static IEnumerable<object[]> ParseStream(TextReader tr, Type[] types, IFormatProvider culture = null)
{
    // Resolved once instead of on every converted field.
    IFormatProvider provider = culture ?? CultureInfo.InvariantCulture;

    var parts = new List<string>();
    var sb = new StringBuilder();

    State state = State.WaitingForOpenBracket;

    // Zero-based position of the character being processed; used only for error messages.
    // col starts at -1 so that the first increment lands on column 0.
    long col = -1;
    long row = 0;

    int read;

    while ((read = tr.Read()) != -1)
    {
        char ch = (char)read;

        // Exactly one increment per character read.
        col++;

        switch (state)
        {
            case State.WaitingForOpenBracket:
                if (ch != '(')
                {
                    throw new FormatException(string.Format("Malformed begin-of-the-row at R: {0}, C: {1}, char: {2}", row, col, ch));
                }

                state = State.WaitingForData;

                break;

            case State.WaitingForData:
            case State.WaitingForColumnSeparator:
                if (ch == ',' || ch == ')')
                {
                    // Either separator closes the current field.
                    parts.Add(sb.ToString());

                    sb.Clear();

                    if (parts.Count > types.Length)
                    {
                        throw new FormatException(string.Format("Too many parts starting at R: {0}, C: {1}", row, col));
                    }

                    if (ch == ')')
                    {
                        // End of the row: convert field i to types[i].
                        var values = new object[parts.Count];

                        for (int i = 0; i < values.Length; i++)
                        {
                            values[i] = Convert.ChangeType(parts[i], types[i], provider);
                        }

                        parts.Clear();

                        state = State.WaitingForRowSeparator;

                        yield return values;
                    }
                    else
                    {
                        // A comma starts the next field. Without this reset, a quoted field
                        // followed by ',' would leave the parser in WaitingForColumnSeparator
                        // and the next field's first character would be rejected.
                        state = State.WaitingForData;
                    }
                }
                else
                {
                    if (state == State.WaitingForColumnSeparator)
                    {
                        // Only ',' or ')' may follow a closing quote.
                        throw new FormatException(string.Format("Malformed column separator at R: {0}, C: {1}, char: {2}", row, col, ch));
                    }

                    if (ch == '"')
                    {
                        // An opening quote is valid only as the very first character of a field.
                        if (sb.Length != 0)
                        {
                            throw new FormatException(string.Format("Malformed string at R: {0}, C: {1}, char: {2}", row, col, ch));
                        }

                        state = State.WaitingForEndQuotes;
                    }
                    else
                    {
                        sb.Append(ch);
                    }
                }

                break;

            case State.WaitingForEndQuotes:
                // Inside quotes every character is data except the closing quote.
                if (ch == '"')
                {
                    state = State.WaitingForColumnSeparator;
                }
                else
                {
                    sb.Append(ch);
                }

                break;

            case State.WaitingForRowSeparator:
                if (ch != ',')
                {
                    throw new FormatException(string.Format("Malformed row separator at R: {0}, C: {1}, char: {2}", row, col, ch));
                }

                state = State.WaitingForOpenBracket;

                break;
        }

        // Advance the row only after the new line itself has been processed, so that an
        // error raised for it still reports the position where it actually sits.
        if (ch == '\n')
        {
            row++;
            col = -1;
        }
    }

    // The only valid place for the input to stop is right after a closing ')'.
    if (state != State.WaitingForRowSeparator)
    {
        throw new FormatException(string.Format("Malformed end-of-file at R: {0}, C: {1}", row, col));
    }
}
于 2013-08-15T07:40:15.850 回答