1

我有下面这样的文件,正在使用迭代器块来解析文件中反复出现的某些节点/部分。我最初使用正则表达式来解析整个文件,但当节点中缺少某些字段时,正则就无法匹配,所以我改为尝试使用 yield(迭代器)模式。文件格式和我使用的代码如下。我想从文件中得到的只是把每个 REPLICATE 节点作为单独的部分取出,这样就可以用键字符串获取其中的字段,并存入对象集合。我能够从第一个 REPLICATE 出现的位置开始解析,但无法在 REPLICATE 节点结束的地方结束解析。

文件格式:

X_HEADER
{
    DATA_MANAGEMENT_FIELD_2     NA
    DATA_MANAGEMENT_FIELD_3     NA
    DATA_MANAGEMENT_FIELD_4     NA
    SYSTEM_SOFTWARE_VERSION     NA
}
Y_HEADER
{
    DATA_MANAGEMENT_FIELD_2     NA
    DATA_MANAGEMENT_FIELD_3     NA
    DATA_MANAGEMENT_FIELD_4     NA
    SYSTEM_SOFTWARE_VERSION     NA
}
COMPLETION
{
    NUMBER          877
    VERSION         4
    CALIBRATION_VERSION 1
    CONFIGURATION_ID    877    
}
REPLICATE
{
    REPLICATE_ID            1985
    ASSAY_NUMBER            656
    ASSAY_VERSION           4
    ASSAY_STATUS            Research
    DILUTION_ID         1
}
REPLICATE
{
    REPLICATE_ID            1985
    ASSAY_NUMBER            656
    ASSAY_VERSION           4
    ASSAY_STATUS            Research
}

代码:

// Iterator over the file at `path`, intended to produce one dictionary per
// REPLICATE section.
// NOTE(review): as written, no key/value pair is ever stored in `current`, so
// the only dictionaries produced are empty ones, one for each line that starts
// with "REPLICATE".
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (var reader = File.OpenText(path))
    {
        var current = new Dictionary<string, string>();
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // Skip empty and whitespace-only lines.
            if (string.IsNullOrWhiteSpace(line)) continue;

            if (line.StartsWith("REPLICATE"))
            {
                // Hand out the dictionary built so far, then start a fresh one.
                yield return current;
                current = new Dictionary<string, string>();
            }
            else
            {
                // The split result is never used, so nothing is added to `current`.
                var parts = line.Split('\t');
            }

            // This check sits inside the loop, so it runs once per non-blank line;
            // because `current` is never populated, the condition is never true.
            if (current.Count > 0) yield return current;
        }
    }
}

// Enumerates every dictionary produced by ReadParts for the given file.
// The loop body is a placeholder and does nothing with each part yet.
public static void parseFile(string fileName)
    {
        foreach (var part in ReadParts(fileName))
        {
           // part["FIELD1"] would retrieve a value from the current REPLICATE part here
        }
    }
4

3 回答 3

3

好吧,听起来您只需要在遇到右大括号时“关闭”一个部分,并且只在那时才 yield return。例如:

// Lazily reads the file at `path` and yields one name/value dictionary for
// every section whose name line is exactly "REPLICATE".
//
// Expected layout, one item per line:
//   - a section name starting in the first column,
//   - a line consisting of exactly "{",
//   - name/value lines that start with a tab and hold two tab-separated fields,
//   - a line consisting of exactly "}".
// Blank and whitespace-only lines are skipped.
//
// Throws BadDataException when a line appears where the layout does not allow
// it. NOTE(review): BadDataException is not defined in the visible source and
// must be supplied by the surrounding project.
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (var reader = File.OpenText(path))
    {
        // The loop below assigns this variable; without the declaration the
        // method does not compile.
        string line;

        // Parser state: the name of the section being read (null between
        // sections) and its map (null until the opening brace has been seen).
        string currentName = null;
        IDictionary<string, string> currentMap = null;

        while ((line = reader.ReadLine()) != null)
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            if (line == "{")
            {
                if (currentName == null || currentMap != null)
                {
                    throw new BadDataException("Open brace at wrong place");
                }
                currentMap = new Dictionary<string, string>();
            }
            else if (line == "}")
            {
                if (currentName == null || currentMap == null)
                {
                    throw new BadDataException("Closing brace at wrong place");
                }
                // Isolate the "REPLICATE-only" requirement to a single
                // line - if you ever need other bits, you can change this.
                if (currentName == "REPLICATE")
                {
                    yield return currentMap;
                }
                currentName = null;
                currentMap = null;
            }
            else if (line[0] != '\t')
            {
                // The line is known to be non-empty here, so indexing is safe.
                // A direct character test avoids a culture-sensitive StartsWith.
                if (currentName != null || currentMap != null)
                {
                    throw new BadDataException("Section name at wrong place");
                }
                currentName = line;
            }
            else
            {
                if (currentName == null || currentMap == null)
                {
                    throw new BadDataException("Name/value pair at wrong place");
                }
                // Drop the leading tab, then expect exactly "name<TAB>value".
                var parts = line.Substring(1).Split('\t');
                if (parts.Length != 2)
                {
                    throw new BadDataException("Invalid name/value pair");
                }
                // The indexer is used so a repeated name overwrites the
                // earlier value instead of throwing.
                currentMap[parts[0]] = parts[1];
            }
        }
    }
}

老实说,现在这是一个相当糟糕的函数。我想我会把它放进一个单独的类(可能是嵌套类)中来保存状态,并让每种情况的处理程序成为各自的方法。其实,这正是状态模式可能派得上用场的情况 :)

于 2012-06-19T20:10:07.007 回答
2
// Walks the input line by line and yields the parsed contents of every block
// whose header line starts with "REPLICATE". All other lines are skipped.
private IEnumerable<IDictionary<string, string>> ParseFile(System.IO.TextReader reader)
{
    string pending = reader.ReadLine();

    while (pending != null)
    {
        string header = pending;

        // Always move past the header: the next line is either the start of
        // the block to parse or simply the next line to examine.
        pending = reader.ReadLine();

        if (!header.StartsWith("REPLICATE"))
        {
            continue;
        }

        // ParseBlock advances `pending` to the line after the closing brace.
        yield return ParseBlock(ref pending, reader);
    }
}

// Parses one brace-delimited block. On entry `token` must be the "{" line;
// on return it holds the line that follows the "}" line (null at end of input).
// Throws Exception when either brace is missing.
private IDictionary<string, string> ParseBlock(ref string token, System.IO.TextReader reader)
{
    bool startsWithBrace = token == "{";
    if (!startsWithBrace)
    {
        throw new Exception("Missing opening brace.");
    }

    // Step onto the first line of the block body and read the fields.
    token = reader.ReadLine();
    IDictionary<string, string> fields = ParseValues(ref token, reader);

    if (token == "}")
    {
        // Consume the closing brace so the caller resumes after the block.
        token = reader.ReadLine();
        return fields;
    }

    throw new Exception("Missing closing brace.");
}

// Reads name/value lines into a dictionary until a line that is exactly "}"
// or the end of input is reached.
//
// token:  on entry, the first line of the block body; on return, the "}" line
//         that stopped the loop, or null if the input ended first.
// reader: source of the lines that follow `token`.
//
// Each non-blank line is trimmed and split at its first tab or space: the
// text before it is the name, the rest (without leading whitespace) is the
// value. This accepts indented lines and columns aligned with several
// tabs or spaces. Blank lines are skipped.
//
// Throws FormatException when a line has a name but no value, and
// ArgumentException (from Dictionary.Add) when a name is repeated.
private IDictionary<string, string> ParseValues(ref string token, System.IO.TextReader reader)
{
    IDictionary<string, string> result = new Dictionary<string, string>();
    char[] separators = { '\t', ' ' };

    // C# has no "and" keyword for boolean conditions; logical AND is "&&".
    while (token != "}" && token != null)
    {
        string entry = token.Trim();

        if (entry.Length > 0)
        {
            int split = entry.IndexOfAny(separators);

            if (split < 0)
            {
                throw new FormatException("Expected a name followed by a value: '" + entry + "'.");
            }

            // `entry` has no trailing whitespace, so the value is never empty.
            result.Add(entry.Substring(0, split), entry.Substring(split).TrimStart());
        }

        token = reader.ReadLine();
    }

    return result;
}
于 2012-06-19T20:30:38.620 回答
1

如果在 while 循环结束后添加一条 yield return current;,您将获得最后一个字典。

我相信最好检查'}'作为当前块的结尾,然后把它放在yield return那里。尽管您不能使用正则表达式来解析整个文件,但您可以使用正则表达式来搜索行内的键值对。以下迭代器代码应该可以工作。它只会返回 REPLICATE 块的字典。

 // Check for lines that are a key-value pair, separated by whitespace.
// The value is optional. The pattern is meant for an already-trimmed line:
// Key is the leading run of word characters and Value is everything after the
// first run of whitespace. The value part has to be ".*" (any characters);
// an escaped "\.*" would match literal dots only and reject every real value.
static string partPattern = @"^(?<Key>\w*)(\s+(?<Value>.*))?$";

// Opens the file at `path` and yields one dictionary for each line that,
// once trimmed, compares equal to "REPLICATE". The block that follows such a
// line is consumed by parseReplicateBlock; every other line is ignored.
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (var input = File.OpenText(path))
    {
        for (string text = input.ReadLine(); text != null; text = input.ReadLine())
        {
            // Whitespace-only lines can never be a section header.
            bool isReplicateHeader = !string.IsNullOrWhiteSpace(text)
                && string.Compare(text.Trim(), "REPLICATE") == 0;

            if (isReplicateHeader)
            {
                yield return parseReplicateBlock(input);
            }
        }
    }
}

// Reads the body of one block from `reader` and returns its key/value pairs.
// The opening brace is checked first via VerifyOpening. Each following
// non-blank line is trimmed and matched against partPattern: a line with a
// non-empty Key group is stored, a line equal to "}" ends the block, and any
// other line is ignored. Throws ApplicationException if the input ends before
// a closing brace is found.
private static IDictionary<string, string> parseReplicateBlock(StreamReader reader)
{
    VerifyOpening(reader);

    var fields = new Dictionary<string, string>();

    for (string raw = reader.ReadLine(); raw != null; raw = reader.ReadLine())
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            continue;
        }

        string text = raw.Trim();

        // Named groups keep the lookup independent of group order in the pattern.
        Match pair = Regex.Match(text, partPattern);
        Group key = pair.Groups["Key"];

        if (key.Length > 0)
        {
            fields.Add(key.Value, pair.Groups["Value"].Value);
            continue;
        }

        if (text.CompareTo("}") == 0)
        {
            return fields;
        }
    }

    // The input ran out before a closing brace was seen.
    throw new ApplicationException("Missing closing brace");
}

// Consumes lines from `reader` up to and including the first non-blank line,
// and requires that line, once trimmed, to be "{". Throws ApplicationException
// if it is anything else or if the input ends first.
private static void VerifyOpening(StreamReader reader)
{
    // Skip over empty and whitespace-only lines.
    string candidate = reader.ReadLine();
    while (candidate != null && string.IsNullOrWhiteSpace(candidate))
    {
        candidate = reader.ReadLine();
    }

    bool opened = candidate != null && candidate.Trim().CompareTo("{") == 0;
    if (!opened)
    {
        throw new ApplicationException("Missing opening brace");
    }
}

更新:我确保正则表达式字符串包含没有值的情况。此外,组索引全部更改为使用组名称,以避免在修改正则表达式字符串时出现任何问题。

于 2012-06-19T20:09:42.587 回答