3

我正在寻找第一个公共子字符串的实现

Mike is not your average guy. I think you are great.
Jim is not your friend. I think you are great.
Being different is not your fault. I think you are great.

使用最长的公共子串实现(并忽略标点符号),你会得到“我认为你很棒”,但我正在寻找第一个出现的公共子串,在这个例子中:

is not your

也许是一个实现,它生成并排序了所有常见子字符串的列表,我可以从中获取第一个。

编辑

被比较的标记将是完整的单词。寻找第一个最长的整个单词序列的贪婪匹配。(假设在方法中使用了后缀树,树的每个节点都是一个词)

4

2 回答 2

3

有很多步骤可以做到这一点。

  1. 删除标点符号
  2. 将句子分解为单词列表
  3. 创建所有连续单词组合的字符串(min:1,max:wordCount)
  4. 在新的字符串列表(子句)上加入三个列表
  5. 相应地排序。

代码:

static void Main(string[] args)
{
    var sentence1 = "Mike is not your average guy. I think you are great.";
    var sentence2 = "Jim is not your friend. I think you are great.";
    var sentence3 = "Being different is not your fault. I think you are great.";

    //remove all punctuation 
    // http://stackoverflow.com/questions/421616
    sentence1 = new string(
      sentence1.Where(c => !char.IsPunctuation(c)).ToArray());
    sentence2 = new string(
      sentence2.Where(c => !char.IsPunctuation(c)).ToArray());
    sentence3 = new string(
      sentence3.Where(c => !char.IsPunctuation(c)).ToArray());

    //seperate into words
    var words1 = sentence1.Split(new char[] { ' ' },
      StringSplitOptions.RemoveEmptyEntries).ToList();
    var words2 = sentence2.Split(new char[] { ' ' },          
      StringSplitOptions.RemoveEmptyEntries).ToList();
    var words3 = sentence3.Split(new char[] { ' ' }, 
      StringSplitOptions.RemoveEmptyEntries).ToList();

    //create substring list
    var subSentence1 = CreateSubstrings(words1);
    var subSentence2 = CreateSubstrings(words2);
    var subSentence3 = CreateSubstrings(words3);

    //join then like a Sql Table
    var subSentences = subSentence1
        .Join(subSentence2,
            sub1 => sub1.Value,
            sub2 => sub2.Value,
            (sub1, sub2) => new { Sub1 = sub1, 
                                  Sub2 = sub2 })
        .Join(subSentence3,
            sub1 => sub1.Sub1.Value,
            sub2 => sub2.Value,
            (sub1, sub2) => new { Sub1 = sub1.Sub1, 
                                  Sub2 = sub1.Sub2, 
                                  Sub3 = sub2 })
        ;

    //Sorted by Lowest Index, then by Maximum Words
    subSentences = subSentences.OrderBy(s => s.Sub1.Rank)
      .ThenByDescending(s => s.Sub1.Length)
      .ToList();

    //Sort by Maximum Words, then Lowest Index
    /*subSentences = subSentences.OrderByDescending(s => s.Sub1.Length)
        .ThenBy(s => s.Sub1.Rank)
        .ToList();//*/

    foreach (var subSentence in subSentences)
    {
        Console.WriteLine(subSentence.Sub1.Length.ToString() + " " 
          + subSentence.Sub1.Value);
        Console.WriteLine(subSentence.Sub2.Length.ToString() + " " 
          + subSentence.Sub2.Value);
        Console.WriteLine(subSentence.Sub3.Length.ToString() + " " 
          + subSentence.Sub3.Value);
        Console.WriteLine("======================================");
    }

    Console.ReadKey();

}

//this could probably be done better -Erik
internal static List<SubSentence> CreateSubstrings(List<string> words)
{
    var result = new List<SubSentence>();
    for (int wordIndex = 0; wordIndex < words.Count; wordIndex++)
    {
        var sentence = new StringBuilder();
        int currentWord = wordIndex;
        while (currentWord < words.Count - 1)
        {
            sentence.Append(words.ElementAt(currentWord));
            result.Add(new SubSentence() { Rank = wordIndex, 
              Value = sentence.ToString(), 
              Length = currentWord - wordIndex + 1 });
            sentence.Append(' ');
            currentWord++;
        }
        sentence.Append(words.Last());
        result.Add(new SubSentence() { Rank = wordIndex, 
          Value = sentence.ToString(), 
          Length = words.Count - wordIndex });
    }
    return result;
}

internal class SubSentence
{
    public int Rank { get; set; }
    public string Value { get; set; }
    public int Length { get; set; }
}

结果:

3 不是你的

3 不是你的

3 不是你的

=======================================

2 不是

2 不是

2 不是

=======================================

1 是

1 是

1 是

=======================================

2不是你的

2不是你的

2不是你的

=======================================

1 没有

1 没有

1 没有

=======================================

1 你的

1 你的

1 你的

=======================================

5 我觉得你很棒

5 我觉得你很棒

5 我觉得你很棒

=======================================

4 我觉得你是

4 我觉得你是

4 我觉得你是

=======================================

3 我觉得你

3 我觉得你

3 我觉得你

=======================================

2 我认为

2 我认为

2 我认为

=======================================

1 我

1 我

1 我

=======================================

4觉得你很棒

4觉得你很棒

4觉得你很棒

=======================================

3认为你是

3认为你是

3认为你是

=======================================

2认为你

2认为你

2认为你

=======================================

1 认为

1 认为

1 认为

=======================================

3你很棒

3你很棒

3你很棒

=======================================

2你是

2你是

2你是

=======================================

1 你

1 你

1 你

=======================================

2很棒

2很棒

2很棒

=======================================

1 是

1 是

1 是

=======================================

1 很棒

1 很棒

1 很棒

=======================================

于 2014-01-24T22:37:50.337 回答
0

这里有一些可以做你想做的事情。您实际上会调整以预先构建您的字符串列表,将其传递进去,它会为您找到......在此示例中,短语将基于以最短字符串为基线的字符串。

public void SomeOtherFunc()
{
    List<string> MyTest = new List<string>();
    MyTest.Add( "Mike is not your average guy. I think you are great." );
    MyTest.Add( "Jim is not your friend. I think you are great." );
    MyTest.Add( "Being different is not your fault. I think you are great." );

    string thePhrase = testPhrase( MyTest );
    MessageBox.Show( thePhrase );
}



public string testPhrase(List<string> test)
{

    // start with the first string and find the shortest.
    // if we can't find a short string in a long, we'll never find a long string in short
    // Ex "To testing a string that is longer than some other string" 
    // vs "Im testing a string that is short"
    // Work with the shortest string.
    string shortest = test[0];
    string lastGoodPhrase = "";
    string curTest;
    int firstMatch = 0;
    int lastMatch = 0;
    int allFound;
    foreach (string s in test)
        if (s.Length < shortest.Length)
            shortest = s;

    // Now, we need to break the shortest string into each "word"
    string[] words = shortest.Split( ' ' );

    // Now, start with the first word until it is found in ALL phrases
    for (int i = 0; i < words.Length; i++)
    {
        // to prevent finding "this" vs "is"
        lastGoodPhrase = " " + words[i] + " ";

        allFound = 0;
        foreach (string s in test)
        {
            // always force leading space for string
            if ((" "+s).Contains(lastGoodPhrase))
               allFound++;
            else
               // if not found in ANY string, its not found in all, get out
               break;
        }

        if (allFound == test.Count)
        {
            // we've identified the first matched field, get out for next phase test
            firstMatch = i;
            // also set the last common word to the same until we can test next...
            lastMatch = i;
            break;
        }
    }

    // if no match, get out
    if (firstMatch == 0)
        return "";

    // we DO have at least a first match, now keep looking into each subsequent
    // word UNTIL we no longer have a match.
    for( int i = 1; i < words.Length - firstMatch; i++ )
    {
        // From where the first entry was, build out the ENTIRE PHRASE
        // until the end of the original sting of words and keep building 1 word back
        curTest = " ";
        for (int j = firstMatch; j <= firstMatch + i; j++)
            curTest += words[j] + " ";

        // see if all this is found in ALL strings
        foreach (string s in test)
            // we know we STARTED with a valid found phrase.
            // as soon as a string NO LONGER MATCHES the new phrase,
            // return the last VALID phrase
            if (!(" " + s).Contains(curTest))
                return lastGoodPhrase;

        // if this is still a good phrase, set IT as the newest
        lastGoodPhrase = curTest;
    }

    return lastGoodPhrase;
}
于 2014-01-25T01:56:07.597 回答