5

给定

// Search terms whose occurrences should be counted in queryText.
var stringList = new List<string> { "outage", "restoration", "efficiency" };

// Sample text to search. The fragments are joined with '+'; each continuation
// starts with a space so that words at the seams stay separated.
var queryText = "While walking through the park one day, I noticed an outage"
              + " in the lightbulb at the plant. I talked to an officer about"
              + " restoration protocol for public works, and he said to contact"
              + " the department of public works, but not to expect much because"
              + " they have low efficiency.";

如何从queryText获取stringList中所有字符串的出现总数?

在上面的例子中,我想要一个返回 3 的方法;

// Desired helper: should return the total number of occurrences of the
// stringsToFind entries inside textToQuery (3 for the example data).
private int stringMatches (string textToQuery, string[] stringsToFind)
{
    // TODO: not implemented - this is the method the question asks for.
}

结果

说话太早了!

进行了几次性能测试,Fabian 的这个代码分支快了很多:

/// <summary>
/// Counts every occurrence of each search string inside <paramref name="textToQuery"/>
/// using ordinal (case-sensitive, culture-independent) comparison.
/// </summary>
/// <param name="textToQuery">Text to search. Null or empty yields 0.</param>
/// <param name="stringsToFind">Substrings to look for. Null or empty entries are skipped.</param>
/// <returns>Total number of matches over all search strings; overlapping matches are counted.</returns>
private int stringMatches(string textToQuery, string[] stringsToFind)
{
    if (string.IsNullOrEmpty(textToQuery) || stringsToFind == null)
    {
        return 0;
    }

    int count = 0;
    foreach (var stringToFind in stringsToFind)
    {
        // An empty needle "matches" at every index and would eventually push the
        // start index past the end of the text, making IndexOf throw.
        if (string.IsNullOrEmpty(stringToFind))
        {
            continue;
        }

        int currentIndex = 0;
        while ((currentIndex = textToQuery.IndexOf(stringToFind, currentIndex, StringComparison.Ordinal)) != -1)
        {
            // Step by one character (not by the needle length) so overlapping matches count.
            currentIndex++;
            count++;
        }
    }
    return count;
}

执行时间: 在使用秒表的 10000 次迭代循环中:

Fabian:37-42 毫秒

lazyberezovsky 字符串比较:400-500 毫秒

lazyberezovsky 正则表达式:630-680 毫秒

格伦:750-800 毫秒

(在 Fabian 的答案中加入了 StringComparison.Ordinal 以提高速度。)

4

7 回答 7

6

这也可能很快:

/// <summary>
/// Returns how many times the given search strings occur in <paramref name="textToQuery"/>.
/// Matching is ordinal (case-sensitive) and overlapping occurrences are all counted.
/// </summary>
/// <param name="textToQuery">Text to search. Null or empty yields 0.</param>
/// <param name="stringsToFind">Substrings to look for. Null or empty entries are ignored.</param>
/// <returns>Sum of the occurrence counts of every search string.</returns>
private int stringMatches(string textToQuery, string[] stringsToFind)
{
  if (stringsToFind == null || string.IsNullOrEmpty(textToQuery))
  {
    return 0;
  }

  int count = 0;
  foreach (var stringToFind in stringsToFind)
  {
    // Guard: IndexOf with an empty value succeeds at every index, so the loop
    // below would run off the end of the text and throw.
    if (string.IsNullOrEmpty(stringToFind))
    {
      continue;
    }

    int currentIndex = textToQuery.IndexOf(stringToFind, 0, StringComparison.Ordinal);
    while (currentIndex != -1)
    {
      count++;
      // Resume one character after the match start so overlaps are found too.
      currentIndex = textToQuery.IndexOf(stringToFind, currentIndex + 1, StringComparison.Ordinal);
    }
  }
  return count;
}
于 2013-07-26T23:26:16.070 回答
4

此 LINQ 查询按空格和标点符号拆分文本,并以忽略大小写的方式搜索匹配项:

// Splits the text on spaces and a few punctuation marks, then counts the
// resulting tokens that equal one of the search strings (current culture, ignoring case).
private int stringMatches(string textToQuery, string[] stringsToFind)
{
    char[] separators = { ' ', '.', ',', '!', '?' }; // add more if needed
    StringComparer ignoreCase = StringComparer.CurrentCultureIgnoreCase;

    int total = 0;
    foreach (string token in textToQuery.Split(separators))
    {
        if (stringsToFind.Contains(token, ignoreCase))
        {
            total++;
        }
    }
    return total;
}

或使用正则表达式:

/// <summary>
/// Counts whole-word, case-insensitive occurrences of the search strings in
/// <paramref name="textToQuery"/> using a single regular expression.
/// </summary>
/// <param name="textToQuery">Text to search.</param>
/// <param name="stringsToFind">Literal terms to look for; null or empty terms are ignored.</param>
/// <returns>Number of matches, or 0 when there is no usable search term.</returns>
private static int stringMatches(string textToQuery, string[] stringsToFind)
{
    // Escape each term so regex metacharacters ('.', '+', '(' ...) are matched
    // literally, and drop empty terms, which would otherwise match at every word boundary.
    var alternatives = stringsToFind
        .Where(s => !String.IsNullOrEmpty(s))
        .Select(s => @"\b" + Regex.Escape(s) + @"\b")
        .ToArray();

    // An empty pattern matches at every position of the input, so bail out early.
    if (alternatives.Length == 0)
    {
        return 0;
    }

    var pattern = String.Join("|", alternatives);
    return Regex.Matches(textToQuery, pattern, RegexOptions.IgnoreCase).Count;
}
于 2013-07-26T22:58:33.010 回答
3

如果要计算字符串中其他集合中的单词:

// Counts the distinct whitespace-separated words of the text that also
// appear in stringsToFind (set intersection, default string equality).
private int stringMatches(string textToQuery, string[] stringsToFind)
{
    var words = textToQuery.Split();
    var commonWords = words.Intersect(stringsToFind);
    return commonWords.Count();
}
于 2013-07-26T22:55:27.873 回答
1

我喜欢 Tim 的回答,但我尽量避免制作太多字符串以避免性能问题,而且我确实喜欢正则表达式,所以这里有另一种方法:

/// <summary>
/// Counts case-insensitive occurrences of any of the <paramref name="keys"/> in
/// <paramref name="searchMe"/> using one alternation regular expression.
/// </summary>
/// <param name="searchMe">Text to search.</param>
/// <param name="keys">Literal strings to look for; null or empty keys are ignored.</param>
/// <returns>Number of matches, or 0 when there is no usable key.</returns>
private int StringMatches(string searchMe, string[] keys)
{
    var pattern = new System.Text.StringBuilder();
    foreach (string key in keys)
    {
        // An empty alternative matches at every position, inflating the count.
        if (string.IsNullOrEmpty(key))
        {
            continue;
        }

        if (pattern.Length > 0)
        {
            pattern.Append('|');
        }

        // Escape so that regex metacharacters in a key are matched literally.
        pattern.Append(System.Text.RegularExpressions.Regex.Escape(key));
    }

    // An empty pattern would match once per character position plus one.
    if (pattern.Length == 0)
    {
        return 0;
    }

    return System.Text.RegularExpressions.Regex.Matches(
        searchMe,
        pattern.ToString(),
        System.Text.RegularExpressions.RegexOptions.IgnoreCase).Count;
}
于 2013-07-26T23:02:23.853 回答
0

这只会匹配 textToQuery 中的完整单词:

这样做的想法是检查匹配前后的索引是否不是字母。另外,我必须确保检查它是字符串的开头还是结尾。

  /// <summary>
  /// Counts occurrences of each word in <paramref name="textToQuery"/> that are not
  /// directly preceded or followed by a letter (ordinal, case-sensitive search).
  /// </summary>
  /// <param name="textToQuery">Text to search.</param>
  /// <param name="wordsToFind">Words to look for; null or empty entries are skipped.</param>
  /// <returns>Total number of stand-alone matches over all words.</returns>
  private int stringMatchesWordsOnly(string textToQuery, string[] wordsToFind)
  {
      int count = 0;
      foreach (var wordToFind in wordsToFind)
      {
          // An empty word matches at every index and would drive IndexOf out of range.
          if (string.IsNullOrEmpty(wordToFind))
          {
              continue;
          }

          int currentIndex = 0;
          while ((currentIndex = textToQuery.IndexOf(wordToFind, currentIndex, StringComparison.Ordinal)) != -1)
          {
              int endIndex = currentIndex + wordToFind.Length;

              // Start boundary: match begins the text, or the previous char is not a letter.
              bool startsAtBoundary = currentIndex == 0
                                      || !Char.IsLetter(textToQuery, currentIndex - 1);

              // End boundary: match reaches the end of the text (checked first so that
              // Char.IsLetter is never called with an out-of-range index), or the next
              // char is not a letter.
              bool endsAtBoundary = endIndex == textToQuery.Length
                                    || !Char.IsLetter(textToQuery, endIndex);

              if (startsAtBoundary && endsAtBoundary)
              {
                  count++;
              }
              currentIndex++;
          }
      }
      return count;
  }

结论: 正如您所看到的,这种方法比我的其他答案有点混乱,并且性能会更低(尽管仍然比其他答案更高效)。所以这真的取决于你想要实现什么。如果您的字符串中有简短的单词要查找,您可能应该采用这个答案,因为例如“and”显然会在第一种方法中返回太多匹配项。

于 2013-07-27T11:49:17.020 回答
0
// Tokenizes the text on space, comma and period, then counts the tokens
// that are contained in stringsToFind (default, case-sensitive equality).
private int stringMatches(string textToQuery, string[] stringsToFind)
{
    var separators = new[] { ' ', ',', '.' };
    return textToQuery
        .Split(separators)
        .Count(token => stringsToFind.Contains(token));
}
于 2013-07-27T18:08:48.860 回答
0

这是对 Fabian Bigler 原始答案的修订。主要是因为 StringComparison.Ordinal,速度提高了大约 33%。

有关此内容的更多信息,请参见:http://msdn.microsoft.com/en-us/library/bb385972.aspx

    /// <summary>
    /// Counts every occurrence of each search string inside <paramref name="textToQuery"/>
    /// using ordinal (case-sensitive) comparison; overlapping matches are counted.
    /// </summary>
    /// <param name="textToQuery">Text to search. Null or empty yields 0.</param>
    /// <param name="stringsToFind">Substrings to look for. Null or empty entries are skipped.</param>
    /// <returns>Total number of matches over all search strings.</returns>
    private int stringMatches(string textToQuery, List<string> stringsToFind)
    {
        if (string.IsNullOrEmpty(textToQuery) || stringsToFind == null)
        {
            return 0;
        }

        int count = 0;
        foreach (string stringToFind in stringsToFind)
        {
            // IndexOf reports a hit for "" at every index; the start index would
            // then move past the end of the text and IndexOf would throw.
            if (string.IsNullOrEmpty(stringToFind))
            {
                continue;
            }

            int currentIndex = 0;
            while ((currentIndex = textToQuery.IndexOf(stringToFind, currentIndex, StringComparison.Ordinal)) != -1)
            {
                // Move one character forward so overlapping matches are found.
                currentIndex++;
                count++;
            }
        }
        return count;
    }
于 2013-07-27T00:23:05.417 回答