我很想看看自己能否想出一种更快的方法来做到这一点——但最终只做了一处小优化:用 IndexOf 查找一个字符串在另一个字符串中出现的位置,而不是用 Contains。这样做首先似乎比 Contains 稍快,其次还可以指定不区分大小写(如果这对您有用的话)。
下面附上我编写的一个测试类——我使用了超过 100 万个单词,并且在所有情况下都使用不区分大小写的方式进行搜索。它既测试了您的方法,也测试了我尝试构建的正则表达式。您可以自己运行一下,看看耗时;正则表达式的速度不如您提供的方法快,但也可能是我构建得不对。我在 (word1|word2...) 之前使用 (?i) 来指定正则表达式不区分大小写(我很想知道如何优化它——它可能遇到了经典的回溯问题!)。
随着更多“不需要的”单词的添加,搜索方法(无论是正则表达式还是提供的原始方法)似乎变得越来越慢。
无论如何-希望这个简单的测试对您有所帮助:
/// <summary>
/// Console benchmark that compares two ways of filtering out every word containing one of
/// a list of unwanted substrings (ignoring case): a hand-written <c>IndexOf</c> loop and a
/// regular-expression alternation, each run with and without PLINQ.
/// </summary>
class Program
{
    /// <summary>
    /// Loads the sample text, builds the unwanted-word pattern and then runs whichever
    /// test the user selects from the keyboard until Q is pressed.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // Load your text here - I used War and Peace from Project Gutenberg
        // (http://www.gutenberg.org/ebooks/2600.txt.utf-8) and added it twice to enlarge the sample.
        // A null separator makes Split break on every white-space character, so words at
        // line ends are not glued to the first word of the next line.
        List<string> loaded = File.ReadAllText(@"D:\Temp\2600.txt")
            .Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
            .ToList();

        List<string> items = new List<string>(loaded.Count * 2);
        items.AddRange(loaded);
        items.AddRange(loaded);
        Console.WriteLine("Loaded {0} words", items.Count);

        List<string> WordsUnwanted = new List<string> { "Hell", "Heaven", "and", "or", "big", "the", "when", "ur", "cat" };

        string regularExpression = BuildPattern(WordsUnwanted);
        Console.WriteLine(regularExpression);

        while (true)
        {
            Console.WriteLine("Enter test type - 1, 2, 3, 4 or Q to quit");
            ConsoleKeyInfo testType = Console.ReadKey();

            // Distinct() is deliberately kept inside each timed delegate so that the
            // measurements cover the same work as before.
            switch (testType.Key)
            {
                case ConsoleKey.D1:
                    RunTest("Parallel (original) process", () => items
                        .Distinct()
                        .AsParallel()
                        .Where(x => !WordContains(x, WordsUnwanted))
                        .ToList());
                    break;

                case ConsoleKey.D2:
                    RunTest("Non-Parallel (original) process", () => items
                        .Distinct()
                        .Where(x => !WordContains(x, WordsUnwanted))
                        .ToList());
                    break;

                case ConsoleKey.D3:
                    RunTest("Non-Compiled regex (parallel) Process", () => items
                        .Distinct()
                        .AsParallel()
                        .Where(x => !Regex.IsMatch(x, regularExpression))
                        .ToList());
                    break;

                case ConsoleKey.D4:
                    RunTest("Non-Compiled regex (non-parallel) Process", () => items
                        .Distinct()
                        .Where(x => !Regex.IsMatch(x, regularExpression))
                        .ToList());
                    break;

                case ConsoleKey.Q:
                    return;

                default:
                    // Any other key: just show the prompt again.
                    break;
            }
        }
    }

    /// <summary>
    /// Builds a case-insensitive alternation pattern, <c>(?i)(word1|word2|...)</c>, from the
    /// unwanted words.
    /// </summary>
    /// <param name="unwanted">The literal words to search for.</param>
    /// <returns>A regular-expression pattern that matches any input containing one of the words.</returns>
    private static string BuildPattern(List<string> unwanted)
    {
        if (unwanted.Count == 0)
        {
            // An empty group would match every input; "(?!)" never matches, which is the
            // same answer WordContains gives for an empty list.
            return "(?!)";
        }

        // Escape each word so that characters such as '.', '+' or '(' are matched literally.
        return "(?i)(" + string.Join("|", unwanted.Select(w => Regex.Escape(w))) + ")";
    }

    /// <summary>
    /// Times one filtering run and prints the elapsed milliseconds and the number of words kept.
    /// </summary>
    /// <param name="label">Text printed at the start of the result line.</param>
    /// <param name="filter">Delegate that performs the filtering and returns the surviving words.</param>
    private static void RunTest(string label, Func<List<string>> filter)
    {
        Stopwatch sw = Stopwatch.StartNew();
        List<string> words = filter();
        sw.Stop();
        Console.WriteLine("{0} took {1}ms and found {2} matching words", label, sw.ElapsedMilliseconds, words.Count);
    }

    /// <summary>
    /// Determines whether <paramref name="word"/> contains any entry of
    /// <paramref name="words"/> as a substring, ignoring case.
    /// </summary>
    /// <param name="word">The text to inspect.</param>
    /// <param name="words">The substrings to look for.</param>
    /// <returns><c>true</c> if at least one entry occurs in <paramref name="word"/>; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="word"/> or <paramref name="words"/> is null.</exception>
    public static bool WordContains(string word, List<string> words)
    {
        if (word == null)
        {
            throw new ArgumentNullException("word");
        }

        if (words == null)
        {
            throw new ArgumentNullException("words");
        }

        foreach (string unwanted in words)
        {
            // Found that IndexOf was a bit faster than Contains and it also lets you choose the casing rules.
            if (word.IndexOf(unwanted, StringComparison.InvariantCultureIgnoreCase) >= 0)
            {
                return true;
            }
        }

        return false;
    }
}