根据您的描述(120 个文件,70K-80K 字,每个文件 1-2 MB),似乎最好的方法是读取文件一次并建立一个可以搜索的索引。我在下面提供了一个示例来说明如何完成这样的事情,但是如果您需要比查找精确词或前缀词更复杂的搜索词匹配,那么这对您的用处可能有限。
如果您需要更复杂的文本搜索匹配(同时获得良好的性能),我建议您查看专门为此目的构建的优秀 Lucene 库。
/// <summary>
/// Immutable triple pinpointing one appearance of a word:
/// the file it was found in, the line inside that file, and the word's position on that line.
/// </summary>
public struct WordLocation
{
    /// <summary>File containing the word.</summary>
    public readonly string FileName;

    /// <summary>Line within the file.</summary>
    public readonly int LineNumber;

    /// <summary>Index of the word within the line.</summary>
    public readonly int WordIndex;

    /// <summary>Creates a location from its three coordinates.</summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    public WordLocation(string fileName, int lineNumber, int wordIndex)
    {
        this.FileName = fileName;
        this.LineNumber = lineNumber;
        this.WordIndex = wordIndex;
    }
}
/// <summary>
/// Immutable snapshot of every location at which a single word has been seen.
/// Instances are obtained from <see cref="None"/>, <see cref="FirstOccurrence"/>
/// or <see cref="AddOccurrence"/>; the constructor is private.
/// </summary>
public struct WordOccurrences
{
    private WordOccurrences(int nOccurrences, WordLocation[] locations)
    {
        NumberOfOccurrences = nOccurrences;
        Locations = locations;
    }

    /// <summary>Value representing "word not found": zero occurrences and an empty location array.</summary>
    public static readonly WordOccurrences None = new WordOccurrences(0, new WordLocation[0]);

    /// <summary>Creates a value holding exactly one location.</summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    /// <returns>A value with <see cref="NumberOfOccurrences"/> equal to 1.</returns>
    public static WordOccurrences FirstOccurrence(string fileName, int lineNumber, int wordIndex)
    {
        return new WordOccurrences(1, new[] { new WordLocation(fileName, lineNumber, wordIndex) });
    }

    /// <summary>
    /// Returns a new value containing all current locations followed by the given one.
    /// This instance is left unchanged.
    /// </summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    /// <returns>A value whose count is one higher than this instance's count.</returns>
    public WordOccurrences AddOccurrence(string fileName, int lineNumber, int wordIndex)
    {
        // default(WordOccurrences) bypasses the constructor and leaves Locations null;
        // treat that the same as an empty set instead of throwing.
        var existing = Locations ?? new WordLocation[0];

        // Copies the whole array on every call, so each append costs O(current count).
        return new WordOccurrences(
            NumberOfOccurrences + 1,
            existing
                .Concat(new[] { new WordLocation(fileName, lineNumber, wordIndex) })
                .ToArray());
    }

    /// <summary>Number of recorded occurrences.</summary>
    public readonly int NumberOfOccurrences;

    /// <summary>Recorded locations, in the order they were added. Null only on a default-initialized value.</summary>
    public readonly WordLocation[] Locations;
}
/// <summary>
/// Contract for an object that is fed individual word occurrences and then
/// produces an <see cref="IWordIndex"/> from them.
/// </summary>
public interface IWordIndexBuilder
{
/// <summary>Reports one occurrence of <paramref name="word"/> to the builder.</summary>
/// <param name="word">The word that was seen.</param>
/// <param name="fileName">File in which the word was seen.</param>
/// <param name="lineNumber">Line within the file.</param>
/// <param name="wordIndex">Position of the word within the line.</param>
void AddWordOccurrence(string word, string fileName, int lineNumber, int wordIndex);
/// <summary>Produces the index from the occurrences reported so far.</summary>
/// <returns>The resulting <see cref="IWordIndex"/>.</returns>
IWordIndex Build();
}
/// <summary>Contract for a searchable word index.</summary>
public interface IWordIndex
{
/// <summary>Looks up <paramref name="word"/> in the index.</summary>
/// <param name="word">The word to search for.</param>
/// <returns>The occurrences recorded for the word, as a <see cref="WordOccurrences"/> value.</returns>
WordOccurrences Find(string word);
}
/// <summary>Extension helpers that populate an <see cref="IWordIndexBuilder"/> from text files.</summary>
public static class BuilderExtensions
{
    /// <summary>
    /// Reads each file line by line, splits every line on a fixed set of separator characters,
    /// reports each trimmed word to <paramref name="builder"/>, and finally builds the index.
    /// Line numbers and word positions passed to the builder both start at 1.
    /// </summary>
    /// <param name="builder">Builder that receives the word occurrences.</param>
    /// <param name="wordFiles">Files to read; each is opened as text and disposed after reading.</param>
    /// <returns>The index returned by <c>builder.Build()</c>.</returns>
    public static IWordIndex BuildIndexFromFiles(this IWordIndexBuilder builder, IEnumerable<FileInfo> wordFiles)
    {
        char[] delimiters = { ',', ' ', '\t', ';' /* etc */ };

        foreach (FileInfo source in wordFiles)
        {
            using (StreamReader input = source.OpenText())
            {
                // The line counter restarts at 1 for every file.
                for (int lineNumber = 1; !input.EndOfStream; lineNumber++)
                {
                    string[] tokens = input
                        .ReadLine()
                        .Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

                    // Word positions are reported 1-based, hence position + 1.
                    for (int position = 0; position < tokens.Length; position++)
                    {
                        builder.AddWordOccurrence(tokens[position].Trim(), source.FullName, lineNumber, position + 1);
                    }
                }
            }
        }

        return builder.Build();
    }
}
然后最简单的索引实现(只能进行精确匹配查找)在内部使用字典:
/// <summary>
/// <see cref="IWordIndexBuilder"/> that accumulates occurrences in a dictionary keyed by word
/// and produces an index supporting exact-match lookup only.
/// A builder instance can be built once; after <see cref="Build"/> it no longer accepts words.
/// </summary>
public class DictionaryIndexBuilder : IWordIndexBuilder
{
    // Set to null by Build() once ownership of the dictionary passes to the index.
    private Dictionary<string, WordOccurrences> _dict;

    /// <summary>Index backed by the dictionary handed over by the builder.</summary>
    private class DictionaryIndex : IWordIndex
    {
        private readonly Dictionary<string, WordOccurrences> _dict;

        public DictionaryIndex(Dictionary<string, WordOccurrences> dict)
        {
            _dict = dict;
        }

        /// <summary>
        /// Returns the occurrences stored for <paramref name="word"/>,
        /// or <see cref="WordOccurrences.None"/> when the word is not a key of the dictionary.
        /// </summary>
        public WordOccurrences Find(string word)
        {
            WordOccurrences found;
            if (_dict.TryGetValue(word, out found))
            {
                return found;
            }

            return WordOccurrences.None;
        }
    }

    /// <summary>Creates a builder whose dictionary compares words with <paramref name="comparer"/>.</summary>
    /// <param name="comparer">Equality comparer used for the word keys.</param>
    public DictionaryIndexBuilder(IEqualityComparer<string> comparer)
    {
        _dict = new Dictionary<string, WordOccurrences>(comparer);
    }

    /// <summary>
    /// Records one occurrence of <paramref name="word"/>, creating its entry on first sight
    /// and appending to the existing entry otherwise.
    /// </summary>
    /// <param name="word">The word that was seen.</param>
    /// <param name="fileName">File in which the word was seen.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Position of the word within the line.</param>
    /// <exception cref="System.InvalidOperationException">The index has already been built.</exception>
    public void AddWordOccurrence(string word, string fileName, int lineNumber, int wordIndex)
    {
        EnsureNotBuilt();

        WordOccurrences current;
        if (!_dict.TryGetValue(word, out current))
        {
            _dict[word] = WordOccurrences.FirstOccurrence(fileName, lineNumber, wordIndex);
        }
        else
        {
            _dict[word] = current.AddOccurrence(fileName, lineNumber, wordIndex);
        }
    }

    /// <summary>
    /// Hands the accumulated dictionary to a new index and detaches it from this builder,
    /// so later builder calls cannot mutate the index.
    /// </summary>
    /// <returns>An exact-match index over the words added so far.</returns>
    /// <exception cref="System.InvalidOperationException">The index has already been built.</exception>
    public IWordIndex Build()
    {
        EnsureNotBuilt();

        var dict = _dict;
        _dict = null;
        return new DictionaryIndex(dict);
    }

    // Fails with a descriptive error instead of a NullReferenceException when the builder is reused.
    private void EnsureNotBuilt()
    {
        if (_dict == null)
        {
            throw new System.InvalidOperationException(
                "Build() has already been called; this builder can no longer be used.");
        }
    }
}
用法:
// Words are compared with the default string equality comparer.
var builder = new DictionaryIndexBuilder(EqualityComparer<string>.Default);
// myListOfFiles is not defined in this snippet; BuildIndexFromFiles takes an IEnumerable<FileInfo>.
var index = builder.BuildIndexFromFiles(myListOfFiles);
// Exact-match lookup; yields WordOccurrences.None when the word is not in the index.
var matchSocks = index.Find("Socks");
如果您还想进行前缀查找,请实现一个使用排序字典的索引构建器/索引类(并更改 IWordIndex.Find 方法以返回多个匹配项,或向接口添加一个新方法以查找部分/模式匹配项)。
如果您想做更复杂的查找,请选择 Lucene。