我知道这个帖子已经很老了，但我上周尝试实现这个功能，过程相当痛苦。下面的方案远非完美，但这是我想出来的办法。
片段生成器:
/// <summary>
/// Builds a short excerpt of <paramref name="StringToSnip"/> out of windows of text
/// surrounding the occurrences of the given keywords, stitched together with " ... ".
/// </summary>
/// <param name="StringToSnip">Text to excerpt. Null or empty yields an empty result.</param>
/// <param name="Keywords">
/// Keywords to search for (current culture, case-insensitive). A null array yields an
/// empty result; null or empty entries are skipped.
/// </param>
/// <param name="SnippetLength">
/// Approximate width of one window; half of it is taken on each side of a keyword
/// location. Values below 2 leave no room for text and yield an empty result.
/// </param>
/// <returns>The stitched excerpt, or an empty string when no keyword is found.</returns>
public static string SelectKeywordSnippets(string StringToSnip, string[] Keywords, int SnippetLength)
{
    // Once the output is longer than this, no further windows are appended.
    const int MaxSnippedLength = 200;

    int halfLength = SnippetLength / 2;
    if (string.IsNullOrEmpty(StringToSnip) || Keywords == null || halfLength <= 0)
        return "";

    // Every window is clamped to the text anyway, and no two locations are further apart
    // than the text is long, so capping the half-width changes no result; it only keeps
    // "location + halfLength" from overflowing for a huge SnippetLength.
    halfLength = Math.Min(halfLength, StringToSnip.Length);

    // Get the locations of all keywords
    List<int> keywordLocations = new List<int>();
    foreach (string keyword in Keywords)
    {
        // A null keyword cannot be searched for and an empty one matches everywhere.
        if (string.IsNullOrEmpty(keyword))
            continue;
        keywordLocations.AddRange(SharedTools.IndexOfAll(StringToSnip, keyword, StringComparison.CurrentCultureIgnoreCase));
    }

    // Sort locations
    keywordLocations.Sort();

    // Collapse neighbours that are closer than half a snippet into their midpoint.
    // A merge can bring the midpoint close to another neighbour, so repeat until stable.
    bool found = keywordLocations.Count > 1;
    while (found)
    {
        found = false;
        for (int i = keywordLocations.Count - 1; i > 0; i--)
        {
            if (keywordLocations[i] - keywordLocations[i - 1] < halfLength)
            {
                keywordLocations[i - 1] = (keywordLocations[i] + keywordLocations[i - 1]) / 2;
                keywordLocations.RemoveAt(i);
                found = true;
            }
        }
    }

    // Make the snippets
    System.Text.StringBuilder snipped = new System.Text.StringBuilder();
    if (keywordLocations.Count > 0 && keywordLocations[0] - halfLength > 0)
        snipped.Append("... ");

    int index = 0;
    while (index < keywordLocations.Count)
    {
        int stringStart = Math.Max(0, keywordLocations[index] - halfLength);
        int stringEnd = Math.Min(keywordLocations[index] + halfLength, StringToSnip.Length);

        // Locations that survived the merge can still be less than a full window apart.
        // Fold every following window that touches or overlaps this one into a single
        // range so that no stretch of text is emitted twice.
        while (index + 1 < keywordLocations.Count && keywordLocations[index + 1] - halfLength <= stringEnd)
        {
            index++;
            stringEnd = Math.Min(keywordLocations[index] + halfLength, StringToSnip.Length);
        }

        snipped.Append(StringToSnip, stringStart, stringEnd - stringStart);
        if (stringEnd < StringToSnip.Length) snipped.Append(" ... ");
        if (snipped.Length > MaxSnippedLength) break;
        index++;
    }

    return snipped.ToString();
}
用于查找文本中某个关键字所有出现位置（索引）的函数：
/// <summary>
/// Finds the start index of every non-overlapping occurrence of <paramref name="needle"/>
/// in <paramref name="haystack"/>, scanning from left to right.
/// </summary>
/// <param name="haystack">Text to search. Null or empty yields an empty list.</param>
/// <param name="needle">Text to look for. Null or empty yields an empty list.</param>
/// <param name="Comparison">Comparison rules passed to <c>String.IndexOf</c>.</param>
/// <returns>The match positions in ascending order; empty when there is no match.</returns>
private static List<int> IndexOfAll(string haystack, string needle, StringComparison Comparison)
{
    List<int> positions = new List<int>();

    // An empty needle "matches" at every offset without advancing the scan, so the
    // search would never terminate; treat it (and null input) as having no matches.
    if (string.IsNullOrEmpty(haystack) || string.IsNullOrEmpty(needle))
        return positions;

    int offset = 0;

    // Under culture-sensitive comparisons a match may span fewer characters than
    // needle.Length, so the next offset can land past the end of the text; stopping
    // there avoids passing an out-of-range start index to IndexOf.
    while (offset < haystack.Length)
    {
        int pos = haystack.IndexOf(needle, offset, Comparison);
        if (pos < 0)
            break;

        positions.Add(pos);
        // Resume after the match so that occurrences do not overlap.
        offset = pos + needle.Length;
    }

    return positions;
}
它的实现有点笨拙。它的工作方式是先找出字符串中所有关键字出现的位置，然后把彼此距离小于片段长度一半的位置合并为它们的中点，以减少片段之间的重叠（这正是它不太可靠的地方……）。最后截取以各个关键字位置为中心、长度为所需片段长度的子字符串，并把它们拼接在一起。
我知道这个回答晚了好几年，但还是贴出来，希望能帮到遇到同样问题的人。