// Count how often each space-separated word occurs in the sample sentence,
// keeping at most the 1000 most frequent words.
string doc = "This is a test sentence with some words with some words repeating like: is a test";
string[] words = doc.Split(' ');
var result = (from word in words
              group word by word into occurrences
              orderby occurrences.Count() descending
              select occurrences)
    .Take(1000)
    .ToDictionary(occurrences => occurrences.Key, occurrences => occurrences.Count());
编辑:
我相信您是想从字符串数组得到一个最终的字典:以单词作为键,以该单词的总出现次数作为值。由于字典不能包含重复的键,因此您不需要使用 Distinct。
您必须将方法重写为:
/// <summary>
/// Builds a single word-count dictionary for a set of documents by running
/// ProcessDocument on each one and adding up the counts per word.
/// </summary>
/// <param name="docs">Documents to process, one string per document.</param>
/// <returns>Each word mapped to the sum of its per-document counts.</returns>
private Dictionary<string,int> GenerateTerms(string[] docs)
{
    Dictionary<string, int> totals = new Dictionary<string, int>();
    foreach (string document in docs)
    {
        foreach (KeyValuePair<string, int> term in ProcessDocument(document))
        {
            // A missing key leaves runningCount at 0, so the first sighting
            // of a word simply stores its count.
            int runningCount;
            totals.TryGetValue(term.Key, out runningCount);
            // checked: overflow raises OverflowException instead of wrapping.
            totals[term.Key] = checked(runningCount + term.Value);
        }
    }
    return totals;
}
/// <summary>
/// Counts how often each space-separated word occurs in a document.
/// </summary>
/// <param name="doc">Text to tokenize on the space character.</param>
/// <returns>
/// Up to 1000 of the most frequent words mapped to their occurrence counts.
/// An empty or all-spaces document yields an empty dictionary.
/// </returns>
private Dictionary<string,int> ProcessDocument(string doc)
{
    // Upper bound on the number of distinct words kept per document.
    const int MaxTerms = 1000;

    // RemoveEmptyEntries drops the empty tokens produced by leading, trailing or
    // consecutive spaces, which would otherwise be counted as a word "".
    return doc.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries)
        .GroupBy(word => word)
        // Count each group once and reuse the number for ordering and output.
        .Select(g => new { Word = g.Key, Count = g.Count() })
        .OrderByDescending(entry => entry.Count)
        .Take(MaxTerms)
        .ToDictionary(entry => entry.Word, entry => entry.Count);
}
然后你可以这样调用它:
// Sample input: two copies of a sentence with repeated words and two copies
// of a shorter sentence.
string repeatingSentence = "This is a test sentence with some words with some words repeating like: is a test";
string plainSentence = "This is a test sentence with some words";
string[] docs = { repeatingSentence, repeatingSentence, plainSentence, plainSentence };

// Merge the per-document word counts into one dictionary.
Dictionary<string, int> finalDictionary = GenerateTerms(docs);