0

谁能帮帮我?我有以下代码:

/// <summary>
/// Collects the distinct terms found across all documents, considering at most
/// the 20,000 most frequent space-separated tokens of each document.
/// </summary>
/// <param name="docs">Documents to tokenize; each one is split on single spaces.</param>
/// <returns>The distinct terms, in the order in which they were first selected.</returns>
private List<string> GenerateTerms(string[] docs)
{
    List<string> uniques = new List<string>();

    // Tracks what is already in 'uniques' so the membership test is a hash
    // lookup rather than a linear List.Contains scan on every term.
    HashSet<string> seen = new HashSet<string>();

    foreach (string doc in docs)
    {
        // Group the tokens directly; no intermediate list copies are needed.
        var topTerms = doc.Split(' ')
            .GroupBy(word => word)
            .OrderByDescending(g => g.Count())
            .Select(g => g.Key)
            .Take(20000);

        foreach (string term in topTerms)
        {
            // HashSet.Add returns false when the term was already present.
            if (seen.Add(term))
            {
                uniques.Add(term);
            }
        }
    }

    return uniques;
}

这段代码的作用是从多个文档中按词频从高到低生成术语列表。我用字典(Dictionary)实现了同样的过程,两种实现都耗时 440 毫秒。但令人惊讶的是,当我改用 ArrayList 实现时(如下代码所示):

/// <summary>
/// Collects the distinct terms found across all documents, considering at most
/// the 20,000 most frequent space-separated tokens of each document.
/// </summary>
/// <param name="docs">Documents to tokenize; each one is split on single spaces.</param>
/// <returns>An ArrayList of distinct term strings, in the order in which they were first selected.</returns>
private ArrayList GenerateTerms(string[] docs)
{
    Dictionary<string, int> counts = new Dictionary<string, int>();
    ArrayList uniques = new ArrayList();

    // Tracks what is already in 'uniques' so the membership test is a hash
    // lookup rather than a linear ArrayList.Contains scan on every term.
    HashSet<string> seen = new HashSet<string>();

    foreach (string doc in docs)
    {
        // The counts are per document, so start from an empty table each time.
        counts.Clear();

        foreach (string token in doc.Split(' '))
        {
            // TryGetValue leaves 'count' at 0 for an unseen token, so a single
            // lookup plus a single store covers both the new and existing case.
            int count;
            counts.TryGetValue(token, out count);
            counts[token] = count + 1;
        }

        // Enumerate the sorted sequence directly. Copying it into another
        // Dictionary would discard the ordering guarantee, because Dictionary
        // enumeration order is unspecified.
        var topTerms = counts
            .OrderByDescending(entry => entry.Value)
            .Take(20000)
            .Select(entry => entry.Key);

        foreach (string term in topTerms)
        {
            if (seen.Add(term))
            {
                uniques.Add(term);
            }
        }
    }

    return uniques;
}

它只花了 350 毫秒。ArrayList 不应该比 List 和 Dictionary 慢吗?请帮我解开这个疑惑。

4

2 回答 2

5

你的代码做了很多不必要的工作并且使用了低效的数据结构。

试试这个:

/// <summary>
/// Returns the distinct terms across all documents, drawing at most the
/// 20,000 most frequent space-separated tokens from each document.
/// </summary>
/// <param name="docs">Documents to tokenize; each one is split on single spaces.</param>
/// <returns>A list of the distinct selected terms.</returns>
private List<string> GenerateTerms(string[] docs)
{
    // Flatten the per-document top terms into one sequence.
    IEnumerable<string> selectedTerms =
        from doc in docs
        from term in (from word in doc.Split(' ')
                      group word by word into occurrences
                      orderby occurrences.Count() descending
                      select occurrences.Key).Take(20000)
        select term;

    return selectedTerms.Distinct().ToList();
}

重构版本使其更易于阅读:

/// <summary>
/// Returns the distinct terms produced by running ProcessDocument over every document.
/// </summary>
/// <param name="docs">Documents to process.</param>
/// <returns>A list of the distinct terms.</returns>
private List<string> GenerateTerms(string[] docs)
{
    // Flatten the per-document results into one sequence before de-duplicating.
    IEnumerable<string> allTerms =
        from doc in docs
        from term in ProcessDocument(doc)
        select term;

    return allTerms.Distinct().ToList();
}

/// <summary>
/// Returns the distinct space-separated tokens of a document, most frequent
/// first, limited to the 20,000 most frequent ones.
/// </summary>
/// <param name="doc">Document text; split on single spaces.</param>
/// <returns>A lazily evaluated sequence of at most 20,000 terms.</returns>
private IEnumerable<string> ProcessDocument(string doc)
{
    return doc.Split(' ')
              .GroupBy(word => word)
              .OrderByDescending(g => g.Count())
              .Select(g => g.Key)
              .Take(20000); // per-document cap of 20,000 terms (was 10,000)
}
于 2012-05-05T17:45:26.900 回答
1

我喜欢马克的解决方案。但是,我认为如果您正确利用字典,您可以挤出更多性能。果然,这速度快了很多……

/// <summary>
/// Counts how often each space-separated term occurs across all documents and
/// returns the terms ordered from most to least frequent.
/// </summary>
/// <remarks>
/// Each document may introduce at most 20,000 terms that were not seen before.
/// The cap applies in order of appearance within the document, not by
/// frequency. Terms already known keep being counted after the cap is reached.
/// </remarks>
/// <param name="docs">Documents to tokenize; each one is split on single spaces.</param>
/// <returns>The counted terms, sorted by descending occurrence count.</returns>
private static List<string> GenerateTerms(string[] docs)
{
    const int MaxNewTermsPerDocument = 20000;

    var termsDictionary = new Dictionary<string, int>();

    foreach (var doc in docs)
    {
        int newTermsInDocument = 0;

        foreach (string term in doc.Split(' '))
        {
            // A single TryGetValue replaces the ContainsKey + indexer double lookup.
            int count;
            if (termsDictionary.TryGetValue(term, out count))
            {
                termsDictionary[term] = count + 1;
            }
            else if (newTermsInDocument < MaxNewTermsPerDocument)
            {
                newTermsInDocument++;
                termsDictionary[term] = 1;
            }
            // Otherwise this document has reached its cap: skip the new term,
            // but keep going so later documents are still processed.
        }
    }

    return termsDictionary
        .OrderByDescending(entry => entry.Value)
        .Select(entry => entry.Key)
        .ToList();
}

简单解释一下:termsDictionary 是一个字典,键是术语,值是该术语出现的次数。最后的 LINQ 查询按出现次数降序返回这些术语。

更新

我添加了代码以将每个文档的唯一术语数限制为 20,000 个。

以下是基准测试结果...

  • 322 毫秒(原始)
  • 284 毫秒(马克拜尔斯解决方案)
  • 113 毫秒(如上利用字典)

下面是我用来生成docs数组并运行测试的代码......

/// <summary>
/// Benchmark driver: builds 50,000 identical documents and prints the elapsed
/// milliseconds of each of the three GenerateTerms implementations.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    string[] docs = new string[50000];

    for (int i = 0; i < docs.Length; i++)
    {
        docs[i] = "a man a plan a canal panama";
    }

    // Warm up every implementation (don't time this), so that first-call
    // costs such as JIT compilation are not charged to any one measurement.
    GenerateTermsOriginal(docs);
    GenerateTermsLinq(docs);
    GenerateTermsDictionary(docs);

    Stopwatch sw = new Stopwatch();

    sw.Restart();
    var t1 = GenerateTermsOriginal(docs);
    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds + " ms");

    sw.Restart();
    var t2 = GenerateTermsLinq(docs);
    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds + " ms");

    sw.Restart();
    var t3 = GenerateTermsDictionary(docs);
    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds + " ms");
}
于 2012-05-05T18:19:43.140 回答