c++ - 像 C# HashSet 这样的快速 C++ 容器和字典?

Question

我在 C# 中经常使用 HashSet 和 Dictionary，发现它们非常快......

我试过使用 std::map 和 std::hash_map 并且发现它们比较慢。这听起来像预期的行为吗？我在使用 std::hash_map 时可能做错了什么？

或者，那里有更好的 C++ Hash 容器吗？

我正在散列 int32，通常大约 100,000 个。

更新：我用 C# 和 C++ 创建了一个 repro。它运行了两次试验，它们在 C# 中需要 19 毫秒和 13 毫秒，在 C++ 中大约需要 11,000 毫秒。我的 C++ 代码一定有什么问题 :)

（两者都作为发布版本运行，都是控制台应用程序）

C# 输出：

Found 511 values in the intersection, in 19 ms
Found 508 values in the intersection, in 13 ms

C++ 输出：

Found 308 values in the intersection, in 11764.7ms
Found 316 values in the intersection, in 11742.8ms

C++ 输出（使用 stdext::hash_map 而不是 std::map）

Found 300 values in the intersection, in 383.552ms
Found 306 values in the intersection, in 2277.02ms

C++ 输出（使用 stdext::hash_map，x64 版本）

Found 292 values in the intersection, in 1037.67ms
Found 302 values in the intersection, in 3663.71ms

笔记：

Set2 并没有像我在 C++ 中想要的那样被填充：我希望它与 Set1 有 50% 的交集（就像在 C# 中一样），但出于某种原因，我不得不将随机数乘以 10，才能让它们哪怕只是部分不相交。

C＃：

    // Entry point: runs the intersection benchmark twice and prints, for each
    // run, the size of the intersection and the elapsed time in milliseconds,
    // then waits for a key press before exiting.
    static void Main(string[] args)
    {
        for (int trial = 0; trial < 2; trial++)
        {
            // DateTime.Now.Millisecond is only the 0-999 millisecond component
            // of the current time, so subtracting two readings is not an
            // elapsed duration (it wraps every second and can even be
            // negative). Stopwatch measures the interval itself.
            System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
            int intersectionSize = runIntersectionTest();
            stopwatch.Stop();
            long duration = stopwatch.ElapsedMilliseconds;

            Console.WriteLine(String.Format("Found {0} values in the intersection, in {1} ms", intersectionSize, duration));
        }

        // Keep the console window open until the user presses a key.
        Console.ReadKey();
    }

    // Builds a dictionary keyed by 100,000 consecutive integers, probes it
    // with 1,000 pseudo-random integers, and returns how many probes were
    // present. Every key that is hit has its stored value changed from 1 to 2.
    static int runIntersectionTest()
    {
        const int baseValue = 1000000000;

        // Seeded from the millisecond component of the current time.
        Random random = new Random(DateTime.Now.Millisecond);

        // set1: baseValue, baseValue + 1, ..., baseValue + 99999
        List<int> set1 = new List<int>();
        for ( int n = 0; n < 100000; n++ )
            set1.Add(baseValue + n);

        // set2: 1,000 values drawn from baseValue + [1, 200000]
        List<int> set2 = new List<int>();
        for ( int n = 0; n < 1000; n++ )
            set2.Add(baseValue + (random.Next() % 200000 + 1));

        // Load every set1 value into the dictionary.
        Dictionary<int,int> theMap = new Dictionary<int,int>();
        foreach ( int key in set1 )
            theMap[key] = 1;

        // Count the set2 values that are present, marking each hit with 2.
        int hits = 0;
        foreach ( int probe in set2 )
        {
            if ( !theMap.ContainsKey(probe) )
                continue;

            hits++;
            theMap[probe] = 2;
        }

        return hits;
    }

C++：

// Builds a std::map keyed by 100,000 consecutive integers, probes it with
// 1,000 pseudo-random integers drawn via rand(), and returns how many probes
// were present. Every key that is hit has its mapped value changed from 1 to 2.
//
// The caller is expected to seed rand() (via srand) beforehand.
int runIntersectionTest()
{
    const int kSet1Size = 100000;
    const int kSet2Size = 1000;
    const int kBase = 1000000000;

    std::vector<int> set1;
    std::vector<int> set2;

    // Sizes are known up front; reserving avoids repeated reallocation.
    set1.reserve(kSet1Size);
    set2.reserve(kSet2Size);

    // Create 100,000 values for set1: kBase, kBase + 1, ..., kBase + 99999
    for ( int i = 0; i < kSet1Size; ++i )
    {
        set1.push_back(kBase + i);
    }

    // Create 1,000 values for set2
    for ( int i = 0; i < kSet2Size; ++i )
    {
        // NOTE(review): rand() never exceeds RAND_MAX; where RAND_MAX is
        // smaller than 200000 the modulo has no effect and every raw offset
        // would land inside set1's range. The x10 scaling presumably exists
        // to push part of the probes outside that range -- confirm.
        int random = rand() % 200000 + 1;
        random *= 10;

        set2.push_back(kBase + random);
    }

    // Now intersect the two sets by populating the map
    std::map<int,int> theMap;
    for ( std::vector<int>::const_iterator it = set1.begin(); it != set1.end(); ++it )
    {
        theMap[*it] = 1;
    }

    int intersectionSize = 0;

    for ( std::vector<int>::const_iterator it = set2.begin(); it != set2.end(); ++it )
    {
        std::map<int,int>::iterator foundValue = theMap.find(*it);

        if ( foundValue != theMap.end() )
        {
            // Write through the iterator we already have instead of paying
            // for a second lookup with operator[].
            foundValue->second = 2;

            ++intersectionSize;
        }
    }

    return intersectionSize;
}

// Entry point: runs the intersection benchmark twice and prints, for each
// run, the intersection size and the value reported by the timer, then waits
// for a character on stdin before exiting.
//
// NOTE(review): the Timer class is not defined in this snippet. The code
// below presumably relies on it starting to measure on construction and
// again on Reset() -- confirm against the actual Timer implementation.
int _tmain(int argc, _TCHAR* argv[])
{
    // Seed rand() from the current time so each process run draws different
    // probe values.
    srand ( time(NULL) );

    // First trial.
    Timer timer;
    int intersectionSize = runIntersectionTest();
    timer.Stop();

    cout << "Found " << intersectionSize << " values in the intersection, in " << timer.GetMilliseconds() << "ms" << endl;

    // Second trial, reusing the same timer object.
    timer.Reset();
    intersectionSize = runIntersectionTest();
    timer.Stop();

    cout << "Found " << intersectionSize << " values in the intersection, in " << timer.GetMilliseconds() << "ms" << endl;

    // Block until input arrives so the console output stays visible.
    getchar();

    return 0;
}

score 5 · Accepted Answer

hash_map 和 hash_set 是非标准的，unordered_map 和 unordered_set 很可能很快会成为标准版本。如果没有可复现的示例（repro），我认为这个问题很难有进展。在底层，它们是相同的数据结构，因此它们应该具有相似的性能。

我在 MS Visual Studio 2008 v9.0.30729.1 下将提供的示例编译为 Visual C++ -> Win32 -> 控制台应用程序（不过我自己写了一个 Timer 类，因为我不确定您使用的是哪一个）。在 Debug 配置下，我得到的耗时是 1000 毫秒，而在 Release 配置下编译后是 50 毫秒。

#include <vector>
#include <iostream>
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <windows.h>

typedef struct {
    LARGE_INTEGER start;
    LARGE_INTEGER stop;
} stopWatch;

class CStopWatch {

private:
    stopWatch timer;
    LARGE_INTEGER frequency;
    double LIToSecs( LARGE_INTEGER & L);
public:
    CStopWatch();
    void startTimer( );
    void stopTimer( );
    double getElapsedTime();
};

double CStopWatch::LIToSecs( LARGE_INTEGER & L) {
    return ((double)L.QuadPart /(double)frequency.QuadPart) ;
}

CStopWatch::CStopWatch(){
    timer.start.QuadPart=0;
    timer.stop.QuadPart=0;
    QueryPerformanceFrequency( &frequency ) ;
}

void CStopWatch::startTimer( ) {
    QueryPerformanceCounter(&timer.start) ;
}

void CStopWatch::stopTimer( ) {
    QueryPerformanceCounter(&timer.stop) ;
}

double CStopWatch::getElapsedTime() {
    LARGE_INTEGER time;
    time.QuadPart = timer.stop.QuadPart - timer.start.QuadPart;
    return LIToSecs( time) ;
}

using namespace std;
// Builds a std::map keyed by 100,000 consecutive integers, probes it with
// 1,000 pseudo-random integers drawn via rand(), and returns how many probes
// were present. Every key that is hit has its mapped value changed from 1 to 2.
//
// The caller is expected to seed rand() (via srand) beforehand.
int runIntersectionTest()
{
    const int kSet1Size = 100000;
    const int kSet2Size = 1000;
    const int kBase = 1000000000;

    std::vector<int> set1;
    std::vector<int> set2;

    // Sizes are known up front; reserving avoids repeated reallocation.
    set1.reserve(kSet1Size);
    set2.reserve(kSet2Size);

    // Create 100,000 values for set1: kBase, kBase + 1, ..., kBase + 99999
    for ( int i = 0; i < kSet1Size; ++i )
    {
        set1.push_back(kBase + i);
    }

    // Create 1,000 values for set2
    for ( int i = 0; i < kSet2Size; ++i )
    {
        // NOTE(review): rand() never exceeds RAND_MAX; where RAND_MAX is
        // smaller than 200000 the modulo has no effect and every raw offset
        // would land inside set1's range. The x10 scaling presumably exists
        // to push part of the probes outside that range -- confirm.
        int random = rand() % 200000 + 1;
        random *= 10;

        set2.push_back(kBase + random);
    }

    // Now intersect the two sets by populating the map
    std::map<int,int> theMap;
    for ( std::vector<int>::const_iterator it = set1.begin(); it != set1.end(); ++it )
    {
        theMap[*it] = 1;
    }

    int intersectionSize = 0;

    for ( std::vector<int>::const_iterator it = set2.begin(); it != set2.end(); ++it )
    {
        std::map<int,int>::iterator foundValue = theMap.find(*it);

        if ( foundValue != theMap.end() )
        {
            // Write through the iterator we already have instead of paying
            // for a second lookup with operator[].
            foundValue->second = 2;

            ++intersectionSize;
        }
    }

    return intersectionSize;
}

// Entry point: seeds rand(), runs the intersection benchmark twice with a
// fresh stopwatch per run, prints each result, then waits for a character on
// stdin before exiting.
int main(int argc, char* argv[])
{
    srand ( time(NULL) );

    // Two independent trials; the stopwatch is recreated for each one.
    for ( int remaining = 2; remaining > 0; --remaining )
    {
        CStopWatch timer;

        timer.startTimer();
        int intersectionSize = runIntersectionTest();
        timer.stopTimer();

        cout << "Found " << intersectionSize << " values in the intersection, in " << timer.getElapsedTime() << "s\r\n";
    }

    // Block until input arrives so the console output stays visible.
    getchar();

    return 0;
}

（我会尝试使用 unordered_map 但我的版本没有）。我怀疑您的 C++ 设置存在一些问题。

score 1 · Accepted Answer

我们设法弄清楚了这一点，请参阅：

为什么当我连接了调试器/IDE 时我的 STL 代码运行如此缓慢？

当您附加调试器时，会发生使用不同 (DEBUG) 内存堆的情况——您可以根据需要将其关闭。

score 0 · Accepted Answer

您在 C++ 代码中使用 std::map ，它的插入和查找时间为 O(log(n))。尝试使用 hash_map 进行测试以获得更好的比较。

score 0 · Accepted Answer

这听起来并不出人意料，但在我们真正提供帮助之前，您需要收集更多详细信息。你在使用谁的 hash_map 实现？您是否将分析器指向它，如果是，它告诉您什么？

一般来说，如果哈希表实现没有明显的原因表现不佳，通常是因为表使用的哈希函数恰好对您的特定输入执行不佳。这可能是你的问题——C++ hash_map 碰巧使用了一个哈希函数，将你的键映射到一小部分桶，而 C# HashSet 没有——或者它可能是完全不同的东西。

std::map 通常实现为树，因此具有不同的性能特征。同样，实现的细节和输入数据很重要。

score 0 · Accepted Answer

0

我从未使用过它，但 Google Sparsehash 可能很适合

于 2009-06-29T05:17:14.943 回答

score 0 · Accepted Answer

你真正比较的是

C# 哈希集是 O(1)，意味着几乎恒定且独立于输入大小，

与 C++ vector……其耗时为（输入的大小）乘以一个常数……

这没有什么实际意义。

您应该尝试在 C++ 中使用等效的哈希容器（我认为是在 2007 年的 TR1 之后）：std::tr1::unordered_set<...>（以及 std::tr1::unordered_map<...>）

TR1 上的维基百科链接

另请注意，根据此页面Visual Studio 有自己的次优 stl tr1 实现。（没有亲身经历，在这里找到）

c++ - 像 C# HashSet 这样的快速 C++ 容器和字典?

6 回答 6

Related

Reference