我想用无序映射实现完美的散列。我有一组编译时已知的字符串,它映射到某些东西。我想为他们生成完美的哈希函数。我发现如果我选择 unordered_map 的大小比已知字符串集的大小大 3 倍,我就能找到一个完美的散列函数(即种子数)。我想尽量减少这个数字。和相关的问题是,如果我使用更大的无序地图,我会得到更快的地图,这是真的吗?
我玩过 Google 的 CityHash 函数: http ://code.google.com/p/cityhash/
#include <sstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <city.h>
unsigned seed = 0;
const unsigned numberOfTestData = 100;
const unsigned sizeOfPreallocatedMap = 3 * numberOfTestData; // what is the minimum value of this
const unsigned chanceToFindPerfectHashFnSeed = 10000; // in number of iterations
bool foundPerfectHashSeed = false;
int minCollisionCount = 999;
class CityHash {
public:
uint64 operator()(const std::string& s) const {
// return CityHash64(s.c_str(), s.size());
return CityHash64WithSeed(s.c_str(), s.size(), seed);
}
};
class StringEqual {
public:
bool operator()(const std::string& left, const std::string& right) const {
return left == right;
}
};
template<typename T>
void mapTester(T& map)
{
for (unsigned i = 0; i < numberOfTestData; ++i) {
std::stringstream ss;
ss << "TestData_" << i;
map[ss.str()] = i;
}
int collisionCount = 0;
unsigned maxBucketSize = 0;
for (size_t i = 0; i < map.bucket_count(); ++i) {
if (map.bucket_size(i) > 1) {
collisionCount++;
if (maxBucketSize <= map.bucket_size(i))
maxBucketSize = map.bucket_size(i);
}
}
if (collisionCount < minCollisionCount) {
minCollisionCount = collisionCount;
std::cout << maxBucketSize << " collision count is " << collisionCount << " with seed " << seed << std::endl;
}
if (maxBucketSize == 0 && collisionCount == 0)
foundPerfectHashSeed = true;
}
int main() {
std::unordered_map<std::string, int> map;
mapTester(map);
for (; seed < chanceToFindPerfectHashFnSeed; ++seed) {
if (foundPerfectHashSeed)
break;
std::unordered_map<std::string, int, CityHash, StringEqual> cityMap(sizeOfPreallocatedMap);
mapTester(cityMap);
}
std::cout << (foundPerfectHashSeed ? "Found!" : "Not found!") << std::endl;
return 0;
}