我有一段并行哈希的代码,插入代码如下:
int main(int argc, char** argv){
.....
Entry* table;//hash table
for(size_t i=0;i<N;i++){
keys[i]=i;
values[i] = rand();//random key-value pairs
}
int omp_p = omp_get_max_threads();
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t start = p*N/omp_p;
size_t end = (p+1)*N/omp_p;//each thread gets contiguous chunks of the arrays
for(size_t i=start;i<end;i++){
size_t key = keys[i];
size_t value = values[i];
if(insert(table,key,value) == 0){
printf("Failure!\n");
}
}
}
....
return 0;
}
int insert(Entry* table,size_t key, size_t value){
Entry entry = (((Entry)key) << 32)+value; //Coalesce key and value into an entry
/*Use cuckoo hashing*/
size_t location = hash_function_1(key);
for(size_t its=0;its<MAX_ITERATIONS;its++){
entry = __sync_lock_test_and_set(&table[location],entry);
key=get_key(entry);
if(key == KEY_EMPTY)
return1;
}
/*We have replaced a valid key, try to hash it using next available hash function*/
size_t location_1 = hash_function_1(key);
size_t location_2 = hash_function_2(key);
size_t location_3 = hash_function_3(key);
if(location == location_1) location = location_2;
else if(location == location_2) location = location_3;
else location = location_1;
}
return 0;
}
插入代码根本无法缩放。如果我使用单个线程,例如 10M 个键,我会在大约 170 毫秒内完成,而使用 16 个线程,我需要 > 500 毫秒。我怀疑这是因为缓存行(由 table[] 数组组成)在写操作期间在线程之间移动(__sync_lock_test_and_set(...))并且失效导致速度变慢例如如果我将插入代码修改为:
int insert(Entry* table,size_t key, size_t value){
Entry entry = (((Entry)key) << 32)+value; //Coalesce key and value into an entry
size_t location = hash_function_1(key);
table[location] = entry;
return 1;
}
我仍然得到同样糟糕的表现。由于这是散列,我无法控制特定元素散列到的位置。那么有什么建议吗?另外,如果这不是正确的原因,还有其他关于可能出现问题的指示吗?我试过从 1M 到 100M 键,但单线程性能总是更好。