0

我有一段并行哈希的代码,插入代码如下:

/* Driver: fills keys/values with sequential keys and random values, then
 * inserts them into the shared hash table in parallel, one contiguous
 * chunk of the input arrays per OpenMP thread. */
int main(int argc, char** argv){
     .....
    /* NOTE(review): 'table' is never allocated in the visible code —
     * presumably the elided "....." section does it; confirm, otherwise
     * insert() writes through an uninitialized pointer (UB). */
    Entry* table;//hash table
    for(size_t i=0;i<N;i++){
       keys[i]=i;
       values[i] = rand();//random key-value pairs
    }
    int omp_p = omp_get_max_threads();
    /* Manual chunking: thread p handles indices [p*N/omp_p, (p+1)*N/omp_p). */
    #pragma omp parallel for
    for(int p=0;p<omp_p;p++){
       size_t start = p*N/omp_p;
       size_t end = (p+1)*N/omp_p;//each thread gets contiguous chunks of the arrays
       for(size_t i=start;i<end;i++){
          size_t key = keys[i];
          size_t value = values[i];
          /* Hash destinations are effectively random, so concurrent writes
           * scatter across table[] and its cache lines bounce between cores. */
          if(insert(table,key,value) == 0){
             printf("Failure!\n");
          }
       }
    }
    ....
    return 0;
}

/*
 * Insert a (key, value) pair into the hash table using cuckoo hashing.
 * The pair is packed into one 64-bit Entry (key in the high 32 bits,
 * value in the low 32 bits) so it can be swapped into a slot atomically
 * with __sync_lock_test_and_set.
 *
 * Returns 1 on success, 0 if no empty slot was found after
 * MAX_ITERATIONS evictions.
 */
int insert(Entry* table, size_t key, size_t value){
   Entry entry = (((Entry)key) << 32) + value; /* Coalesce key and value into an entry */
   /* Use cuckoo hashing */
   size_t location = hash_function_1(key);

   for(size_t its = 0; its < MAX_ITERATIONS; its++){
       /* Atomically swap our entry into the slot; whatever previously
        * occupied the slot (possibly KEY_EMPTY) comes back to us. */
       entry = __sync_lock_test_and_set(&table[location], entry);
       key = get_key(entry);
       if(key == KEY_EMPTY)
          return 1;                 /* fixed: original had "return1;" */

       /* We displaced a valid entry: try its next candidate location.
        * Fixed: this block must be INSIDE the loop — the original's brace
        * on the line after the if closed the for loop early, leaving the
        * rehash logic unreachable and the braces unbalanced. */
       size_t location_1 = hash_function_1(key);
       size_t location_2 = hash_function_2(key);
       size_t location_3 = hash_function_3(key);
       if(location == location_1)      location = location_2;
       else if(location == location_2) location = location_3;
       else                            location = location_1;
   }
   return 0; /* gave up after MAX_ITERATIONS evictions */
}

插入代码完全无法随线程数扩展。如果我使用单个线程处理例如 10M 个键,大约 170 毫秒即可完成;而使用 16 个线程则需要 > 500 毫秒。我怀疑这是因为构成 table[] 数组的缓存行在写操作(__sync_lock_test_and_set(...))期间在线程之间来回迁移并不断失效,从而导致变慢。例如,如果我将插入代码修改为:

/*
 * Stripped-down insert used to isolate the memory-traffic cost: no
 * probing, no atomics — just one hashed store per pair.
 * Always reports success.
 */
int insert(Entry* table, size_t key, size_t value){
   /* Pack key (high 32 bits) and value (low 32 bits) into one entry. */
   const Entry packed = (((Entry)key) << 32) + value;
   const size_t slot  = hash_function_1(key);

   table[slot] = packed;
   return 1;
}

我仍然得到同样糟糕的表现。由于这是散列,我无法控制特定元素散列到的位置。那么有什么建议吗?另外,如果这不是正确的原因,还有其他关于可能出现问题的指示吗?我试过从 1M 到 100M 键,但单线程性能总是更好。

4

2 回答 2

0

我有几个建议。由于 insert 函数的运行时间不是恒定的,您应该使用 schedule(dynamic)。其次,您应该让 OpenMP 来划分任务,而不是自己手动划分(一个原因——虽然不是主要原因——是按您现在的写法,N 必须是 omp_p 的倍数才能均匀划分)。如果您想控制任务如何划分,可以用 schedule(dynamic,n) 来调整块大小,其中 n 是块(chunk)的大小。

/* Let the OpenMP runtime hand out iterations dynamically instead of
 * pre-computing per-thread chunks by hand. */
#pragma omp parallel for schedule(dynamic)
for(size_t i = 0; i < N; i++){
    size_t k = keys[i];
    size_t v = values[i];
    if(insert(table, k, v) == 0)
       printf("Failure!\n");
}
于 2013-05-09T09:10:07.893 回答
0

我会尝试使用基于锁的策略,就像这个简单的片段所示:

#include<omp.h>

#define NHASHES 4
#define NTABLE  1000000
/* Type of a hash function: maps a key to a table slot.
 * Fixed: this must be a POINTER-to-function typedef — the original
 * "typedef size_t (hash_f)(size_t);" names a plain function type, and
 * an array of function types ("hash_f hashes[NHASHES];") is ill-formed
 * C; an array of function pointers is what the code below needs. */
typedef size_t (*hash_f)(size_t);

/* Driver for the lock-per-slot strategy: one omp_lock_t guards each
 * table slot, so two threads only contend when they hash to the same
 * location. */
int main(int argc, char** argv) {
  Entry      table [NTABLE ];
  hash_f     hashes[NHASHES];
  omp_lock_t locks [NTABLE ];   /* fixed: original was missing the ';' */
  /* ... */
  for(size_t ii = 0; ii < N; ii++) {
    keys    [ii] = ii;
    values  [ii] = rand();
  }
  for(size_t ii = 0; ii < NTABLE; ii++) {
    omp_init_lock(&locks[ii]);
  }
#pragma omp parallel
  {
#pragma omp for schedule(static)
    for(int ii = 0; ii < N; ii++) {

    size_t key   = keys  [ii];
    size_t value = values[ii];
    Entry  entry = (((Entry)key) << 32) + value;
    int    stored = 0;

    for ( int jj = 0; jj < NHASHES; jj++ ) {   /* fixed: jj was undeclared */
      /* fixed: hashes[jj] is a function pointer and must be CALLED with
       * the key; the original assigned the pointer itself to location. */
      size_t location = hashes[jj](key);
      omp_set_lock(&locks[location]); // Locks the hash table location before working on it
      if ( get_key(table[location]) == KEY_EMPTY ) {
        table[location] = entry;
        stored = 1;
      }
      /* fixed: the original broke out of the loop BEFORE unlocking on a
       * successful insert, leaving the slot's lock held forever and
       * deadlocking any later thread hashing to it. Always release the
       * lock, then decide whether to stop probing. */
      omp_unset_lock(&locks[location]); // Unlocks the hash table location
      if (stored) break;
    }
    if (!stored) {
      /* Handle failures here */
    }
    }
  } /* pragma omp parallel */
  for(size_t ii = 0; ii < NTABLE; ii++) {
    omp_destroy_lock(&locks[ii]);
  }
  /* ... */
  return 0;
}

再做一些改动,您就可以把锁的数量在 1(相当于一个 critical 临界区)到 NTABLE(相当于对每个槽位使用 atomic 构造)之间任意调节,并观察介于两者之间的锁粒度是否带来性能收益。

于 2013-05-09T19:42:15.220 回答