1

我正在尝试通过添加缓存行填充来避免伪共享(false sharing)问题,但没有看到明显的加速:加上填充之后,速度只提升到原来的约 1.2 倍。我是在 n = 7 亿的条件下,分别运行不带填充和带填充的代码进行测试的。我是否应该获得超过 1.2 倍的加速?也许我的填充实现遗漏了什么?我为每行添加了 15 个 int 的填充,因为我假设计数器不一定从缓存行的起始位置开始分配。任何提示都不胜感激。

这是我的代码:

                                                                                        
// Parallel counting sort using one private row of counters per thread.
//
// Template parameter:
//   k   - number of counters in each row; every in[i] is used directly as an
//         index into a row, so keys are expected to lie in [0, k)
//         (NOTE(review): this is not checked here).
// Parameters:
//   out - destination buffer, receives n elements.
//   in  - source keys, n elements.
//   n   - number of elements in `in` and `out`.
//
// Phases, all inside a single parallel region:
//   1. each thread counts the keys of its share of the input in its own row;
//   2. one thread turns the counts into starting offsets (exclusive prefix sum
//      ordered by key first, then by row index);
//   3. each thread scatters its share of the input using its own offsets.
//
// NOTE(review): `nproc` and `cachelinesize` are defined outside this block.
// The team is assumed to have at most `nproc` threads; otherwise the row
// lookup by thread number would run past the end of `counters` -- confirm.
template <const int k> void par_countingsort2(int *out, int const *in, const int n) {

    // Number of ints per cache line (main prints cachelinesize as bytes).
    const int paddingAmount = cachelinesize / sizeof(int);

    // Each row carries paddingAmount - 1 unused trailing ints, so the last used
    // counter of one row and the first used counter of the next row are
    // paddingAmount ints apart, whatever the alignment of the array is.
    const int kPadded = k + (paddingAmount - 1);

    // Fixed: the format string used "/n", which prints a literal slash and 'n'.
    printf("\n%d", kPadded);

    int counters[nproc][kPadded] = {}; // all zeros
    #pragma omp parallel
    {
        int *thcounters = counters[omp_get_thread_num()];

        // Phase 1: private histogram. schedule(static) is spelled out because
        // phase 3 must hand exactly the same iterations to the same thread and
        // the default schedule is implementation-defined.
        #pragma omp for schedule(static)
        for (int i = 0; i < n; ++i)
            ++thcounters[in[i]];

        // Phase 2: replace every count by the output position at which that
        // (key, row) group starts. Only the first k entries of a row are used.
        #pragma omp single
        {
            int tmp, sum = 0;
            for (int j = 0; j < k; ++j)
                for (int i = 0; i < nproc; ++i) {
                    tmp = counters[i][j];
                    counters[i][j] = sum;
                    sum += tmp;
                }
        }

        // Phase 3: scatter. Same schedule as phase 1, so each thread consumes
        // the offsets that were derived from its own counts.
        #pragma omp for schedule(static)
        for (int i = 0; i < n; ++i)
            out[thcounters[in[i]]++] = in[i];
    }
}
// Number of distinct key values: main() generates keys with rand()%k and passes
// k as the template argument of par_countingsort2. The macro is defined after
// the template on purpose: the template's own parameter is also spelled `k`,
// and an earlier #define would rewrite that parameter name to 1000.
#define k 1000

// Benchmark driver: fills an array of n random keys in [0, k), sorts it with
// par_countingsort2 and prints the elapsed time.
//
// argv[1] is the element count n; a missing or non-positive value gives n = 0.
// Returns EXIT_SUCCESS, or EXIT_FAILURE if the buffers cannot be allocated.
//
// NOTE(review): `tp` and `checkreset` are not defined in the visible part of
// the file; they are used here exactly as in the original.
int main(int argc, char *argv[]) {

    //init input
    const int requested = argc > 1 ? atoi(argv[1]) : 0; // parse argv[1] once
    int n = requested > 0 ? requested : 0;
    int* in = static_cast<int*>(malloc(sizeof(int) * n));
    int* out = static_cast<int*>(malloc(sizeof(int) * n));
    // malloc(0) may legitimately return a null pointer, so only a failed
    // non-empty allocation is treated as an error.
    if (n > 0 && (in == nullptr || out == nullptr)) {
        fprintf(stderr, "failed to allocate buffers for n = %d\n", n);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; ++i)
        in[i] = rand()%k;
    printf("n = %d\n", n);

    //print some parameters
    printf("nproc = %d\n", nproc);
    printf("cachelinesize = %d byte\n", cachelinesize);
    printf("k = %d\n", k);

    // time the padded parallel version
    double tp2 = omp_get_wtime();
    par_countingsort2<k>(out, in, n);
    tp2 = omp_get_wtime() - tp2;
    printf("par2, elapsed time = %.3f seconds (%.1fx speedup from par1), check passed = %c\n", tp2, tp/tp2, checkreset(out,in,n)?'y':'n');


    //free mem
    free(in);
    free(out);

    return EXIT_SUCCESS;
}

4

0 回答 0