c++ - TBB Reduction causes segfault when using more than 1 logical core

Question

I implemented the Jacobi algorithm using TBB and it works just fine. Then I parallelized the convergence calculation using a reduction, but for some reason if I use more than 1 logical core I get an segmentation fault and i can't figure out why.

I can use more than 1 thread on a system that has only 1 logical core.

The same implementation using OpenMP works without a hassle

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <tbb/task_scheduler_init.h>
#include <tbb/tick_count.h>

// ----------------------------------------------------------------

#define SIZE 1024
#define RESIDUO 0.0009f*SIZE
#define THREADS 2

using namespace tbb;

// ----------------------------------------------------------------

struct Sum {
    float ret;
    float (*a)[SIZE];
    float (*x);
    float (*b);
    Sum(float A[SIZE][SIZE], float X[SIZE], float B[SIZE]) : ret(0), a(A), x(X), b(B) {}
    Sum( Sum&, split ) {ret = 0;}
    void operator()( const blocked_range<int>& r ) {
        float temp = ret;
        for( int i = r.begin(); i != r.end(); i++ ) {
            float sum = 0.0f;
            for(int j = 0; j < SIZE; j++){
                sum += a[i][j] * x[j];
            }
            temp += pow(b[i] - sum, 2);
        }
        ret = temp;
    }
    void join( Sum& rhs ) {ret += rhs.ret;}
};


/*
    // || b - Ax ||
*/

int converge(float a[SIZE][SIZE], float x[SIZE], float b[SIZE]){

    Sum total(a, x, b);
    parallel_reduce( blocked_range<int>(0, SIZE), total );

    float norm = sqrt(total.ret);
    printf("Ret: %f | Residuo: %f\n", total.ret, norm);

    return (norm <= RESIDUO);
}

// ----------------------------------------------------------------

float randomFloat()
{
    float r = (float)rand()/(float)RAND_MAX;
    return r;
}

// ----------------------------------------------------------------

int check_ddm(float (*a)[SIZE]){
    float sum = 0.0f;
    int i = 0, j = 0;

    for(i = 0; i < SIZE; i++){
        sum = 0.0f;
        for(j = 0; j < SIZE; j++){
            if(i != j){
                sum += a[i][j];
            }
        }
        if(a[i][i] < sum){
            printf("line: %d, sum: %f, a[i][i]: %f  \n", i, sum, a[i][i]);
            for(j = 0; j < SIZE; j++){
                if(i != j) printf("%f ", a[i][j]);
                else printf("(%f) ", a[i][j]);
            }
            printf("\n");
            return 0;
        }
    }
    return 1;
}

// ----------------------------------------------------------------

int generate_ddm(float (*a)[SIZE], float *b)
{
    int i = 0, j = 0;
    float line = 0.0f;
    for(i = 0; i < SIZE; i++){
        line = 0.0f;
        for(j = 0; j < SIZE; j++){
            if(i != j){
                a[i][j] = randomFloat();
            }
            line += a[i][j];
        }
        a[i][i] = SIZE;
        b[i] = line + SIZE;
    }

    return check_ddm(a);
}

// ----------------------------------------------------------------

int main(  )
{
    float (*x)[SIZE] = (float(*)[SIZE])malloc(sizeof *x * 2);
    float (*a)[SIZE] = (float(*)[SIZE])malloc(sizeof *a * SIZE);
    float (*b) = (float*)malloc(sizeof(float) * SIZE);
    int i = 0, j = 0;
    float delta = 0.0f;
    int read = 0;
    int write = 1;

    srand(time(NULL));
    tbb::task_scheduler_init init(THREADS);

    // set up initial solution
    for(i = 0; i < SIZE; i++){
        x[0][i] = i;
        x[1][i] = i;
    }

    // generate a diagonal dominant matrix
    if(!generate_ddm(a, b)){
        printf("Array generated is not ddm!\n");
        return 1;
    }

    tick_count startTime = tick_count::now();        
    while(!converge(a, x[write], b)){
        read = !read;
        write = !write;

        parallel_for(blocked_range<int>(0,SIZE), 
            [&] (const blocked_range<int>& r) {
            for (int i = r.begin(); i < r.end(); i++) {
                float delta = 0.0f;
                for(int j = 0; j < SIZE; j++){
                    if(j != i){
                        delta += a[i][j] * x[read][j];
                    }
                }
                x[write][i] = (b[i] - delta) / a[i][i];
            }
        });
    }


    tick_count lastTime = tick_count::now();
    float walltime = (lastTime - startTime).seconds();

    printf("tbb %f\n", walltime);
    converge(a, x[write], b);
    printf("x0: %f | x%d: %f\n", x[write][0], SIZE-1, x[write][SIZE-1]);
    free(a);
    free(b);
    free(x);

    return 0;
}

The segfault occurs on the following line inside the Sum class:

sum += a[i][j] * x[j];

And if I change that line to

float tmpa = a[i][j];
float tmpx = x[j];
sum += tmpa * tmpx;

The error continues to be on

sum += tmpa * tmpx;

score 3 · Accepted Answer

在原始版本中，“拆分构造函数”未定义 a、x 和 b。它们需要从传入的 Sum& 参数中复制。例如，将拆分构造函数更改为：

Sum( Sum& s, split ) {a=s.a; b=s.b; x=s.x; ret = 0;}

score 0 · Accepted Answer

将 Class 更改为 lambda 表达式解决了这个问题。这可能是TBB中的一个错误parallel_reduce

int converge(float a[SIZE][SIZE], float x[SIZE], float b[SIZE]){

    float val = 0.0f; 
    val = parallel_reduce(
        blocked_range<int>(0, SIZE),  
        0.0f,   
        [&]( const blocked_range<int>& r, float init )->float {
            float temp = init; 
            for(int i = r.begin(); i != r.end(); i++ ) { 
                float sum = 0.0f; 
                for(int j = 0; j < SIZE; j++){
                    sum += a[i][j] * x[j]; 
                }       
                temp += pow(b[i] - sum, 2);
            }       
            return temp;
        },      
        []( float x, float y)->float{
            return x+y;
        }       
    );

    float norm = sqrt(val);
    printf("Ret: %f | Residuo: %f\n", val, norm);

    return (norm <= RESIDUO);
}

c++ - TBB Reduction causes segfault when using more than 1 logical core

2 回答 2

Related

Reference