我正在尝试创建一个主从模型,其中进程 0(主)将子块从结构数组发送到剩余的 n-1 个进程。这是我的代码的一部分:
#define BLOCK_LOW(id, p, n) ((id) * (n) / (p))
#define BLOCK_HIGH(id, p, n) (BLOCK_LOW((id) + 1, p, n) - 1)
#define BLOCK_SIZE(id, p, n) \
    (BLOCK_HIGH(id, p, n) - BLOCK_LOW(id, p, n) + 1)
#define BLOCK_OWNER(j, p, n) (((p) * ((j) + 1) - 1) / (n))
    struct itemset_tids {
    int itemset[ELEMENTS_SIZE];
    int tids[ELEMENTS_SIZE];
};
int main (int argc, char** argv)
{
    //Initialization of the MPI data structures
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &processes);
    
    MPI_Barrier(MPI_COMM_WORLD); 
    start = MPI_Wtime();
    MPI_Datatype item_type;
    int lengths[2] = {ELEMENTS_SIZE, ELEMENTS_SIZE};
    MPI_Aint displacements[2];
    struct itemset_tids item;
    MPI_Aint base_address;
    MPI_Get_address(&item, &base_address);
    MPI_Get_address(&item.itemset[0], &displacements[0]);
    MPI_Get_address(&item.tids[0], &displacements[1]);
    displacements[0] = MPI_Aint_diff(displacements[0], base_address);
    displacements[1] = MPI_Aint_diff(displacements[1], base_address);
    MPI_Datatype types[2] = {MPI_INT, MPI_INT};
    MPI_Type_create_struct(2, lengths, displacements, types, &item_type);
    MPI_Type_commit(&item_type);
    
    bcast_size_dataset_and_sort(item_type, rank, processes); 
    if (rank == 0) {
        int n_el_first = BLOCK_SIZE(0, processes, num_el_P);
        int start_pos = n_el_first;
        for (int i = 1; i < processes; i++) {
            int n_el = BLOCK_SIZE(i, processes, num_el_P);
            int end_pos = start_pos + n_el;
            struct itemset_tids *P_send = malloc(n_el*sizeof(struct itemset_tids));
            memcpy(P_send, &P[start_pos], end_pos*sizeof(struct itemset_tids));
            MPI_Send(P_send, n_el, item_type, i, 666, MPI_COMM_WORLD);
            free(P_send);
            start_pos = end_pos;
            printf("Process 0 sended to %d.\n", i);
        }
    } else {
        MPI_Status stat;
        int n_el = BLOCK_SIZE(rank, processes, num_el_P);
        printf("Process %d are receiveng from 0 -> %d elements... ", rank, n_el);
        struct itemset_tids *P_recv = malloc(n_el*sizeof(struct itemset_tids));
        MPI_Recv(P_recv, n_el, item_type, 0, 666, MPI_COMM_WORLD, &stat);
        for(int i = 0; i<n_el; i++) {
            printf("%d ", P_recv[i].itemset[0]);
        }
        printf("\n");
        free(P_recv);
    }
    printf("Process %d finished!.\n", rank);
    fflush(stdout);
    MPI_Barrier(MPI_COMM_WORLD); 
    end = MPI_Wtime();
    MPI_Finalize();
    return 0;
}
bcast_size_dataset_and_sort 函数允许您从文件中读取数据,方法是将它们放置在仅可用于处理 0 的结构 itemset_tids 的数组中,此外,相同的函数允许广播文件中的元素数量(对它们进行排序)-> num_el_P 到所有处理以了解通过 BLOCK_SIZE 宏到达的数据的量化。我已经验证数据到达进程,但这并不总是发生,在某些情况下,接收是在其他所有进程上我有以下类型的错误:
Process 0 sended to 1.
Process 0 sended to 2.
Process 1 are receiveng from 0 -> 1 elements... 2 
Process 1 finished!.
Process 2 are receiveng from 0 -> 1 elements... 6 
Process 2 finished!.
[MacBook-Pro-di-Danilo-2:25204] *** Process received signal ***
[MacBook-Pro-di-Danilo-2:25204] Signal: Abort trap: 6 (6)
[MacBook-Pro-di-Danilo-2:25204] Signal code:  (0)
[MacBook-Pro-di-Danilo-2:25204] [ 0] 0   libsystem_platform.dylib            0x00007ff803c78e2d _sigtramp + 29
[MacBook-Pro-di-Danilo-2:25204] [ 1] 0   ???                                 0x0000000000000000 0x0 + 0
[MacBook-Pro-di-Danilo-2:25204] [ 2] 0   libsystem_c.dylib                   0x00007ff803bafd10 abort + 123
[MacBook-Pro-di-Danilo-2:25204] [ 3] 0   libsystem_malloc.dylib              0x00007ff803a8d3e2 has_default_zone0 + 0
[MacBook-Pro-di-Danilo-2:25204] [ 4] 0   libsystem_malloc.dylib              0x00007ff803aa12f2 malloc_zone_error + 183
[MacBook-Pro-di-Danilo-2:25204] [ 5] 0   libsystem_malloc.dylib              0x00007ff803a86040 small_free_list_remove_ptr_no_clear + 1264
[MacBook-Pro-di-Danilo-2:25204] [ 6] 0   libsystem_malloc.dylib              0x00007ff803a8112f small_malloc_from_free_list + 359
[MacBook-Pro-di-Danilo-2:25204] [ 7] 0   libsystem_malloc.dylib              0x00007ff803a809c7 small_malloc_should_clear + 279
[MacBook-Pro-di-Danilo-2:25204] [ 8] 0   libsystem_malloc.dylib              0x00007ff803a807d2 szone_malloc_should_clear + 109
[MacBook-Pro-di-Danilo-2:25204] [ 9] 0   libsystem_malloc.dylib              0x00007ff803a9bad6 _malloc_zone_malloc + 125
[MacBook-Pro-di-Danilo-2:25204] [10] 0   parallel_charm                      0x000000010119d442 main + 562
[MacBook-Pro-di-Danilo-2:25204] [11] 0   dyld                                0x000000011dbd54fe start + 462
[MacBook-Pro-di-Danilo-2:25204] *** End of error message ***
或者
Process 0 sended to 1.
Process 0 sended to 2.
Process 1 are receiveng from 0 -> 1 elements... 2 
Process 1 finished!.
Process 2 are receiveng from 0 -> 1 elements... 6 
Process 2 finished!.
[MacBook-Pro-di-Danilo-2:25210] *** Process received signal ***
监控我可以发现死锁的过程,但我不明白为什么。谁能帮我理解错误?非常感谢。