我正在尝试创建一个主从模型,其中进程 0(主)将子块从结构数组发送到剩余的 n-1 个进程。这是我的代码的一部分:
#define BLOCK_LOW(id, p, n) ((id) * (n) / (p))
#define BLOCK_HIGH(id, p, n) (BLOCK_LOW((id) + 1, p, n) - 1)
#define BLOCK_SIZE(id, p, n) \
(BLOCK_HIGH(id, p, n) - BLOCK_LOW(id, p, n) + 1)
#define BLOCK_OWNER(j, p, n) (((p) * ((j) + 1) - 1) / (n))
struct itemset_tids {
int itemset[ELEMENTS_SIZE];
int tids[ELEMENTS_SIZE];
};
int main (int argc, char** argv)
{
//Initialization of the MPI data structures
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &processes);
MPI_Barrier(MPI_COMM_WORLD);
start = MPI_Wtime();
MPI_Datatype item_type;
int lengths[2] = {ELEMENTS_SIZE, ELEMENTS_SIZE};
MPI_Aint displacements[2];
struct itemset_tids item;
MPI_Aint base_address;
MPI_Get_address(&item, &base_address);
MPI_Get_address(&item.itemset[0], &displacements[0]);
MPI_Get_address(&item.tids[0], &displacements[1]);
displacements[0] = MPI_Aint_diff(displacements[0], base_address);
displacements[1] = MPI_Aint_diff(displacements[1], base_address);
MPI_Datatype types[2] = {MPI_INT, MPI_INT};
MPI_Type_create_struct(2, lengths, displacements, types, &item_type);
MPI_Type_commit(&item_type);
bcast_size_dataset_and_sort(item_type, rank, processes);
if (rank == 0) {
int n_el_first = BLOCK_SIZE(0, processes, num_el_P);
int start_pos = n_el_first;
for (int i = 1; i < processes; i++) {
int n_el = BLOCK_SIZE(i, processes, num_el_P);
int end_pos = start_pos + n_el;
struct itemset_tids *P_send = malloc(n_el*sizeof(struct itemset_tids));
memcpy(P_send, &P[start_pos], end_pos*sizeof(struct itemset_tids));
MPI_Send(P_send, n_el, item_type, i, 666, MPI_COMM_WORLD);
free(P_send);
start_pos = end_pos;
printf("Process 0 sended to %d.\n", i);
}
} else {
MPI_Status stat;
int n_el = BLOCK_SIZE(rank, processes, num_el_P);
printf("Process %d are receiveng from 0 -> %d elements... ", rank, n_el);
struct itemset_tids *P_recv = malloc(n_el*sizeof(struct itemset_tids));
MPI_Recv(P_recv, n_el, item_type, 0, 666, MPI_COMM_WORLD, &stat);
for(int i = 0; i<n_el; i++) {
printf("%d ", P_recv[i].itemset[0]);
}
printf("\n");
free(P_recv);
}
printf("Process %d finished!.\n", rank);
fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD);
end = MPI_Wtime();
MPI_Finalize();
return 0;
}
bcast_size_dataset_and_sort 函数允许您从文件中读取数据,方法是将它们放置在仅可用于处理 0 的结构 itemset_tids 的数组中,此外,相同的函数允许广播文件中的元素数量(对它们进行排序)-> num_el_P 到所有处理以了解通过 BLOCK_SIZE 宏到达的数据的量化。我已经验证数据到达进程,但这并不总是发生,在某些情况下,接收是在其他所有进程上我有以下类型的错误:
Process 0 sended to 1.
Process 0 sended to 2.
Process 1 are receiveng from 0 -> 1 elements... 2
Process 1 finished!.
Process 2 are receiveng from 0 -> 1 elements... 6
Process 2 finished!.
[MacBook-Pro-di-Danilo-2:25204] *** Process received signal ***
[MacBook-Pro-di-Danilo-2:25204] Signal: Abort trap: 6 (6)
[MacBook-Pro-di-Danilo-2:25204] Signal code: (0)
[MacBook-Pro-di-Danilo-2:25204] [ 0] 0 libsystem_platform.dylib 0x00007ff803c78e2d _sigtramp + 29
[MacBook-Pro-di-Danilo-2:25204] [ 1] 0 ??? 0x0000000000000000 0x0 + 0
[MacBook-Pro-di-Danilo-2:25204] [ 2] 0 libsystem_c.dylib 0x00007ff803bafd10 abort + 123
[MacBook-Pro-di-Danilo-2:25204] [ 3] 0 libsystem_malloc.dylib 0x00007ff803a8d3e2 has_default_zone0 + 0
[MacBook-Pro-di-Danilo-2:25204] [ 4] 0 libsystem_malloc.dylib 0x00007ff803aa12f2 malloc_zone_error + 183
[MacBook-Pro-di-Danilo-2:25204] [ 5] 0 libsystem_malloc.dylib 0x00007ff803a86040 small_free_list_remove_ptr_no_clear + 1264
[MacBook-Pro-di-Danilo-2:25204] [ 6] 0 libsystem_malloc.dylib 0x00007ff803a8112f small_malloc_from_free_list + 359
[MacBook-Pro-di-Danilo-2:25204] [ 7] 0 libsystem_malloc.dylib 0x00007ff803a809c7 small_malloc_should_clear + 279
[MacBook-Pro-di-Danilo-2:25204] [ 8] 0 libsystem_malloc.dylib 0x00007ff803a807d2 szone_malloc_should_clear + 109
[MacBook-Pro-di-Danilo-2:25204] [ 9] 0 libsystem_malloc.dylib 0x00007ff803a9bad6 _malloc_zone_malloc + 125
[MacBook-Pro-di-Danilo-2:25204] [10] 0 parallel_charm 0x000000010119d442 main + 562
[MacBook-Pro-di-Danilo-2:25204] [11] 0 dyld 0x000000011dbd54fe start + 462
[MacBook-Pro-di-Danilo-2:25204] *** End of error message ***
或者
Process 0 sended to 1.
Process 0 sended to 2.
Process 1 are receiveng from 0 -> 1 elements... 2
Process 1 finished!.
Process 2 are receiveng from 0 -> 1 elements... 6
Process 2 finished!.
[MacBook-Pro-di-Danilo-2:25210] *** Process received signal ***
监控我可以发现死锁的过程,但我不明白为什么。谁能帮我理解错误?非常感谢。