#include <math.h>
#include <time.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*-------------------------------------*/
#include<fcntl.h>
#include<unistd.h>
#include<sys/stat.h>
#include<sys/time.h>
#include<sys/mman.h>
#include<sys/types.h>
#include <mpi.h>
#define PERMS 0600
#define PAGE 4096
int ntask=0, mytask=0;
void
write_result (char *detail)
{
int fd;
char *filename = (char *) malloc (1000*sizeof(char));
sprintf(filename, "%d", mytask);
strcat(filename,".fa");
fd = open (filename, O_CREAT | O_RDWR, S_IRWXU);
write (fd, detail, strlen (detail));
close (fd);
}
char *
mmaping (char *source)
{
int src;
char *sm;
struct stat statbuf;
if ((src = open (source, O_RDONLY)) < 0)
{
perror (" open source ");
exit (EXIT_FAILURE);
}
if (fstat (src, &statbuf) < 0)
{
perror (" fstat source ");
exit (EXIT_FAILURE);
}
printf("task->%d\n",mytask);
printf("total task->%d\n",ntask);
int piece = statbuf.st_size/PAGE;
int share = piece/ntask;
printf("share->%d\n",share);
sm = mmap (0,statbuf.st_size/ntask, PROT_READ, MAP_SHARED | MAP_NORESERVE,src, mytask*share*PAGE);
printf("%lld\n",(long long int) statbuf.st_size);
if (MAP_FAILED == sm)
{
perror (" mmap source ");
exit (EXIT_FAILURE);
}
return sm;
}
int main(int argc, char **argv)
{
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &ntask);
MPI_Comm_rank(MPI_COMM_WORLD, &mytask);
char *x = mmaping(argv[1]);
printf("%d\n",strlen(x));
write_result(x);
MPI_Finalize();
return 0;
}
每个线程读取一个文件的一部分并将其写下来。
在本地机器上工作得很好(mpich)在集群上出错(openmpi)
我的程序在函数 write_result 中的 close(fd) 处终止。
这是错误消息
[kalkyl3:30022] *** Process received signal ***
[kalkyl3:30022] Signal: Segmentation fault (11)
[kalkyl3:30022] Signal code: Address not mapped (1)
[kalkyl3:30022] Failing at address: 0x2b9d952d7000
[kalkyl3:30022] [ 0] /lib64/libpthread.so.0() [0x371900f4a0]
[kalkyl3:30022] [ 1] /lib64/libc.so.6() [0x371892fcef]
[kalkyl3:30022] [ 2] ./a.out(write_result+0x58) [0x400df8]
[kalkyl3:30022] [ 3] ./a.out(main+0x4c) [0x400e6c]
[kalkyl3:30022] [ 4] /lib64/libc.so.6(__libc_start_main+0xfd) [0x371881ecdd]
[kalkyl3:30022] [ 5] ./a.out() [0x400ba9]
[kalkyl3:30022] *** End of error message ***
在搜索了相关问题后,我猜测问题可能来自文件打开和关闭过程。由一个特定线程打开的文件应该由同一个线程关闭。也许 openmpi 无法识别哪个线程打开了它,哪个线程关闭了它。但它在 MPICH 上运行良好。