I am using MPI to parallelize my C++ interplanetary trajectory optimization program. A big part of that is distributing the load to multiple worker nodes, having them do some computation on the distributed data, and returning the results to the master node. I use the asynchronous communication routines MPI_Isend and MPI_Irecv, paired with what I believe are appropriate MPI_Wait calls. However, I am running into abrupt program termination with EXIT CODE: 11, which I think indicates a segmentation fault. I have searched Stack Overflow thoroughly on this topic and made sure to avoid the mistakes others were making in their code, but my code still does not work. Here is the code (a stripped-down sketch of the communication pattern I am aiming for follows the full function):
mat GeneticAlgorithm::mpi_pool_fitness(mat pool, int flyby_limit, int source, int target, bool isSolar, vec mu_system, vec rp_system, cube ephemerides, IPMGAConfig config)
{
    int poolsize = size(pool,0);
    int chromsize = size(pool,1);
    double* poolptr = NULL;
    mat rPool = zeros(poolsize,chromsize+1);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    MPI_Request* rq_status = (MPI_Request*)malloc(world_size*sizeof(MPI_Request));
    MPI_Status* status = (MPI_Status*)malloc(world_size*sizeof(MPI_Status));

    int k = 0;
    if ( world_rank == 0 )
    {
        //pool.print();
        //initialize poolptr with input pool elements, since mat is stored in memory column by column, it's not possible to use memptr() function
        poolptr = (double *) malloc(sizeof(double)*poolsize*chromsize);
        for(int i=0;i<poolsize;i++)
        {
            for (int j=0;j<chromsize;j++)
            {
                poolptr[k++] = pool(i,j);
                //cout << poolptr[k-1] << " " ;
            }
            //cout << endl;
        }
    }

    double perproc = poolsize/(world_size-1);
    int elems_per_proc = (int)perproc;
    if (elems_per_proc*(world_size-1) < poolsize)
    {
        elems_per_proc = elems_per_proc + 1;
    }
    //cout << world_rank << " Elements per processor : " << elems_per_proc << endl;

    if ( world_rank == 0 )
    {
        //cout << "poolptr size: " << k << endl;
        //cout << "expected poolsize: " << (world_size-1)*elems_per_proc*chromsize << endl;
        //MPI_Scatter(poolptr,elems_per_proc*chromsize,MPI_DOUBLE,row,elems_per_proc*chromsize,MPI_DOUBLE,0,MPI_COMM_WORLD);
        for (int i=1;i<world_size;i++)
        {
            cout << "0 Scattering chromosomes to processor: " << i << endl;
            MPI_Isend(&poolptr[(i-1)*elems_per_proc*chromsize],elems_per_proc*chromsize,MPI_DOUBLE,i,i,MPI_COMM_WORLD,&rq_status[i]);
        }
        /*
        for (int i=1;i<world_size;i++)
        {
            MPI_Wait(&rq_status[i],&status[i]);
        }
        */
        cout << "0 successfully sent off chromosomes for fitness evaluation....." << endl;
        free(poolptr);
    }

    double *row[100];
    double *iResults[100];
    mat iPool = zeros(poolsize,chromsize+1);
    if ( world_rank != 0 )
    {
        row[world_rank] = (double*)malloc(sizeof(double)*elems_per_proc*chromsize);
        cout << world_rank << " Starting to receive chromosomes from processor 0" << endl;
        MPI_Irecv(&row[world_rank],elems_per_proc*chromsize,MPI_DOUBLE,0,world_rank,MPI_COMM_WORLD,&rq_status[0]);
        MPI_Wait(&rq_status[0],&status[0]);
        cout << world_rank << " Received chromosomes from processor 0" << endl;

        //Convert MPI data back to arma matrix
        for (int i=0;i<elems_per_proc;i++)
        {
            cout << "Composing " << i << "th element at the given processor " << world_rank << endl;
            k = 1;
            for (int j=0;j<chromsize;j++,k++)
            {
                iPool(((world_rank-1)*elems_per_proc)+i,k)=row[world_rank][(i*chromsize)+j];
            }
        }
        //iPool.print();

        //Compute the fitness of each chromosome in intermediate pool
        cout << world_rank << " Attempting fitness calculations....." << endl;
        for (int i=0;i<elems_per_proc;i++)
        {
            iPool(((world_rank-1)*elems_per_proc)+i,span(0,chromsize)) = fitness_multi_rev_lambert(iPool(((world_rank-1)*elems_per_proc)+i,span(1,chromsize)),flyby_limit,source,target,isSolar,mu_system,rp_system,ephemerides,config);
        }
        cout << world_rank << " Successfully finished fitness calculations....." << endl;
        //iPool.print();

        //Convert the results back to MPI data type
        iResults[world_rank]=(double *) malloc(sizeof(double)*elems_per_proc*(chromsize+1));// = iPool.memptr();
        k=0;
        for(int i=0;i<elems_per_proc;i++)
        {
            for (int j=0;j<chromsize+1;j++)
            {
                iResults[world_rank][k++] = iPool(((world_rank-1)*elems_per_proc)+i,j);
            }
        }

        //cout << world_rank << " Starting to send processed chromosomes to processor 0" << endl;
        MPI_Isend(&iResults[world_rank],elems_per_proc*(chromsize+1),MPI_DOUBLE,0,world_rank,MPI_COMM_WORLD,&rq_status[0]);
        //cout << world_rank << " Sent processed chromosomes to processor 0" << endl;
        MPI_Wait(&rq_status[0],&status[0]);
    }

    //Declare a variable holder for global results
    if ( world_rank == 0)
    {
        double* gResults = (double*)malloc(sizeof(double)*poolsize*(chromsize+1));
        //cout << "0 Gathering chromosomes with fitness evaluated from all processors...." << endl;
        //MPI_Gather(iResults,elems_per_proc*(chromsize+1),MPI_DOUBLE,gResults,poolsize*(chromsize+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
        k=0;
        for (int i=1;i<world_size;i++)
        {
            MPI_Irecv(&gResults[(i-1)*elems_per_proc*(chromsize+1)],elems_per_proc*(chromsize+1),MPI_DOUBLE,i,i,MPI_COMM_WORLD,&rq_status[i]);
        }
        cout << "0 waiting to hear back from all the worker nodes...." << endl;
        for(int i=1;i<world_size;i++)
        {
            MPI_Wait(&rq_status[i],&status[i]);
        }
        cout << "Populating return pool...." << endl;
        for (int i=0;i<poolsize;i++)
        {
            for(int j=0;j<chromsize+1;j++)
            {
                rPool(i,j) = gResults[(i*(chromsize+1))+j];
            }
        }
        //cout << "Finished populating return pool...." << endl;
    }

    free(rq_status);
    free(status);
    return rPool;
}
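For reference, here is a stripped-down, self-contained sketch of the send/receive/wait pattern I am trying to follow, with the Armadillo matrices replaced by plain double buffers and hypothetical sizes (POOLSIZE and CHROMSIZE are placeholders, not my real values). It is only meant to show the intended MPI_Isend / MPI_Irecv / MPI_Wait flow, not my actual fitness code:

// Minimal sketch (hypothetical sizes): rank 0 scatters fixed-size blocks to
// the workers with MPI_Isend, each worker receives its block with
// MPI_Irecv + MPI_Wait, does some stand-in work, and sends the block back;
// rank 0 then collects everything with MPI_Irecv + MPI_Waitall.
// Run with at least 2 processes, e.g. mpiexec -n 4 ./sketch
#include <mpi.h>
#include <cstdio>
#include <vector>

static const int POOLSIZE  = 100;  // hypothetical number of chromosomes
static const int CHROMSIZE = 10;   // hypothetical genes per chromosome

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size < 2)
    {
        if (rank == 0) std::fprintf(stderr, "run with at least 2 processes\n");
        MPI_Finalize();
        return 1;
    }

    // Same idea as the real code: ceil(POOLSIZE / number of workers).
    int workers = size - 1;
    int per_proc = (POOLSIZE + workers - 1) / workers;
    int block_len = per_proc * CHROMSIZE;

    if (rank == 0)
    {
        // Master-side buffers are padded to workers*block_len so every
        // nonblocking send/receive stays inside allocated memory.
        std::vector<double> pool(workers * block_len, 1.0);
        std::vector<double> results(workers * block_len, 0.0);
        std::vector<MPI_Request> reqs(size);

        // Scatter one block per worker with nonblocking sends.
        for (int i = 1; i < size; i++)
            MPI_Isend(&pool[(i - 1) * block_len], block_len, MPI_DOUBLE,
                      i, i, MPI_COMM_WORLD, &reqs[i]);
        MPI_Waitall(size - 1, &reqs[1], MPI_STATUSES_IGNORE);

        // Collect the processed blocks from the workers.
        for (int i = 1; i < size; i++)
            MPI_Irecv(&results[(i - 1) * block_len], block_len, MPI_DOUBLE,
                      i, i, MPI_COMM_WORLD, &reqs[i]);
        MPI_Waitall(size - 1, &reqs[1], MPI_STATUSES_IGNORE);
        std::printf("rank 0 collected all blocks\n");
    }
    else
    {
        std::vector<double> block(block_len);
        MPI_Request req;

        // Post the receive directly into the block's storage and wait for it.
        MPI_Irecv(block.data(), block_len, MPI_DOUBLE,
                  0, rank, MPI_COMM_WORLD, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);

        for (double& x : block) x *= 2.0;   // stand-in for the fitness work

        // Send the processed block back and wait for the send to complete.
        MPI_Isend(block.data(), block_len, MPI_DOUBLE,
                  0, rank, MPI_COMM_WORLD, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
    }

    MPI_Finalize();
    return 0;
}

The real function does the same thing, except that it builds the blocks from the Armadillo mat and calls fitness_multi_rev_lambert on each worker's rows.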
The program shows several of the symptoms I came across in my Stack Overflow search. For example, the MPI_Isend from the master node only works if I pass '-n 11' or '-n 26' to mpiexec; for any other process count the master node runs into a segmentation fault. When the MPI_Isend from the master does work, the worker nodes appear to hit a segmentation fault instead, either during or shortly after the MPI_Irecv.
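To make the work-division arithmetic easier to follow, here is a tiny standalone snippet that reproduces only the elems_per_proc calculation and the offsets into poolptr that each master-side MPI_Isend would read. The poolsize, chromsize, and world_size values are hypothetical placeholders, not my real configuration:

// Reproduces just the work-division arithmetic from mpi_pool_fitness,
// with hypothetical numbers, to show which part of poolptr each
// master-side MPI_Isend reads.
#include <cstdio>

int main()
{
    const int poolsize   = 100;  // hypothetical
    const int chromsize  = 10;   // hypothetical
    const int world_size = 11;   // e.g. mpiexec -n 11

    double perproc = poolsize/(world_size-1);   // same division as the real code
    int elems_per_proc = (int)perproc;
    if (elems_per_proc*(world_size-1) < poolsize)
    {
        elems_per_proc = elems_per_proc + 1;
    }

    std::printf("elems_per_proc = %d\n", elems_per_proc);
    for (int i = 1; i < world_size; i++)
    {
        int first = (i-1)*elems_per_proc*chromsize;        // start index of this send
        int last  = first + elems_per_proc*chromsize - 1;  // last index read by this send
        std::printf("rank %2d: doubles [%d, %d] of %d\n",
                    i, first, last, poolsize*chromsize);
    }
    return 0;
}

With these placeholder numbers and 11 processes the division comes out exact; for most other process counts elems_per_proc gets rounded up, so the per-rank ranges shift accordingly.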
Here is the full log from one sample execution of the program when I run mpiexec with -n 11:
10 Starting to receive chromosomes from processor 0
Best results are in : best_results_20160217T1902.mat
Generational chromosomes are in : chromosomes_20160217T1902.mat
0 Starting the GA.....
0 Processing generation : 1
6 Starting to receive chromosomes from processor 0
9 Starting to receive chromosomes from processor 0
4 Starting to receive chromosomes from processor 0
7 Starting to receive chromosomes from processor 0
5 Starting to receive chromosomes from processor 0
3 Starting to receive chromosomes from processor 0
8 Starting to receive chromosomes from processor 0
2 Starting to receive chromosomes from processor 0
1 Starting to receive chromosomes from processor 0
0 Scattering chromosomes to processor: 1
0 Scattering chromosomes to processor: 2
0 Scattering chromosomes to processor: 3
0 Scattering chromosomes to processor: 4
0 Scattering chromosomes to processor: 5
0 Scattering chromosomes to processor: 6
0 Scattering chromosomes to processor: 7
0 Scattering chromosomes to processor: 8
0 Scattering chromosomes to processor: 9
0 Scattering chromosomes to processor: 10
0 successfully sent off chromosomes for fitness evaluation.....
0 waiting to hear back from all the worker nodes....
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 12223 RUNNING AT 192.168.0.101
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
[proxy:0:2@odroid3] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:2@odroid3] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:2@odroid3] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:3@odroid4] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:3@odroid4] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:3@odroid4] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:5@odroid6] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:5@odroid6] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:5@odroid6] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:4@odroid5] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:4@odroid5] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:4@odroid5] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:6@odroid7] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:6@odroid7] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:6@odroid7] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:1@odroid2] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:1@odroid2] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:1@odroid2] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[mpiexec@odroid1] HYDT_bscu_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/bootstrap/utils/bscu_wait.c:76): one of the processes terminated badly; aborting
[mpiexec@odroid1] HYDT_bsci_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/bootstrap/src/bsci_wait.c:23): launcher returned error waiting for completion
[mpiexec@odroid1] HYD_pmci_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmiserv_pmci.c:218): launcher returned error waiting for completion
[mpiexec@odroid1] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/ui/mpich/mpiexec.c:344): process manager error waiting for completion
I would appreciate any help on this matter; I am on a tight deadline to get this program running for my thesis!