
I am using MPI to parallelize my C++ interplanetary trajectory optimization program. A huge part of that is being able to distribute the load to multiple worker nodes, have them do some computations on the distributed data, and return the results to the master node. I use the asynchronous communication routines MPI_Isend and MPI_Irecv in my program, along with MPI_Wait where I believe it is appropriate. However, I am running into abrupt program termination with EXIT CODE: 11, which I think stands for a segmentation fault. I have searched Stack Overflow thoroughly on this topic and made sure to avoid the mistakes others were making in their code. My code, however, still does not work. Here is the code:

mat GeneticAlgorithm::mpi_pool_fitness(mat pool, int flyby_limit, int source, int target, bool isSolar, vec mu_system, vec rp_system, cube ephemerides, IPMGAConfig config)
{
    int poolsize = size(pool,0);
    int chromsize = size(pool,1);
    double* poolptr = NULL;
    mat rPool = zeros(poolsize,chromsize+1);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    MPI_Request* rq_status = (MPI_Request*)malloc(world_size*sizeof(MPI_Request));
    MPI_Status* status = (MPI_Status*)malloc(world_size*sizeof(MPI_Status));

    int k = 0;
    if ( world_rank == 0 )
    {
        //pool.print();
        //initialize poolptr with input pool elements, since mat is stored in memory column by column, it's not possible to use memptr() function
        poolptr = (double *) malloc(sizeof(double)*poolsize*chromsize);
        for(int i=0;i<poolsize;i++)
        {
            for (int j=0;j<chromsize;j++)
            {
                poolptr[k++] = pool(i,j);
                //cout << poolptr[k-1] << " " ;
            }
            //cout << endl;
        }
    }

    double perproc = poolsize/(world_size-1);
    int elems_per_proc = (int)perproc;
    if (elems_per_proc*(world_size-1) < poolsize)
    {
        elems_per_proc = elems_per_proc + 1;
    }
    //cout << world_rank << " Elements per processor : " << elems_per_proc << endl;
    if ( world_rank == 0 )
    {
        //cout << "poolptr size: " << k << endl;
        //cout << "expected poolsize: " << (world_size-1)*elems_per_proc*chromsize << endl;
        //MPI_Scatter(poolptr,elems_per_proc*chromsize,MPI_DOUBLE,row,elems_per_proc*chromsize,MPI_DOUBLE,0,MPI_COMM_WORLD);
        for (int i=1;i<world_size;i++)
        {
            cout << "0 Scattering chromosomes to processor: " << i << endl;
            MPI_Isend(&poolptr[(i-1)*elems_per_proc*chromsize],elems_per_proc*chromsize,MPI_DOUBLE,i,i,MPI_COMM_WORLD,&rq_status[i]);
        }
        /*
        for (int i=1;i<world_size;i++)
        {
            MPI_Wait(&rq_status[i],&status[i]);
        }
        */
        cout << "0 successfully sent off chromosomes for fitness evaluation....." << endl;
        free(poolptr);
    }

    double *row[100];
    double *iResults[100];
    mat iPool = zeros(poolsize,chromsize+1);
    if ( world_rank != 0 )
    {
        row[world_rank] = (double*)malloc(sizeof(double)*elems_per_proc*chromsize);
        cout << world_rank << " Starting to receive chromosomes from processor 0" << endl;
        MPI_Irecv(&row[world_rank],elems_per_proc*chromsize,MPI_DOUBLE,0,world_rank,MPI_COMM_WORLD,&rq_status[0]);
        MPI_Wait(&rq_status[0],&status[0]);
        cout << world_rank << " Received chromosomes from processor 0" << endl;
        //Convert MPI data back to arma matrix
        for (int i=0;i<elems_per_proc;i++)
        {
            cout << "Composing " << i << "th element at the given processor " << world_rank << endl;
            k = 1;
            for (int j=0;j<chromsize;j++,k++)
            {
                iPool(((world_rank-1)*elems_per_proc)+i,k)=row[world_rank][(i*chromsize)+j];
            }
        }
        //iPool.print();
        //Compute the fitness of each chromosome in intermediate pool
        cout << world_rank << " Attempting fitness calculations....." << endl;
        for (int i=0;i<elems_per_proc;i++)
        {
            iPool(((world_rank-1)*elems_per_proc)+i,span(0,chromsize)) = fitness_multi_rev_lambert(iPool(((world_rank-1)*elems_per_proc)+i,span(1,chromsize)),flyby_limit,source,target,isSolar,mu_system,rp_system,ephemerides,config);
        }
        cout << world_rank << " Successfully finished fitness calculations....." << endl;
        //iPool.print();
        //Convert the results back to MPI data type
        iResults[world_rank]=(double *) malloc(sizeof(double)*elems_per_proc*(chromsize+1));// = iPool.memptr();
        k=0;
        for(int i=0;i<elems_per_proc;i++)
        {
            for (int j=0;j<chromsize+1;j++)
            {
                iResults[world_rank][k++] = iPool(((world_rank-1)*elems_per_proc)+i,j);
            }
        }
        //cout << world_rank << " Starting to send processed chromosomes to processor 0" << endl;
        MPI_Isend(&iResults[world_rank],elems_per_proc*(chromsize+1),MPI_DOUBLE,0,world_rank,MPI_COMM_WORLD,&rq_status[0]);
        //cout << world_rank << " Sent processed chromosomes to processor 0" << endl;
        MPI_Wait(&rq_status[0],&status[0]);
    }
    //Declare a variable holder for global results 
    if ( world_rank == 0)
    {
        double* gResults = (double*)malloc(sizeof(double)*poolsize*(chromsize+1));

        //cout << "0 Gathering chromosomes with fitness evaluated from all processors...." << endl;
        //MPI_Gather(iResults,elems_per_proc*(chromsize+1),MPI_DOUBLE,gResults,poolsize*(chromsize+1),MPI_DOUBLE,0,MPI_COMM_WORLD);
        k=0;
        for (int i=1;i<world_size;i++)
        {
            MPI_Irecv(&gResults[(i-1)*elems_per_proc*(chromsize+1)],elems_per_proc*(chromsize+1),MPI_DOUBLE,i,i,MPI_COMM_WORLD,&rq_status[i]);
        }
        cout << "0 waiting to hear back from all the worker nodes...." << endl;
        for(int i=1;i<world_size;i++)
        {
            MPI_Wait(&rq_status[i],&status[i]);
        }
        cout << "Populating return pool...." << endl;
        for (int i=0;i<poolsize;i++)
        {
            for(int j=0;j<chromsize+1;j++)
            {
                rPool(i,j) = gResults[(i*(chromsize+1))+j];
            }
        }
        //cout << "Finished populating return pool...." << endl;
    }

    free(rq_status);
    free(status);
    return rPool;
}

The program exhibits several of the symptoms I found in my search on Stack Overflow. For example, the MPI_Isend from the master node only works if I specify '-n 11' or '-n 26' to mpiexec; for any other number of processes, the master node runs into a segmentation fault. When the MPI_Isend from the master does work, the worker nodes run into a segmentation fault instead, I think either during or shortly after the MPI_Irecv.

Here is my full log from one sample execution of the program when I run mpiexec with 11 nodes:

10 Starting to receive chromosomes from processor 0
Best results are in : best_results_20160217T1902.mat
Generational chromosomes are in : chromosomes_20160217T1902.mat
0 Starting the GA.....
0 Processing generation : 1
6 Starting to receive chromosomes from processor 0
9 Starting to receive chromosomes from processor 0
4 Starting to receive chromosomes from processor 0
7 Starting to receive chromosomes from processor 0
5 Starting to receive chromosomes from processor 0
3 Starting to receive chromosomes from processor 0
8 Starting to receive chromosomes from processor 0
2 Starting to receive chromosomes from processor 0
1 Starting to receive chromosomes from processor 0
0 Scattering chromosomes to processor: 1
0 Scattering chromosomes to processor: 2
0 Scattering chromosomes to processor: 3
0 Scattering chromosomes to processor: 4
0 Scattering chromosomes to processor: 5
0 Scattering chromosomes to processor: 6
0 Scattering chromosomes to processor: 7
0 Scattering chromosomes to processor: 8
0 Scattering chromosomes to processor: 9
0 Scattering chromosomes to processor: 10
0 successfully sent off chromosomes for fitness evaluation.....
0 waiting to hear back from all the worker nodes....

===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   PID 12223 RUNNING AT 192.168.0.101
=   EXIT CODE: 11
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
[proxy:0:2@odroid3] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:2@odroid3] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:2@odroid3] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:3@odroid4] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:3@odroid4] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:3@odroid4] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:5@odroid6] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:5@odroid6] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:5@odroid6] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:4@odroid5] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:4@odroid5] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:4@odroid5] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:6@odroid7] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:6@odroid7] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:6@odroid7] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[proxy:0:1@odroid2] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed
[proxy:0:1@odroid2] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:1@odroid2] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event
[mpiexec@odroid1] HYDT_bscu_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/bootstrap/utils/bscu_wait.c:76): one of the processes terminated badly; aborting
[mpiexec@odroid1] HYDT_bsci_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/bootstrap/src/bsci_wait.c:23): launcher returned error waiting for completion
[mpiexec@odroid1] HYD_pmci_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmiserv_pmci.c:218): launcher returned error waiting for completion
[mpiexec@odroid1] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/ui/mpich/mpiexec.c:344): process manager error waiting for completion

I would appreciate any help on this matter; I am really on a time crunch to finish running this program for my thesis deadline!

2 Answers


At least one critical MPI_Wait is commented out.

    for (int i=1;i<world_size;i++)
    {
        cout << "0 Scattering chromosomes to processor: " << i << endl;
        MPI_Isend(&poolptr[(i-1)*elems_per_proc*chromsize],elems_per_proc*chromsize,MPI_DOUBLE,i,i,MPI_COMM_WORLD,&rq_status[i]);
    }
    /*
    for (int i=1;i<world_size;i++)
    {
        MPI_Wait(&rq_status[i],&status[i]);
    }
    */
    cout << "0 successfully sent off chromosomes for fitness evaluation....." << endl;
    free(poolptr);

You must not free or write to poolptr before all of the send communications have completed.
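
For instance, a minimal sketch of that send path, reusing poolptr, elems_per_proc, chromsize, world_size and rq_status exactly as declared in your function, could look like this:

    // Post the non-blocking sends to the workers, as before.
    for (int i = 1; i < world_size; i++)
    {
        MPI_Isend(&poolptr[(i-1)*elems_per_proc*chromsize],
                  elems_per_proc*chromsize, MPI_DOUBLE,
                  i, i, MPI_COMM_WORLD, &rq_status[i]);
    }

    // Wait for *all* sends to complete; rq_status[1..world_size-1]
    // are the requests posted above.
    MPI_Waitall(world_size - 1, &rq_status[1], MPI_STATUSES_IGNORE);

    // Only now is it safe to release (or overwrite) the send buffer.
    free(poolptr);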

In general, you are overusing nonblocking communication:

  1. Any nonblocking request that you immediately MPI_Wait for is pointless; use a blocking call instead.
  2. Use collective communication calls, in particular MPI_Scatter / MPI_Gather, whenever possible (see the sketch after this list). In general, use collectives if you want to overlap multiple communications, and use non-blocking communication if you want to overlap communication with computation.
  3. Use MPI_Waitall if you want to wait for multiple requests.
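
As an illustration of point 2, here is a rough sketch of the scatter/compute/gather structure with collectives. It simplifies your layout by assuming poolsize is evenly divisible by world_size and that every rank, including rank 0, evaluates a chunk of chromosomes; pool, poolsize, chromsize and world_rank are taken from your function:

    // Simplifying assumptions: poolsize % world_size == 0, and every rank
    // (including rank 0) evaluates its own share of the chromosomes.
    int rows_per_rank = poolsize / world_size;

    double* sendbuf  = NULL;   // packed pool, only needed on the root
    double* gathered = NULL;   // packed results, only needed on the root
    double* chunk    = (double*)malloc(sizeof(double)*rows_per_rank*chromsize);
    double* results  = (double*)malloc(sizeof(double)*rows_per_rank*(chromsize+1));

    if (world_rank == 0)
    {
        sendbuf  = (double*)malloc(sizeof(double)*poolsize*chromsize);
        gathered = (double*)malloc(sizeof(double)*poolsize*(chromsize+1));
        for (int i = 0; i < poolsize; i++)
            for (int j = 0; j < chromsize; j++)
                sendbuf[i*chromsize + j] = pool(i,j);   // row-major pack, as in the question
    }

    // One collective call hands each rank its rows_per_rank*chromsize doubles.
    MPI_Scatter(sendbuf, rows_per_rank*chromsize, MPI_DOUBLE,
                chunk,   rows_per_rank*chromsize, MPI_DOUBLE,
                0, MPI_COMM_WORLD);

    // ... every rank evaluates its fitness function on `chunk`
    //     and packs the chromsize+1 outputs per row into `results` ...

    // One collective call brings all results back to the root.
    MPI_Gather(results,  rows_per_rank*(chromsize+1), MPI_DOUBLE,
               gathered, rows_per_rank*(chromsize+1), MPI_DOUBLE,
               0, MPI_COMM_WORLD);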

For a better discussion, include a Minimal, Complete, and Verifiable example, and make sure to clean up your commented-out code first.



Thanks for pointing out the oversight on my part! I ended up using MPI_Scatter/MPI_Gather in my program, as you suggested, instead of the asynchronous communication routines, which I was overusing anyway.

I found the example at this Stack Overflow link immensely useful for getting MPI_Scatter/MPI_Gather working.
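
One thing that may be worth noting for anyone with the same layout: my pool size is usually not evenly divisible by the number of processes, which may be why only certain '-n' values worked for me originally. Here is a rough sketch (not my exact code) of handling the uneven split with MPI_Scatterv, where sendbuf is the packed pool on rank 0 (and may be NULL elsewhere), and poolsize and chromsize are as in the question:

    // Rough sketch: split poolsize rows unevenly across world_size ranks.
    // counts[] and displs[] are in units of doubles (rows * chromsize).
    int world_rank, world_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int* counts = (int*)malloc(world_size*sizeof(int));
    int* displs = (int*)malloc(world_size*sizeof(int));
    int offset = 0;
    for (int r = 0; r < world_size; r++)
    {
        int rows  = poolsize/world_size + (r < poolsize%world_size ? 1 : 0);
        counts[r] = rows*chromsize;
        displs[r] = offset;
        offset   += counts[r];
    }

    double* chunk = (double*)malloc(counts[world_rank]*sizeof(double));
    MPI_Scatterv(sendbuf, counts, displs, MPI_DOUBLE,
                 chunk, counts[world_rank], MPI_DOUBLE,
                 0, MPI_COMM_WORLD);

    // ... evaluate fitness on `chunk` ...
    // For MPI_Gatherv, recompute counts/displs with chromsize+1 columns per row.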
