Quick startΒΆ
The following code is a standalone example showing how to use cuFFTMp.
#include <cufftMp.h>
#include <mpi.h>
#include <vector>
#include <complex>
int main() {
    // Initialize MPI, pick a device
    MPI_Init(NULL, NULL);
    MPI_Comm comm = MPI_COMM_WORLD;
    
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    
    int ndevices;
    cudaGetDeviceCount(&ndevices);
    cudaSetDevice(rank % ndevices);
    
    // Allocate CPU memory
    size_t N = 32;
    std::vector<std::complex<float>> cpu_data((N / size) * N * N, {314, 0});
    
    // Create plan, attach to communicator, make plan
    cufftHandle plan = 0;
    size_t workspace;
    cufftCreate(&plan);
    cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm);
    cufftMakePlan3d(plan, N, N, N, CUFFT_C2C, &workspace);
    
    // Allocate memory, copy CPU data to GPU
    cudaLibXtDesc *desc;
    cufftXtMalloc(plan, &desc, CUFFT_XT_FORMAT_INPLACE);
    cufftXtMemcpy(plan, desc, cpu_data.data(), CUFFT_COPY_HOST_TO_DEVICE);
    
    // Run C2C FFT Forward
    cufftXtExecDescriptor(plan, desc, desc, CUFFT_FORWARD);
    
    // Copy back to CPU
    cufftXtMemcpy(plan, cpu_data.data(), desc, CUFFT_COPY_DEVICE_TO_HOST);
    
    // Data in cpu_data is now distributed along the Y dimension, of size N * (N / size) * N
    // Test by comparing the very first entry on rank 0
    if(rank == 0) {
        if(cpu_data[0].real() == 314 * N * N * N) {
            printf("PASSED\n");
        } else {
            printf("FAILED\n");
        }
    }
    
    // Cleanup
    cufftXtFree(desc);
    cufftDestroy(plan);
    MPI_Finalize();
    
}
Note
More examples can be found on Github.
This example compute a 32 x 32 x 32 complex-to-complex FFT over size GPUs.
It does the following
- It initializes MPI, and picks a device for the current process 
- It allocates a - (32 / size) x 32 x 32CPU array on each process. Each process owns- (32 / size) x 32 x 32elements (i.e., a Slab of the global data, distributed along X).
- It creates a cuFFT plan, enables the multi-process API with - cufftMpAttachand makes the plan.
- It allocates GPU memory and copies the local CPU buffer to the local GPU buffer 
- It executes the C2C plan. 
- It copies data back from GPU to CPU. At this point, every process owns - 32 x (32 / size) x 32elements, i.e., a Slab distributed along Y.
The example can then easily be compiled and ran like this
$ nvcc app.cu -I/path/to/cufftMp/include -L/path/to/cufftMp/lib -lcufftMp -I/path/to/mpi/include -L/path/to/mpi/lib -lmpi -o app
$ mpirun -n 2 ./app
PASSED