Quick startΒΆ

The following code is a standalone example showing how to use cuFFTMp.

#include <cufftMp.h>
#include <mpi.h>
#include <vector>
#include <complex>

int main() {

    // Initialize MPI, pick a device
    MPI_Init(NULL, NULL);
    MPI_Comm comm = MPI_COMM_WORLD;
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int ndevices;
    cudaSetDevice(rank % ndevices);
    // Allocate CPU memory
    size_t N = 32;
    std::vector<std::complex<float>> cpu_data((N / size) * N * N, {314, 0});
    // Create plan, attach to communicator, make plan
    cufftHandle plan = 0;
    size_t workspace;
    cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm);
    cufftMakePlan3d(plan, N, N, N, CUFFT_C2C, &workspace);
    // Allocate memory, copy CPU data to GPU
    cudaLibXtDesc *desc;
    cufftXtMalloc(plan, &desc, CUFFT_XT_FORMAT_INPLACE);
    cufftXtMemcpy(plan, desc, cpu_data.data(), CUFFT_COPY_HOST_TO_DEVICE);
    // Run C2C FFT Forward
    cufftXtExecDescriptor(plan, desc, desc, CUFFT_FORWARD);
    // Copy back to CPU
    cufftXtMemcpy(plan, cpu_data.data(), desc, CUFFT_COPY_DEVICE_TO_HOST);
    // Data in cpu_data is now distributed along the Y dimension, of size N * (N / size) * N
    // Test by comparing the very first entry on rank 0
    if(rank == 0) {
        if(cpu_data[0].real() == 314 * N * N * N) {
        } else {
    // Cleanup


More examples can be found on Github.

This example computes a 32 x 32 x 32 complex-to-complex FFT over size GPUs. It does the following:

  1. It initializes MPI, and picks a device for the current process.

  2. It allocates a (32 / size) x 32 x 32 CPU array on each process. Each process owns (32 / size) x 32 x 32 elements (i.e., a Slab of the global data, distributed along X).

  3. It creates a cuFFT plan, enables the multi-process API with cufftMpAttach and makes the plan.

  4. It allocates GPU memory and copies the local CPU buffer to the local GPU buffer.

  5. It executes the C2C plan.

  6. It copies data back from GPU to CPU. At this point, every process owns 32 x (32 / size) x 32 elements, i.e., a Slab distributed along Y.

The example can then easily be compiled and ran like this:

$ nvcc app.cu -I/path/to/cufftMp/include -L/path/to/cufftMp/lib -lcufftMp  -L/path/to/nvshmem/lib -lnvshmem_host -I/path/to/mpi/include -L/path/to/mpi/lib -lmpi -o app
$ mpirun -n 2 ./app