Quick start
The following code is a standalone example showing how to use cuFFTMp.
#include <cufftMp.h>
#include <mpi.h>
#include <vector>
#include <complex>
#include <cstdio>

int main() {
    // Initialize MPI, pick a device
    MPI_Init(NULL, NULL);
    MPI_Comm comm = MPI_COMM_WORLD;
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int ndevices;
    cudaGetDeviceCount(&ndevices);
    cudaSetDevice(rank % ndevices);

    // Allocate CPU memory
    size_t N = 32;
    std::vector<std::complex<float>> cpu_data((N / size) * N * N, {314, 0});

    // Create plan, attach to communicator, make plan
    cufftHandle plan = 0;
    size_t workspace;
    cufftCreate(&plan);
    cufftMpAttachComm(plan, CUFFT_COMM_MPI, &comm);
    cufftMakePlan3d(plan, N, N, N, CUFFT_C2C, &workspace);

    // Allocate GPU memory, copy CPU data to GPU
    cudaLibXtDesc *desc;
    cufftXtMalloc(plan, &desc, CUFFT_XT_FORMAT_INPLACE);
    cufftXtMemcpy(plan, desc, cpu_data.data(), CUFFT_COPY_HOST_TO_DEVICE);

    // Run C2C FFT Forward
    cufftXtExecDescriptor(plan, desc, desc, CUFFT_FORWARD);

    // Copy back to CPU
    cufftXtMemcpy(plan, cpu_data.data(), desc, CUFFT_COPY_DEVICE_TO_HOST);

    // Data in cpu_data is now distributed along the Y dimension, of size N * (N / size) * N
    // Test by comparing the very first entry on rank 0
    if (rank == 0) {
        if (cpu_data[0].real() == 314 * N * N * N) {
            printf("PASSED\n");
        } else {
            printf("FAILED\n");
        }
    }

    // Cleanup
    cufftXtFree(desc);
    cufftDestroy(plan);
    MPI_Finalize();
}
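The example above omits error checking to stay short. As a rough sketch (the CHECK_CUDA and CHECK_CUFFT macro names below are this guide's own illustration, not part of the cuFFTMp API), each runtime and library call can be checked against cudaSuccess and CUFFT_SUCCESS:

#include <cstdio>
#include <cstdlib>

// Illustrative error-checking helpers (hypothetical names, not cuFFTMp API).
#define CHECK_CUDA(call) do {                                                   \
    cudaError_t err_ = (call);                                                  \
    if (err_ != cudaSuccess) {                                                  \
        fprintf(stderr, "CUDA error %d at %s:%d\n", (int)err_, __FILE__, __LINE__); \
        exit(EXIT_FAILURE);                                                     \
    }                                                                           \
} while (0)

#define CHECK_CUFFT(call) do {                                                  \
    cufftResult res_ = (call);                                                  \
    if (res_ != CUFFT_SUCCESS) {                                                \
        fprintf(stderr, "cuFFT error %d at %s:%d\n", (int)res_, __FILE__, __LINE__); \
        exit(EXIT_FAILURE);                                                     \
    }                                                                           \
} while (0)

// Usage, for example:
// CHECK_CUDA(cudaSetDevice(rank % ndevices));
// CHECK_CUFFT(cufftMakePlan3d(plan, N, N, N, CUFFT_C2C, &workspace));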
Note
More examples can be found on GitHub.
This example computes a 32 x 32 x 32 complex-to-complex FFT over size GPUs. It does the following:
It initializes MPI and picks a device for the current process.
It allocates a (32 / size) x 32 x 32 CPU array on each process. Each process owns (32 / size) x 32 x 32 elements, i.e., a Slab of the global data, distributed along X.
It creates a cuFFT plan, enables the multi-process API with cufftMpAttachComm and makes the plan.
It allocates GPU memory and copies the local CPU buffer to the local GPU buffer.
It executes the C2C plan.
It copies data back from GPU to CPU. At this point, every process owns 32 x (32 / size) x 32 elements, i.e., a Slab distributed along Y (a sketch of this layout and of the expected output value follows below).
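To make the slab layout concrete, the following sketch (this guide's own illustration, not a cuFFTMp API) computes each rank's local extents before and after the forward transform, assuming N is divisible by the number of ranks. It also shows why the check on rank 0 expects 314 * N * N * N: the input is a constant field, so the DC bin of the unnormalized forward FFT equals the sum of all N^3 input values, and that bin belongs to rank 0.

#include <cstddef>
#include <cstdio>

// Local slab extents for this example's layout (illustration only).
// Input  (CUFFT_XT_FORMAT_INPLACE): distributed along X, each rank owns (N / size) x N x N.
// Output (after CUFFT_FORWARD):     distributed along Y, each rank owns N x (N / size) x N.
struct Slab { size_t nx, ny, nz; };

Slab input_slab (size_t N, int size) { return { N / size, N, N }; }
Slab output_slab(size_t N, int size) { return { N, N / size, N }; }

int main() {
    size_t N = 32;
    int size = 2;  // number of MPI ranks, e.g. mpirun -n 2
    Slab in = input_slab(N, size), out = output_slab(N, size);
    printf("input  slab: %zu x %zu x %zu per rank\n", in.nx, in.ny, in.nz);
    printf("output slab: %zu x %zu x %zu per rank\n", out.nx, out.ny, out.nz);

    // Constant input of 314 everywhere: the DC bin (kx = ky = kz = 0), owned by rank 0,
    // is the sum over all N^3 points, i.e. 314 * N * N * N.
    printf("expected DC value: %zu\n", (size_t)314 * N * N * N);
}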
The example can then be compiled and run like this:
$ nvcc app.cu -I/path/to/cufftMp/include -L/path/to/cufftMp/lib -lcufftMp -L/path/to/nvshmem/lib -lnvshmem_host -I/path/to/mpi/include -L/path/to/mpi/lib -lmpi -o app
$ mpirun -n 2 ./app
PASSED
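At runtime, the dynamic linker also needs to find the cuFFTMp and NVSHMEM shared libraries. If they are installed in non-default locations, one option (the paths below are placeholders matching the link line above) is to extend LD_LIBRARY_PATH before launching:
$ export LD_LIBRARY_PATH="/path/to/cufftMp/lib:/path/to/nvshmem/lib:$LD_LIBRARY_PATH"
$ mpirun -n 2 ./app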