########
Examples
########

The examples in this section provide an overall view of how to use NCCL in various environments, combining one or multiple techniques:

* using multiple GPUs per thread/process
* using multiple threads
* using multiple processes - the examples with multiple processes use MPI as the parallel runtime environment, but any multi-process system should work similarly.

Always check the return codes from the NCCL functions. For brevity, the short snippets below omit error checking; the complete working examples wrap every call in the CUDACHECK and NCCLCHECK macros.

**********************************************
Communicator Creation and Destruction Examples
**********************************************

The following examples demonstrate common use cases for NCCL initialization.

Example 1: Single Process, Single Thread, Multiple Devices
----------------------------------------------------------

In the specific case of a single process, ncclCommInitAll can be used. Here is an example creating a communicator for 4 devices; therefore, there are 4 communicator objects:

.. code:: C

  ncclComm_t comms[4];
  int devs[4] = { 0, 1, 2, 3 };
  ncclCommInitAll(comms, 4, devs);

Next, you can call NCCL collective operations using a single thread and group calls, or multiple threads, each provided with a comm object (a sketch of the threaded variant is shown below).

At the end of the program, all of the communicator objects are destroyed:

.. code:: C

  for (int i=0; i<4; i++)
    ncclCommDestroy(comms[i]);
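With the multi-threaded variant mentioned above, each thread drives exactly one communicator, so no group calls are needed. The following is a minimal sketch of such a worker thread; it is not part of the original example, the threadArgs_t structure and allReduceThread function are illustrative names, and error checking is omitted:

.. code:: C

  #include <pthread.h>
  #include "cuda_runtime.h"
  #include "nccl.h"

  //hypothetical per-thread arguments: one device, communicator, stream and buffer pair
  typedef struct {
    int dev;
    ncclComm_t comm;
    cudaStream_t stream;
    float *sendbuff, *recvbuff;
    size_t count;
  } threadArgs_t;

  //each thread manages a single communicator, so the collective can be issued
  //directly, without ncclGroupStart/ncclGroupEnd
  static void* allReduceThread(void* p) {
    threadArgs_t* a = (threadArgs_t*)p;
    cudaSetDevice(a->dev);
    ncclAllReduce((const void*)a->sendbuff, (void*)a->recvbuff, a->count,
                  ncclFloat, ncclSum, a->comm, a->stream);
    cudaStreamSynchronize(a->stream);
    return NULL;
  }

In this sketch, one thread per device would be started with pthread_create(&tid[i], NULL, allReduceThread, &args[i]) after ncclCommInitAll, and joined with pthread_join before the communicators are destroyed.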
The following code depicts a complete working example with a single process that manages multiple devices:

.. code:: C

  #include <stdio.h>
  #include <stdlib.h>
  #include "cuda_runtime.h"
  #include "nccl.h"

  #define CUDACHECK(cmd) do {                         \
    cudaError_t err = cmd;                            \
    if (err != cudaSuccess) {                         \
      printf("Failed: Cuda error %s:%d '%s'\n",       \
          __FILE__,__LINE__,cudaGetErrorString(err)); \
      exit(EXIT_FAILURE);                             \
    }                                                 \
  } while(0)

  #define NCCLCHECK(cmd) do {                         \
    ncclResult_t res = cmd;                           \
    if (res != ncclSuccess) {                         \
      printf("Failed, NCCL error %s:%d '%s'\n",       \
          __FILE__,__LINE__,ncclGetErrorString(res)); \
      exit(EXIT_FAILURE);                             \
    }                                                 \
  } while(0)

  int main(int argc, char* argv[])
  {
    ncclComm_t comms[4];

    //managing 4 devices
    int nDev = 4;
    int size = 32*1024*1024;
    int devs[4] = { 0, 1, 2, 3 };

    //allocating and initializing device buffers
    float** sendbuff = (float**)malloc(nDev * sizeof(float*));
    float** recvbuff = (float**)malloc(nDev * sizeof(float*));
    cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);

    for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(i));
      CUDACHECK(cudaMalloc((void**)sendbuff + i, size * sizeof(float)));
      CUDACHECK(cudaMalloc((void**)recvbuff + i, size * sizeof(float)));
      CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
      CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
      CUDACHECK(cudaStreamCreate(s+i));
    }

    //initializing NCCL
    NCCLCHECK(ncclCommInitAll(comms, nDev, devs));

    //calling NCCL communication API. Group API is required when using
    //multiple devices per thread
    NCCLCHECK(ncclGroupStart());
    for (int i = 0; i < nDev; ++i)
      NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum,
          comms[i], s[i]));
    NCCLCHECK(ncclGroupEnd());

    //synchronizing on CUDA streams to wait for completion of NCCL operation
    for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(i));
      CUDACHECK(cudaStreamSynchronize(s[i]));
    }

    //free device buffers
    for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(i));
      CUDACHECK(cudaFree(sendbuff[i]));
      CUDACHECK(cudaFree(recvbuff[i]));
    }

    //finalizing NCCL
    for(int i = 0; i < nDev; ++i)
      ncclCommDestroy(comms[i]);

    printf("Success \n");
    return 0;
  }

Example 2: One Device per Process or Thread
-------------------------------------------

When a process or host thread is responsible for at most one GPU, ncclCommInitRank can be used as a collective call to create a communicator. Each thread or process gets its own object.

The following code is an example of communicator creation in the context of MPI, using one device per MPI rank.

First, we retrieve MPI information about processes:

.. code:: C

  int myRank, nRanks;
  MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
  MPI_Comm_size(MPI_COMM_WORLD, &nRanks);

Next, a single rank will create a unique ID and send it to all other ranks to make sure everyone has it:

.. code:: C

  ncclUniqueId id;
  if (myRank == 0) ncclGetUniqueId(&id);
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

Finally, we create the communicator:

.. code:: C

  ncclComm_t comm;
  ncclCommInitRank(&comm, nRanks, id, myRank);

We can now call the NCCL collective operations using the communicator:

.. code:: C

  ncclAllReduce( ... , comm);

Finally, we destroy the communicator object:

.. code:: C

  ncclCommDestroy(comm);
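One practical detail not shown in the snippets above: ncclCommInitRank attaches the communicator to the CUDA device that is current when it is called, so each rank should select its GPU with cudaSetDevice before initializing NCCL. The short sketch below is not part of the original example and assumes the headers and CHECK macros of the complete examples; OMPI_COMM_WORLD_LOCAL_RANK is an Open MPI specific environment variable used only for illustration, while the complete example that follows derives the local rank from a hostname hash, which works with any launcher:

.. code:: C

  //launcher-specific: Open MPI's mpirun exports OMPI_COMM_WORLD_LOCAL_RANK,
  //the index of this process among the ranks running on the same host
  const char* env = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
  int localRank = env ? atoi(env) : 0;

  CUDACHECK(cudaSetDevice(localRank));                    //select this rank's GPU first
  NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); //comm is attached to that device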
The following code depicts a complete working example with multiple MPI processes and one device per process:

.. code:: C

  #include <stdio.h>
  #include "cuda_runtime.h"
  #include "nccl.h"
  #include "mpi.h"
  #include <unistd.h>
  #include <stdint.h>
  #include <stdlib.h>

  #define MPICHECK(cmd) do {                          \
    int e = cmd;                                      \
    if( e != MPI_SUCCESS ) {                          \
      printf("Failed: MPI error %s:%d '%d'\n",        \
          __FILE__,__LINE__, e);                      \
      exit(EXIT_FAILURE);                             \
    }                                                 \
  } while(0)

  #define CUDACHECK(cmd) do {                         \
    cudaError_t e = cmd;                              \
    if( e != cudaSuccess ) {                          \
      printf("Failed: Cuda error %s:%d '%s'\n",       \
          __FILE__,__LINE__,cudaGetErrorString(e));   \
      exit(EXIT_FAILURE);                             \
    }                                                 \
  } while(0)

  #define NCCLCHECK(cmd) do {                         \
    ncclResult_t r = cmd;                             \
    if (r!= ncclSuccess) {                            \
      printf("Failed, NCCL error %s:%d '%s'\n",       \
          __FILE__,__LINE__,ncclGetErrorString(r));   \
      exit(EXIT_FAILURE);                             \
    }                                                 \
  } while(0)

  static uint64_t getHostHash(const char* string) {
    // Based on DJB2a, result = result * 33 ^ char
    uint64_t result = 5381;
    for (int c = 0; string[c] != '\0'; c++){
      result = ((result << 5) + result) ^ string[c];
    }
    return result;
  }

  static void getHostName(char* hostname, int maxlen) {
    gethostname(hostname, maxlen);
    for (int i=0; i< maxlen; i++) {
      if (hostname[i] == '.') {
        hostname[i] = '\0';
        return;
      }
    }
  }

  int main(int argc, char* argv[])
  {
    int size = 32*1024*1024;

    int myRank, nRanks, localRank = 0;

    //initializing MPI
    MPICHECK(MPI_Init(&argc, &argv));
    MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
    MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));

    //calculating localRank based on hostname which is used in selecting a GPU
    uint64_t hostHashs[nRanks];
    char hostname[1024];
    getHostName(hostname, 1024);
    hostHashs[myRank] = getHostHash(hostname);
    MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
    for (int p=0; p<nRanks; p++) {
      if (p == myRank) break;
      if (hostHashs[p] == hostHashs[myRank]) localRank++;
    }

    ncclUniqueId id;
    ncclComm_t comm;
    float *sendbuff, *recvbuff;
    cudaStream_t s;

    //get NCCL unique ID at rank 0 and broadcast it to all others
    if (myRank == 0) ncclGetUniqueId(&id);
    MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));

    //picking a GPU based on localRank, allocating device buffers
    CUDACHECK(cudaSetDevice(localRank));
    CUDACHECK(cudaMalloc((void**)&sendbuff, size * sizeof(float)));
    CUDACHECK(cudaMalloc((void**)&recvbuff, size * sizeof(float)));
    CUDACHECK(cudaStreamCreate(&s));

    //initializing NCCL
    NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));

    //communicating using NCCL
    NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, ncclSum, comm, s));

    //completing NCCL operation by synchronizing on the CUDA stream
    CUDACHECK(cudaStreamSynchronize(s));

    //free device buffers
    CUDACHECK(cudaFree(sendbuff));
    CUDACHECK(cudaFree(recvbuff));

    //finalizing NCCL
    ncclCommDestroy(comm);

    //finalizing MPI
    MPICHECK(MPI_Finalize());

    printf("[MPI Rank %d] Success \n", myRank);
    return 0;
  }

Example 3: Multiple Devices per Thread
--------------------------------------

You can combine multiple processes (or threads) and multiple devices per process or thread. In this case, group semantics are required: each process creates several communicator objects, so both the ncclCommInitRank calls and the collective operations that use those communicators must be enclosed between ncclGroupStart and ncclGroupEnd. The following example combines MPI and multiple devices per process (= MPI rank); each process manages nDev GPUs and its nDev communicators are part of a larger group of nRanks*nDev NCCL ranks.

The following code depicts a complete working example with multiple MPI processes and multiple devices per process:

.. code:: C

  #include <stdio.h>
  #include "cuda_runtime.h"
  #include "nccl.h"
  #include "mpi.h"
  #include <unistd.h>
  #include <stdint.h>
  #include <stdlib.h>

  #define MPICHECK(cmd) do {                          \
    int e = cmd;                                      \
    if( e != MPI_SUCCESS ) {                          \
      printf("Failed: MPI error %s:%d '%d'\n",        \
          __FILE__,__LINE__, e);                      \
      exit(EXIT_FAILURE);                             \
    }                                                 \
  } while(0)

  #define CUDACHECK(cmd) do {                         \
    cudaError_t e = cmd;                              \
    if( e != cudaSuccess ) {                          \
      printf("Failed: Cuda error %s:%d '%s'\n",       \
          __FILE__,__LINE__,cudaGetErrorString(e));   \
      exit(EXIT_FAILURE);                             \
    }                                                 \
  } while(0)

  #define NCCLCHECK(cmd) do {                         \
    ncclResult_t r = cmd;                             \
    if (r!= ncclSuccess) {                            \
      printf("Failed, NCCL error %s:%d '%s'\n",       \
          __FILE__,__LINE__,ncclGetErrorString(r));   \
      exit(EXIT_FAILURE);                             \
    }                                                 \
  } while(0)

  static uint64_t getHostHash(const char* string) {
    // Based on DJB2a, result = result * 33 ^ char
    uint64_t result = 5381;
    for (int c = 0; string[c] != '\0'; c++){
      result = ((result << 5) + result) ^ string[c];
    }
    return result;
  }

  static void getHostName(char* hostname, int maxlen) {
    gethostname(hostname, maxlen);
    for (int i=0; i< maxlen; i++) {
      if (hostname[i] == '.') {
        hostname[i] = '\0';
        return;
      }
    }
  }

  int main(int argc, char* argv[])
  {
    int size = 32*1024*1024;

    int myRank, nRanks, localRank = 0;

    //initializing MPI
    MPICHECK(MPI_Init(&argc, &argv));
    MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
    MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));

    //calculating localRank which is used in selecting a GPU
    uint64_t hostHashs[nRanks];
    char hostname[1024];
    getHostName(hostname, 1024);
    hostHashs[myRank] = getHostHash(hostname);
    MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
    for (int p=0; p<nRanks; p++) {
      if (p == myRank) break;
      if (hostHashs[p] == hostHashs[myRank]) localRank++;
    }

    //each process manages nDev GPUs
    int nDev = 2;

    //allocating and initializing device buffers on the GPUs picked via localRank
    float** sendbuff = (float**)malloc(nDev * sizeof(float*));
    float** recvbuff = (float**)malloc(nDev * sizeof(float*));
    cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);

    for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(localRank*nDev + i));
      CUDACHECK(cudaMalloc((void**)sendbuff + i, size * sizeof(float)));
      CUDACHECK(cudaMalloc((void**)recvbuff + i, size * sizeof(float)));
      CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
      CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
      CUDACHECK(cudaStreamCreate(s+i));
    }

    ncclUniqueId id;
    ncclComm_t comms[nDev];

    //generating NCCL unique ID at one process and broadcasting it to all
    if (myRank == 0) ncclGetUniqueId(&id);
    MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));

    //initializing NCCL, group API is required around ncclCommInitRank as it is
    //called across multiple GPUs in each thread/process
    NCCLCHECK(ncclGroupStart());
    for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(localRank*nDev + i));
      NCCLCHECK(ncclCommInitRank(comms+i, nRanks*nDev, id, myRank*nDev + i));
    }
    NCCLCHECK(ncclGroupEnd());

    //calling NCCL communication API. Group API is required when using
    //multiple devices per thread/process
    NCCLCHECK(ncclGroupStart());
    for (int i = 0; i < nDev; ++i)
      NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum,
          comms[i], s[i]));
    NCCLCHECK(ncclGroupEnd());

    //synchronizing on CUDA streams to wait for completion of NCCL operation
    for (int i = 0; i < nDev; ++i)
      CUDACHECK(cudaStreamSynchronize(s[i]));

    //free device buffers
    for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaFree(sendbuff[i]));
      CUDACHECK(cudaFree(recvbuff[i]));
    }

    //finalizing NCCL
    for (int i = 0; i < nDev; ++i)
      ncclCommDestroy(comms[i]);

    //finalizing MPI
    MPICHECK(MPI_Finalize());

    printf("[MPI Rank %d] Success \n", myRank);
    return 0;
  }
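To make the rank mapping of Example 3 concrete, the small stand-alone program below (not part of the original documentation) prints which NCCL rank each device receives under the myRank*nDev + i scheme used above, with illustrative values of two MPI processes and two GPUs per process:

.. code:: C

  #include <stdio.h>

  int main(void) {
    int nRanks = 2;  //number of MPI processes (illustrative value)
    int nDev   = 2;  //GPUs managed by each process (illustrative value)
    for (int myRank = 0; myRank < nRanks; myRank++)
      for (int i = 0; i < nDev; i++)
        printf("MPI rank %d, local device %d -> NCCL rank %d of %d\n",
               myRank, i, myRank*nDev + i, nRanks*nDev);
    return 0;
  }

With these values, MPI rank 0 creates NCCL ranks 0 and 1, and MPI rank 1 creates NCCL ranks 2 and 3, for a communicator of 4 ranks in total.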