Examples

Source code for the examples described in this section is available in the examples folder of the NVSHMEM package.

Attribute-Based Initialization Example

The following code shows an MPI version of the simple shift program that was explained in The NVSHMEM Programming Model. It shows the use of the NVSHMEM attribute-based initialization API where the MPI communicator can be used to set up NVSHMEM.

#include <stdio.h>
#include "mpi.h"
#include "nvshmem.h"
#include "nvshmemx.h"

#define CUDA_CHECK(stmt)                                  \
do {                                                      \
    cudaError_t result = (stmt);                          \
    if (cudaSuccess != result) {                          \
        fprintf(stderr, "[%s:%d] CUDA failed with %s \n", \
         __FILE__, __LINE__, cudaGetErrorString(result)); \
        exit(-1);                                         \
    }                                                     \
} while (0)

__global__ void simple_shift(int *destination) {
    int mype = nvshmem_my_pe();
    int npes = nvshmem_n_pes();
    int peer = (mype + 1) % npes;

    nvshmem_int_p(destination, mype, peer);
}

int main (int argc, char *argv[]) {
    int mype_node, msg;
    cudaStream_t stream;
    int rank, nranks;
    MPI_Comm mpi_comm = MPI_COMM_WORLD;
    nvshmemx_init_attr_t attr;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    attr.mpi_comm = &mpi_comm;
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
    mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);

    CUDA_CHECK(cudaSetDevice(mype_node));
    CUDA_CHECK(cudaStreamCreate(&stream));
    int *destination = (int *) nvshmem_malloc (sizeof(int));

    simple_shift<<<1, 1, 0, stream>>>(destination);
    CUDA_CHECK(cudaMemcpyAsync(&msg, destination, sizeof(int),
                cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));
    printf("%d: received message %d\n", nvshmem_my_pe(), msg);

    nvshmem_free(destination);
    nvshmem_finalize();
    MPI_Finalize();
    return 0;
}

Collective Launch Example

The following code shows an example implementation of a single ring-based reduction where multiple iterations of the code, including computation, communication and synchronization are expressed as a single kernel.

This example also demonstrates the use of NVSHMEM collective launch, required when the NVSHMEM synchronization API is used from inside the CUDA kernel.

There is no MPI dependency for the example. NVSHMEM can be used to port existing MPI applications and develop new applications.

#include <stdio.h>
#include "nvshmem.h"
#include "nvshmemx.h"

#define CUDA_CHECK(stmt)                                                \
    do {                                                                \
        cudaError_t result = (stmt);                                    \
        if (cudaSuccess != result) {                                    \
            fprintf(stderr, "[%s:%d] cuda failed with %s \n",           \
                    __FILE__, __LINE__, cudaGetErrorString(result));    \
            exit(-1);                                                   \
        }                                                               \
    } while (0)

#define NVSHMEM_CHECK(stmt)                                             \
    do {                                                                \
        int result = (stmt);                                            \
        if (0 != result) {                                              \
            fprintf(stderr, "[%s:%d] nvshmem failed with error %d \n",  \
                    __FILE__, __LINE__, result);                        \
            exit(-1);                                                   \
        }                                                               \
    } while (0)


__global__ void reduce_ring (int *target, int mype, int npes) {
    int peer = (mype + 1)%npes;
    int lvalue = mype;

    for (int i=1; i<npes; i++) {
        nvshmem_int_p(target, lvalue, peer);
        nvshmem_barrier_all();
        lvalue = *target + mype;
        nvshmem_barrier_all();
    }
}

int main (int c, char *v[])
{
    int mype, npes;
    nvshmem_init();
    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();

    //application picks the device each PE will use
    CUDA_CHECK(cudaSetDevice(mype));
    double *u = (double *) nvshmem_calloc(sizeof(double), 1);

    void *args[] = {&u, &mype, &npes};
    dim3 dimBlock(1);
    dim3 dimGrid(1);

    NVSHMEM_CHECK(nvshmemx_collective_launch ((const void *)reduce_ring,
                dimGrid, dimBlock, args, 0 , 0));
    CUDA_CHECK(cudaDeviceSynchronize());

    printf("[%d of %d] run complete \n", mype, npes);

    nvshmem_free(u);

    nvshmem_finalize();
    return 0;
}

On-Stream Example

The following example shows how nvshmemx_*_on_stream functions can be used to enqueue a SHMEM operation onto a CUDA stream for execution in stream order. Specifically, the example shows the following:

  • How a collective SHMEM reduction operation can be made to wait on a preceding kernel in the stream.
  • How a kernel can be made to wait for a communication result from a previous collective SHMEM reduction operation.

The example shows one use case for relieving CPU control over GPU compute and communication.

#include <stdio.h>
#include "nvshmem.h"
#include "nvshmemx.h"

#define THRESHOLD 42
#define CORRECTION 7

#define CUDA_CHECK(stmt)                                                \
    do {                                                                \
        cudaError_t result = (stmt);                                    \
        if (cudaSuccess != result) {                                    \
            fprintf(stderr, "[%s:%d] cuda failed with %s \n",           \
                    __FILE__, __LINE__, cudaGetErrorString(result));    \
            exit(-1);                                                   \
        }                                                               \
    } while (0)

__global__ void accumulate(int *input, int *partial_sum)
{
    int index = threadIdx.x;
    if (0 == index) *partial_sum = 0;
    __syncthreads();
    atomicAdd(partial_sum, input[index]);
}

__global__ void correct_accumulate(int *input, int *partial_sum, int *full_sum)
{
    int index = threadIdx.x;
    if (*full_sum > THRESHOLD) {
        input[index] = input[index] - CORRECTION;
    }
    if (0 == index) *partial_sum = 0;
    __syncthreads();
    atomicAdd(partial_sum, input[index]);
}

int main (int c, char *v[])
{
    int mype, npes;
    int *input;
    int *partial_sum;
    int *full_sum;
    int input_nelems = 512;
    int to_all_nelems = 1;
    int PE_start = 0;
    int PE_size = 0;
    int logPE_stride = 0;
    long *pSync;
    int *pWrk;
    cudaStream_t stream;

    nvshmem_init ();
    PE_size = nvshmem_n_pes();
    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();

    CUDA_CHECK(cudaSetDevice(mype));
    CUDA_CHECK(cudaStreamCreate(&stream));

    input = (int *) nvshmem_malloc(sizeof(int) * input_nelems);
    partial_sum = (int *) nvshmem_malloc(sizeof(int));
    full_sum = (int *) nvshmem_malloc(sizeof(int));
    pWrk = (int *) nvshmem_malloc(sizeof(int) * NVSHMEM_REDUCE_MIN_WRKDATA_SIZE);
    pSync = (long *) nvshmem_malloc(sizeof(long) * NVSHMEM_REDUCE_SYNC_SIZE);

    accumulate <<<1, input_nelems, 0, stream>>> (input, partial_sum);
    nvshmemx_int_sum_to_all_on_stream(full_sum, partial_sum, to_all_nelems,
            PE_start, logPE_stride, PE_size, pWrk, pSync, stream);
    correct_accumulate <<<1, input_nelems, 0, stream>>> (input, partial_sum, full_sum);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    printf("[%d of %d] run complete \n", mype, npes);

    CUDA_CHECK(cudaStreamDestroy(stream));

    nvshmem_free(input);
    nvshmem_free(partial_sum);
    nvshmem_free(full_sum);
    nvshmem_free(pWrk);
    nvshmem_free(pSync);

    nvshmem_finalize();
    return 0;
}

Threadgroup Example

The example in this section shows how nvshmemx_collect32_block can be used to leverage threads to accelerate a SHMEM collect operation when all threads in the block depend on the result of a preceding communication operation. For this instance, partial vector sums are computed across different PEs and have a SHMEM collect operation to obtain the complete sum across PEs.

#include <stdio.h>
#include "nvshmem.h"
#include "nvshmemx.h"

#define NTHREADS 512

#define CUDA_CHECK(stmt)                                                \
    do {                                                                \
        cudaError_t result = (stmt);                                    \
        if (cudaSuccess != result) {                                    \
            fprintf(stderr, "[%s:%d] cuda failed with %s \n",           \
                    __FILE__, __LINE__, cudaGetErrorString(result));    \
            exit(-1);                                                   \
        }                                                               \
    } while (0)


__global__ void distributed_vector_sum(int *x, int *y, int *partial_sum,
        int *sum, long *pSync, int use_threadgroup, int mype, int npes)
{
    int index = threadIdx.x;
    int nelems = blockDim.x;
    int PE_start = 0;
    int logPE_stride = 0;
    partial_sum[index] = x[index] + y[index];

    if (use_threadgroup) {
        /* all threads realize the entire collect operation */
        nvshmemx_collect32_block(sum, partial_sum, nelems, PE_start,
                logPE_stride, npes, pSync);
    } else {
        /* thread 0 realizes the entire collect operation */
        if (0 == index) {
            nvshmem_collect32(sum, partial_sum, nelems, PE_start, logPE_stride,
                    npes, pSync);
        }
    }
}

int main (int c, char *v[])
{
    int mype, npes;
    int *x;
    int *y;
    int *partial_sum;
    int *sum;
    int use_threadgroup = 1;
    long *pSync;
    int nthreads = NTHREADS;

    nvshmem_init ();
    npes = nvshmem_n_pes();
    mype = nvshmem_my_pe();

    CUDA_CHECK(cudaSetDevice(mype));

    x = (int *) nvshmem_malloc(sizeof(int) * nthreads);
    y = (int *) nvshmem_malloc(sizeof(int) * nthreads);
    partial_sum = (int *) nvshmem_malloc(sizeof(int) * nthreads);
    sum = (int *) nvshmem_malloc(sizeof(int) * nthreads * npes);
    pSync = (long *) nvshmem_malloc(sizeof(long) * NVSHMEM_COLLECT_SYNC_SIZE);

    void *args[] = {&x, &y, &partial_sum, &sum, &pSync, &use_threadgroup,
        &mype, &npes};
    dim3 dimBlock(nthreads);
    dim3 dimGrid(1);
    nvshmemx_collective_launch((const void *)distributed_vector_sum, dimGrid,
            dimBlock, args, 0, 0);
    CUDA_CHECK(cudaDeviceSynchronize());

    printf("[%d of %d] run complete \n", mype, npes);

    nvshmem_free(x);
    nvshmem_free(y);
    nvshmem_free(partial_sum);
    nvshmem_free(sum);
    nvshmem_free(pSync);

    nvshmem_finalize();

    return 0;
}

Put on Block Example

In the example below, every thread in block 0 calls nvshmemx_float_put_block. Alternatively, every thread can call nvshmem_float_p, but nvshmem_float_p has a disadvantage that when the destination GPU is connected via InfiniBand, there is one RMA message for every single element, which can be detrimental to performance.

The disadvantage with using nvshmem_float_put in this case is that when the destination GPU is P2P-connected, a single thread will copy the entire data to the destination GPU. While nvshmemx_float_put_block can leverage all the threads in the block to copy the data in parallel to the destination GPU.

#include <stdio.h>
#include <assert.h>
#include "nvshmem.h"
#include "nvshmemx.h"

#define CUDA_CHECK(stmt)                                                \
    do {                                                                \
        cudaError_t result = (stmt);                                    \
        if (cudaSuccess != result) {                                    \
            fprintf(stderr, "[%s:%d] cuda failed with %s \n",           \
                    __FILE__, __LINE__, cudaGetErrorString(result));    \
            exit(-1);                                                   \
        }                                                               \
    } while (0)

#define THREADS_PER_BLOCK 1024

__global__ void set_and_shift_kernel (float *send_data, float *recv_data,
        int num_elems, int mype, int npes) {
    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;

    /* set the corresponding element of send_data */
    if (thread_idx < num_elems)
        send_data[thread_idx] = mype;

    int peer = (mype + 1) % npes;

    int block_offset = blockIdx.x * blockDim.x;
    /* All threads in the block call API with same arguments */
    nvshmemx_float_put_block(recv_data + block_offset, send_data +
            block_offset, min(blockDim.x, num_elems - block_offset), peer);
}

int main (int c, char *v[])
{
    int mype, npes;
    float *send_data, *recv_data;
    int num_elems = 8192;
    int num_blocks;

    nvshmem_init ();
    mype = nvshmem_my_pe();
    npes = nvshmem_n_pes();

    //application picks the device each PE will use
    CUDA_CHECK(cudaSetDevice(mype));
    send_data = (float *) nvshmem_malloc(sizeof(float) * num_elems);
    recv_data = (float *) nvshmem_malloc(sizeof(float) * num_elems);

    assert(num_elems % THREADS_PER_BLOCK == 0); /* for simplicity */
    num_blocks = num_elems / THREADS_PER_BLOCK;

    set_and_shift_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(send_data,
            recv_data, num_elems, mype, npes);
    CUDA_CHECK(cudaDeviceSynchronize());

    printf("[%d of %d] run complete \n", mype, npes);

    nvshmem_free(send_data);
    nvshmem_free(recv_data);

    nvshmem_finalize();
    return 0;
}