Simple PCG Example

The following code uses the PCG generator to create the same sequence of 32-bit random numbers as those generated by NVPL RAND with strict ordering.

// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include <iostream>
#include <vector>

#include <curanddx.hpp>
#include "common.hpp"

#ifdef CURANDDX_EXAMPLE_NVPL_RAND_AVAILABLE
#    include <nvpl_rand.h>
// Evaluates an NVPL RAND call exactly once; if the returned status is anything other than
// NVPL_RAND_STATUS_SUCCESS, prints the source location of the failing call and terminates
// the process immediately via _Exit(EXIT_FAILURE).
#    define NVPL_RAND_CHECK(x)                                          \
        do {                                                            \
            const auto nvpl_rand_status_ = (x);                         \
            if (nvpl_rand_status_ != NVPL_RAND_STATUS_SUCCESS) {        \
                printf("Error at %s:%d\n", __FILE__, __LINE__);         \
                _Exit(EXIT_FAILURE);                                    \
            }                                                           \
        } while (0)
#endif

// This example demonstrates how to use the default PCG generator and the cuRANDDx thread-level operator to generate a
// sequence of random 32-bit numbers and compare it with the results generated by NVPL RAND (if available) using
// STRICT ordering.

// Writes one random value per thread into d_out using the cuRANDDx thread-level API.
//
// Template parameters:
//   RNG        cuRANDDx generator type; each thread constructs its own instance as
//              RNG(seed, 0, offset + tid).
//   data_type  element type of d_out and of the curanddx::uniform_bits distribution.
//
// Parameters:
//   d_out   output buffer written by the kernel; must hold at least `size` elements
//   seed    seed passed unchanged to every thread's generator
//   offset  base offset; the thread with global index `tid` uses `offset + tid`
//   size    number of elements to generate
//
// Launch layout: 1D grid of 1D blocks providing at least `size` threads in total;
// threads whose global index is >= size return without writing.
template<class RNG, typename data_type>
__global__ void generate_kernel(data_type*                      d_out,
                                const unsigned long long        seed,
                                const typename RNG::offset_type offset,
                                const size_t                    size) {
    // Compute the global index in size_t: the product blockDim.x * blockIdx.x is otherwise evaluated
    // in 32 bits and stored in an int, which wraps for large grids and forces a signed/unsigned
    // comparison against `size`.
    const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (tid >= size) {
        return;
    }

    curanddx::uniform_bits<data_type> dist;

    // compare with NVPL RAND PCG strict ordering: element i of the output is produced by a
    // generator positioned at offset + i
    RNG rng(seed, 0, offset + static_cast<typename RNG::offset_type>(tid));

    d_out[tid] = dist.generate(rng);
}

// Runs the PCG thread-level example for the SM architecture `Arch`.
//
// Generates 5000 values on the device with generate_kernel (seed 1234, offset 1), copies them to the
// host and validates them:
//   * when CURANDDX_EXAMPLE_NVPL_RAND_AVAILABLE is defined, against a reference sequence produced by
//     NVPL RAND with the same seed/offset and STRICT ordering; the XOR of the reference is printed so
//     it can serve as the expected hash for builds without NVPL RAND;
//   * otherwise, against a precomputed XOR hash of the expected sequence.
//
// Returns 0 on success, 1 if the sequence differs from the NVPL RAND reference, and -1 if the hash
// comparison fails. CUDA and NVPL RAND API failures terminate the process through the check macros.
template<unsigned int Arch>
int simple_pcg_thread_api() {
    using RNG = decltype(curanddx::Generator<curanddx::pcg>() + curanddx::SM<Arch>() + curanddx::Thread());

    using DataType = typename RNG::bitgenerator_result_type;

    // Allocate output memory
    DataType*    d_out = nullptr;
    const size_t size  = 5000;
    CUDA_CHECK_AND_EXIT(cudaMalloc((void**)&d_out, size * sizeof(DataType)));

    unsigned long long seed   = 1234ULL;
    unsigned long long offset = 1ULL;

    // Invokes kernel: one thread per output element, rounded up to whole blocks
    const unsigned int block_dim = 256;
    const unsigned int grid_size = (size + block_dim - 1) / block_dim;

    generate_kernel<RNG, DataType><<<grid_size, block_dim, 0>>>(d_out, seed, offset, size);
    CUDA_CHECK_AND_EXIT(cudaPeekAtLastError());
    CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize());

    // Copy the results back and release the device buffer before validation
    std::vector<DataType> h_out(size);
    CUDA_CHECK_AND_EXIT(cudaMemcpy(h_out.data(), d_out, size * sizeof(DataType), cudaMemcpyDeviceToHost));
    CUDA_CHECK_AND_EXIT(cudaFree(d_out));

#ifdef CURANDDX_EXAMPLE_NVPL_RAND_AVAILABLE
    // nvpl RAND
    std::vector<DataType> h_ref(size);
    nvplRandGenerator_t   gen;
    const auto            generator_type = NVPL_RAND_RNG_PSEUDO_PCG;

    // Check generator creation too: using `gen` after a failed creation would be undefined
    NVPL_RAND_CHECK(nvplRandMTCreateGeneratorDefault(&gen, generator_type));
    NVPL_RAND_CHECK(nvplRandSetPseudoRandomGeneratorSeed(gen, seed));
    NVPL_RAND_CHECK(nvplRandSetGeneratorOffset(gen, offset));
    NVPL_RAND_CHECK(nvplRandMTSetGeneratorOrdering(gen, NVPL_RAND_ORDERING_STRICT));

    // Generate
    NVPL_RAND_CHECK(nvplRandGenerate(gen, h_ref.data(), size));
    NVPL_RAND_CHECK(nvplRandDestroyGenerator(gen));

    // Compare Results
    if (h_out == h_ref) {
        std::cout << "SUCCESS: Same sequence is generated with cuRANDDx and nvpl RAND API using STRICT ordering.\n";
    } else {
        // Report at most the first 10 mismatching elements, then stop scanning
        int count {0};
        for (auto i = 0U; i < size && count < 10; i++) {
            if (h_out[i] != h_ref[i]) {
                printf("array_curanddx[%u] = %u, array_nvplrand[%u] = %u \n", i, h_out[i], i, h_ref[i]);
                count++;
            }
        }
        // The reference generator is configured with STRICT ordering, so report that ordering
        std::cout << "FAILED: Different sequence is generated with cuRANDDx and NVPL RAND Host API using STRICT "
                     "ordering.\n";
        return 1;
    }

    // compute hash to be used if nvpl rand is not available
    unsigned int xor_nvplrand = 0x0;
    for (auto i = 0U; i < size; i++) {
        xor_nvplrand ^= h_ref[i];
    }
    std::cout << "NVPL RAND reference xor output is " << std::hex << xor_nvplrand << std::endl;
    return 0;

#else
    // Compare hash: XOR of all generated values against the value printed by a build with NVPL RAND
    unsigned int xor_curand = 0x0;
    for (auto i = 0U; i < size; i++) {
        xor_curand ^= h_out[i];
    }
    if (xor_curand == 0xaa706742) {
        std::cout << "Compared to the hash value: Same sequence is generated with NVPL RAND and cuRANDDx generator "
                     "using STRICT ordering.\n";
        std::cout << "SUCCESS \n";
        return 0;
    } else {
        std::cout
            << "FAILED: different sequence is generated with NVPL RAND and cuRANDDx generator using STRICT ordering.\n";
        return -1;
    }
#endif
}

// Callable wrapper around simple_pcg_thread_api<Arch>; this is the class template handed to
// example::sm_runner in main. Calling it runs the example and forwards its status code.
template<unsigned int Arch>
struct simple_pcg_thread_api_functor {
    int operator()() {
        const int status = simple_pcg_thread_api<Arch>();
        return status;
    }
};

// Program entry point. Command-line arguments are unused; the process exit code is whatever
// example::sm_runner returns for simple_pcg_thread_api_functor.
// NOTE(review): sm_runner is declared in common.hpp (not visible here); presumably it instantiates
// the functor for the SM architecture of the current device — confirm.
int main(int /*argc*/, char** /*argv*/) {
    const int exit_code = example::sm_runner<simple_pcg_thread_api_functor>();
    return exit_code;
}