NVRTC Example#
The NVRTC example presents how to use cuRANDDx with NVRTC runtime compilation. The problem description created with cuRANDDx operators are defined only in the device code. The header file curanddx.hpp
is also included only in the device code, which is passed to NVRTC.
// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <iostream>
#include <vector>
#include <memory>
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "nvrtc_helper.hpp"
#include "common.hpp"
const char* generate_kernel = R"kernel(
#include <curanddx.hpp>
// CURANDDX Operators
using RNG = decltype(curanddx::Generator<curanddx::pcg>() + curanddx::SM<RNG_SM>() + curanddx::Thread());
extern "C" __global__ void generate_kernel(unsigned int* d_out,
const unsigned long long seed,
const typename RNG::offset_type offset,
const size_t size) {
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= size)
return;
curanddx::uniform_bits<unsigned int> dist;
// compare with NVPL RAND PCG strict ordering
RNG rng(seed, 0, offset + tid);
d_out[tid] = dist.generate(rng);
}
)kernel";
// This example demonstrates how to use NVRTC to runtime compile a kernel using CURANDDX functions and check correctness
int main(int, char**) {
// Get current device
int current_device;
CU_CHECK_AND_EXIT(cudaGetDevice(¤t_device));
// Create NVRTC program
nvrtcProgram program;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&program, // program
generate_kernel, // buffer
"generate_kernel.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
// Prepare compilation options
std::vector<const char*> opts = {
"--std=c++17",
"--device-as-default-execution-space",
"--include-path=" CUDA_INCLUDE_DIR // Add path to CUDA include directory
};
// Parse cuRANDDx include dirs
std::vector<std::string> curanddx_include_dirs = example::nvrtc::get_curanddx_include_dirs();
// Add cuBLASDx include dirs to opts
for (auto& d : curanddx_include_dirs) {
opts.push_back(d.c_str());
}
// Add GPU_ARCHITECTURE definition to opts
std::string gpu_architecture_definition =
"-DRNG_SM=" + std::to_string(example::nvrtc::get_device_architecture(current_device) * 10);
opts.push_back(gpu_architecture_definition.c_str());
// Add gpu-architecture to opts
std::string gpu_architecture_option = example::nvrtc::get_device_architecture_option(current_device);
opts.push_back(gpu_architecture_option.c_str());
nvrtcResult compileResult = nvrtcCompileProgram(program, // program
static_cast<int>(opts.size()), // numOptions
opts.data()); // options
// Obtain compilation log from the program
if (compileResult != NVRTC_SUCCESS) {
for (auto o : opts) {
std::cout << o << std::endl;
}
example::nvrtc::print_program_log(program);
std::exit(1);
}
// Obtain cubin from the program.
size_t cubin_size;
NVRTC_SAFE_CALL(nvrtcGetCUBINSize(program, &cubin_size));
auto cubin = std::make_unique<char[]>(cubin_size);
NVRTC_SAFE_CALL(nvrtcGetCUBIN(program, cubin.get()));
// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&program));
// Load the generated Cubin and get a handle to the generate_kernel
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction kernel;
CU_CHECK_AND_EXIT(cuInit(0));
CU_CHECK_AND_EXIT(cuDeviceGet(&cuDevice, current_device));
CU_CHECK_AND_EXIT(cuCtxCreate(&context, 0, cuDevice));
CU_CHECK_AND_EXIT(cuModuleLoadDataEx(&module, cubin.get(), 0, 0, 0));
CU_CHECK_AND_EXIT(cuModuleGetFunction(&kernel, module, "generate_kernel"));
// Allocate output memory
using DataType = unsigned int;
CUdeviceptr d_out;
size_t size = 5000;
CU_CHECK_AND_EXIT(cuMemAlloc(&d_out, size * sizeof(DataType)));
unsigned long long seed = 1234ULL;
unsigned long long offset = 1ULL;
// Invokes kernel
const unsigned int block_dim = 256;
const unsigned int grid_size = (size + block_dim - 1) / block_dim;
// Execute generate_kernel
void* args[] = {&d_out, &seed, &offset, &size};
CU_CHECK_AND_EXIT(cuLaunchKernel(kernel,
grid_size, // number of blocks
1,
1,
block_dim, // number of threads
1,
1,
0, // shared memory size
NULL, // NULL stream
args,
0));
CU_CHECK_AND_EXIT(cuCtxSynchronize());
// Copy to host
std::vector<DataType> h_out(size);
CU_CHECK_AND_EXIT(cuMemcpyDtoH(h_out.data(), d_out, size * sizeof(DataType)));
CU_CHECK_AND_EXIT(cuMemFree(d_out));
// Compare hash
unsigned int xor_curand = 0x0;
for (auto i = 0U; i < size; i++) {
xor_curand ^= h_out[i];
}
if (xor_curand == 0xaa706742) {
std::cout << "Compared to the hash value: Same sequence is generated with NVPL RAND and cuRANDDx generator "
"using STRICT ordering.\n";
std::cout << "SUCCESS \n";
return 0;
} else {
std::cout
<< "FAILED: different sequence is generated with NVPL RAND and cuRANDDx generator using STRICT ordering.\n";
return -1;
}
}