NVRTC Example#
The NVRTC example presents how to use cuRANDDx with NVRTC runtime compilation. The problem description created with cuRANDDx operators are defined only in the device code. The header file curanddx.hpp is also included only in the device code, which is passed to NVRTC.
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <vector>
#include <memory>
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "nvrtc_helper.hpp"
#include "../common.hpp"
const char* generate_kernel = R"kernel(
#include <curanddx.hpp>
// CURANDDX Operators
using RNG = decltype(curanddx::Generator<curanddx::pcg>() + curanddx::SM<RNG_SM>() + curanddx::Thread());
extern "C" __global__ void generate_kernel(unsigned int* d_out,
const unsigned long long seed,
const typename RNG::offset_type offset,
const size_t size) {
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid >= size)
return;
curanddx::uniform_bits<unsigned int> dist;
// compare with NVPL RAND PCG strict ordering
RNG rng(seed, 0, offset + tid);
d_out[tid] = dist.generate(rng);
}
)kernel";
// This example demonstrates how to use NVRTC to runtime compile a kernel using CURANDDX functions and check correctness
int main(int, char**) {
// Get current device
int current_device;
CUDA_CHECK_AND_EXIT(cudaGetDevice(¤t_device));
// Create NVRTC program
nvrtcProgram program;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&program, // program
generate_kernel, // buffer
"generate_kernel.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
// Prepare compilation options
std::vector<const char*> opts = {
"--std=c++17",
"--device-as-default-execution-space",
"--include-path=" CUDA_INCLUDE_DIR, // Add path to CUDA include directory
"--include-path=" CUDA_CCCL_INCLUDE_DIR // Add path to CCCL include directory for CTK 13.0
};
// Parse cuRANDDx include dirs
std::vector<std::string> curanddx_include_dirs = example::nvrtc::get_curanddx_include_dirs();
// Add cuBLASDx include dirs to opts
for (auto& d : curanddx_include_dirs) {
opts.push_back(d.c_str());
}
// Add GPU_ARCHITECTURE definition to opts
std::string gpu_architecture_definition =
"-DRNG_SM=" + std::to_string(example::nvrtc::get_device_architecture(current_device) * 10);
opts.push_back(gpu_architecture_definition.c_str());
// Add gpu-architecture to opts
std::string gpu_architecture_option = example::nvrtc::get_device_architecture_option(current_device);
opts.push_back(gpu_architecture_option.c_str());
nvrtcResult compileResult = nvrtcCompileProgram(program, // program
static_cast<int>(opts.size()), // numOptions
opts.data()); // options
// Obtain compilation log from the program
if (compileResult != NVRTC_SUCCESS) {
for (auto o : opts) {
std::cout << o << std::endl;
}
example::nvrtc::print_program_log(program);
std::exit(1);
}
// Obtain cubin from the program.
size_t cubin_size;
NVRTC_SAFE_CALL(nvrtcGetCUBINSize(program, &cubin_size));
auto cubin = std::make_unique<char[]>(cubin_size);
NVRTC_SAFE_CALL(nvrtcGetCUBIN(program, cubin.get()));
// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&program));
// Load the generated Cubin and get a handle to the generate_kernel
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction kernel;
CU_CHECK_AND_EXIT(cuInit(0));
CU_CHECK_AND_EXIT(cuDeviceGet(&cuDevice, current_device));
#if CUDA_VERSION >= 13000
CU_CHECK_AND_EXIT(cuCtxCreate(&context, (CUctxCreateParams*)0, 0, cuDevice));
#else
CU_CHECK_AND_EXIT(cuCtxCreate(&context, 0, cuDevice));
#endif
CU_CHECK_AND_EXIT(cuModuleLoadDataEx(&module, cubin.get(), 0, 0, 0));
CU_CHECK_AND_EXIT(cuModuleGetFunction(&kernel, module, "generate_kernel"));
// Allocate output memory
using DataType = unsigned int;
CUdeviceptr d_out;
size_t size = 5000;
CU_CHECK_AND_EXIT(cuMemAlloc(&d_out, size * sizeof(DataType)));
unsigned long long seed = 1234ULL;
unsigned long long offset = 1ULL;
// Invokes kernel
const unsigned int block_dim = 256;
const unsigned int grid_size = (size + block_dim - 1) / block_dim;
// Execute generate_kernel
void* args[] = {&d_out, &seed, &offset, &size};
CU_CHECK_AND_EXIT(cuLaunchKernel(kernel,
grid_size, // number of blocks
1,
1,
block_dim, // number of threads
1,
1,
0, // shared memory size
NULL, // NULL stream
args,
0));
CU_CHECK_AND_EXIT(cuCtxSynchronize());
// Copy to host
std::vector<DataType> h_out(size);
CU_CHECK_AND_EXIT(cuMemcpyDtoH(h_out.data(), d_out, size * sizeof(DataType)));
CU_CHECK_AND_EXIT(cuMemFree(d_out));
CU_CHECK_AND_EXIT(cuModuleUnload(module));
CU_CHECK_AND_EXIT(cuCtxDestroy(context));
// Compare hash
unsigned int xor_curand = 0x0;
for (auto i = 0U; i < size; i++) {
xor_curand ^= h_out[i];
}
if (xor_curand == 0xaa706742) {
std::cout << "Compared to the hash value: Same sequence is generated with NVPL RAND and cuRANDDx generator "
"using STRICT ordering.\n";
std::cout << "SUCCESS \n";
return 0;
} else {
std::cout
<< "FAILED: different sequence is generated with NVPL RAND and cuRANDDx generator using STRICT ordering.\n";
return 1;
}
}