# NVRTC + nvJitLink Example
| Location | Description |
| --- | --- |
| `example/nvcompdx/05_lz4_cpu_and_nvrtc` | CPU compression, warp-level GPU NVRTC + nvJitLink LZ4 decompression |
This example demonstrates how to use nvCOMPDx with NVRTC runtime compilation and nvJitLink runtime linking.
The problem description created with nvCOMPDx operators is defined only in the device code; the header file `nvcompdx.hpp`
is likewise included only there. The device code is passed to NVRTC to produce LTO IR (link-time-optimizable intermediate representation). nvJitLink is then used to link the generated LTO IR with nvCOMPDx's LTO library, optimize the linked LTO IR modules, and generate the cubin for the specified GPU architecture.
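At a high level the flow has two stages: NVRTC turns the kernel source string into LTO IR, and nvJitLink combines that LTO IR with the nvCOMPDx device library into a loadable cubin. The sketch below isolates just these calls; error handling is omitted, and `kernel_source`, `arch_flag`, and `nvcompdx_lib_path` are placeholder inputs, not names from the example:

```cpp
// Minimal sketch of the NVRTC -> LTO IR -> nvJitLink -> cubin flow.
// Error checking omitted; all three parameters are placeholders.
#include <nvrtc.h>
#include <nvJitLink.h>
#include <vector>

std::vector<char> compile_and_link(const char* kernel_source,
                                   const char* arch_flag, // e.g. "-arch=sm_90"
                                   const char* nvcompdx_lib_path) {
  // 1) Compile the device code to LTO IR (-dlto together with -arch=sm_XX).
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, kernel_source, "kernel.cu", 0, nullptr, nullptr);
  const char* opts[] = {"--std=c++17", "-dlto", "-rdc=true", arch_flag};
  nvrtcCompileProgram(prog, 4, opts);
  size_t lto_size;
  nvrtcGetLTOIRSize(prog, &lto_size);
  std::vector<char> ltoir(lto_size);
  nvrtcGetLTOIR(prog, ltoir.data());
  nvrtcDestroyProgram(&prog);

  // 2) Link the LTO IR against nvCOMPDx's LTO library and emit a cubin.
  nvJitLinkHandle linker;
  const char* lopts[] = {"-lto", arch_flag};
  nvJitLinkCreate(&linker, 2, lopts);
  nvJitLinkAddData(linker, NVJITLINK_INPUT_LTOIR, ltoir.data(), lto_size, "kernel");
  nvJitLinkAddFile(linker, NVJITLINK_INPUT_LIBRARY, nvcompdx_lib_path);
  nvJitLinkComplete(linker);
  size_t cubin_size;
  nvJitLinkGetLinkedCubinSize(linker, &cubin_size);
  std::vector<char> cubin(cubin_size);
  nvJitLinkGetLinkedCubin(linker, cubin.data());
  nvJitLinkDestroy(&linker);
  return cubin; // load with cuModuleLoadDataEx
}
```

The full example below additionally forwards include paths and `-D` definitions to NVRTC, and can ingest the nvCOMPDx fatbinary instead of the LTO library.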
The compressed buffer is generated on the CPU using the lz4
library. This library must be installed separately on the system to build the example. Detailed installation instructions are provided on the GitHub example page.
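For context, the core of the CPU-side step is the lz4 HC API: `LZ4_compressBound()` gives the worst-case output size for a chunk, and `LZ4_compress_HC()` returns the number of compressed bytes actually written (0 on failure). A standalone sketch, separate from the example's batching code:

```cpp
// Standalone sketch: compress a single buffer with the lz4 HC API.
#include <lz4.h>
#include <lz4hc.h>
#include <stdexcept>
#include <vector>

std::vector<char> compress_chunk_hc(const char* src, int src_size) {
  std::vector<char> dst(LZ4_compressBound(src_size)); // worst-case output size
  const int written = LZ4_compress_HC(src, dst.data(), src_size,
                                      static_cast<int>(dst.size()),
                                      12 /* compression level */);
  if (written == 0) {
    throw std::runtime_error("LZ4 HC compression failed");
  }
  dst.resize(written); // keep only the bytes actually produced
  return dst;
}
```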
The example is located in the example/nvcompdx/05_lz4_cpu_and_nvrtc
folder of the nvCOMPDx package.
```cpp
// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include <lz4.h>
#include <lz4hc.h>
#include <cuda.h>
#include <nvrtc.h>
#include <nvJitLink.h>
#include <nvcompdx.hpp>
// Standard headers used directly in this file
#include <cstring>   // strcmp
#include <iomanip>   // std::setprecision
#include <memory>    // std::make_unique
#include <numeric>   // std::accumulate
#include "../common/batch_data.hpp"
#include "../common/util_nvrtc.hpp"
using namespace nvcompdx;
// This sample demonstrates the usage of the warp-level device API for
// LZ4 GPU decompression. The decompression kernel is compiled and linked
// during runtime. The compression happens through the host-side
// lz4 CPU library.
// LZ4 decompression kernel, using the preconfigured decompressor
// 1 warp per chunk, but multiple chunks per thread block
const char* decomp_kernel = R"kernel(
#include <nvcompdx.hpp>

using namespace nvcompdx;

extern "C" __global__ void decomp_warp_kernel(
    size_t batch_size,
    const void* const* comp_chunks,
    void* const* uncomp_chunks,
    const size_t* comp_chunk_sizes,
    size_t* decomp_chunk_sizes)
{
  // One warp (32 threads) handles one chunk; a block processes several chunks.
  const unsigned int global_chunk_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
  const unsigned int local_chunk_id = threadIdx.x / 32;
  if (global_chunk_id >= batch_size) {
    return;
  }

  // The ALGORITHM, DATATYPE, MAX_UNCOMP_CHUNK_SIZE, and ARCH macros are
  // supplied as -D definitions when this kernel is compiled with NVRTC.
  using decompressor_type =
      decltype(Algorithm<algorithm::ALGORITHM>() +
               DataType<datatype::DATATYPE>() +
               MaxUncompChunkSize<MAX_UNCOMP_CHUNK_SIZE>() +
               Direction<direction::decompress>() +
               Warp() +
               SM<ARCH>());
  auto decompressor = decompressor_type();

  // Each warp works out of its own slice of the dynamic shared memory buffer.
  constexpr auto shmem_size_warp = decompressor.shmem_size_group();
  extern __shared__ __align__(decompressor.shmem_alignment()) uint8_t shared_scratch_decomp_buffer[];

  decompressor.execute(
      comp_chunks[global_chunk_id],
      uncomp_chunks[global_chunk_id],
      comp_chunk_sizes[global_chunk_id],
      decomp_chunk_sizes + global_chunk_id,
      shared_scratch_decomp_buffer + shmem_size_warp * local_chunk_id,
      nullptr /* no global scratch buffer needed for LZ4 */);
}
)kernel";
static std::string get_device_architecture_option(CUdevice& device)
{
  // Note:
  // -arch=compute_... will generate PTX
  // -arch=sm_...      will generate SASS
  // -arch=sm_... with -dlto will generate LTO IR
  std::string option = "-arch=sm_" + std::to_string(get_device_architecture(device));
  return option;
}
static std::vector<std::string> get_comp_include_dirs()
{
  std::vector<std::string> comp_include_dirs_array;
#ifdef NVCOMPDX_INCLUDE_DIRS
  {
    // NVCOMPDX_INCLUDE_DIRS is a comma-separated list of include directories
    std::string comp_include_dirs = NVCOMPDX_INCLUDE_DIRS;
    std::string delim = ",";
    size_t start = 0U;
    size_t end = comp_include_dirs.find(delim);
    while (end != std::string::npos) {
      comp_include_dirs_array.push_back("--include-path=" +
                                        comp_include_dirs.substr(start, end - start));
      start = end + delim.length();
      end = comp_include_dirs.find(delim, start);
    }
    comp_include_dirs_array.push_back("--include-path=" +
                                      comp_include_dirs.substr(start, end - start));
  }
#endif // NVCOMPDX_INCLUDE_DIRS
#ifdef COMMONDX_INCLUDE_DIR
  comp_include_dirs_array.push_back("--include-path=" + std::string(COMMONDX_INCLUDE_DIR));
#endif // COMMONDX_INCLUDE_DIR
#ifdef CUTLASS_INCLUDE_DIR
  comp_include_dirs_array.push_back("--include-path=" + std::string(CUTLASS_INCLUDE_DIR));
#endif // CUTLASS_INCLUDE_DIR
  {
    const char* env_ptr = std::getenv("NVCOMPDX_EXAMPLE_COMMONDX_INCLUDE_DIR");
    if (env_ptr != nullptr) {
      comp_include_dirs_array.push_back("--include-path=" + std::string(env_ptr));
    }
  }
  {
    const char* env_ptr = std::getenv("NVCOMPDX_EXAMPLE_CUTLASS_INCLUDE_DIR");
    if (env_ptr != nullptr) {
      comp_include_dirs_array.push_back("--include-path=" + std::string(env_ptr));
    }
  }
  {
    const char* env_ptr = std::getenv("NVCOMPDX_EXAMPLE_NVCOMPDX_INCLUDE_DIR");
    if (env_ptr != nullptr) {
      comp_include_dirs_array.push_back("--include-path=" + std::string(env_ptr));
    }
  }
  {
    const char* env_ptr = std::getenv("NVCOMPDX_EXAMPLE_CUDA_INCLUDE_DIR");
    if (env_ptr != nullptr) {
      comp_include_dirs_array.push_back("--include-path=" + std::string(env_ptr));
      comp_include_dirs_array.push_back("--include-path=" + std::string(env_ptr) + "/cuda/std");
    }
  }
  return comp_include_dirs_array;
}
// Run the example: CPU compression, GPU decompression, validation
template<unsigned int Arch>
static int run_nvrtc_example(const std::vector<std::vector<char>>& data)
{
  assert(!data.empty());

  size_t total_bytes = 0;
  for (const std::vector<char>& part : data) {
    total_bytes += part.size();
  }

  std::cout << "----------" << std::endl;
  std::cout << "files: " << data.size() << std::endl;
  std::cout << "uncompressed (B): " << total_bytes << std::endl;

  // Compile-time (de)compression parameters
  constexpr size_t num_warps_per_chunk = 1;
  constexpr size_t num_chunks_per_block = 4;
  constexpr size_t num_warps_per_block = num_warps_per_chunk * num_chunks_per_block;
  constexpr unsigned int block_size = static_cast<unsigned int>(num_warps_per_block * 32);
  constexpr size_t chunk_size = 1 << 16; // [bytes]

  // Build up input batch on CPU
  BatchDataCPU input_data_cpu(data, chunk_size);
  size_t batch_size = input_data_cpu.batch_size();
  std::cout << "chunks: " << batch_size << std::endl;

  // Allocate and prepare output/compressed batch
  BatchDataCPU compressed_data_cpu(LZ4_compressBound(chunk_size), batch_size);

  // Compress on the CPU, looping over the chunks one by one
  for (size_t i = 0; i < batch_size; ++i) {
    // could use LZ4_compress_default or LZ4_compress_fast instead
    const int size = LZ4_compress_HC(
        static_cast<const char*>(input_data_cpu.chunk_ptrs()[i]),
        static_cast<char*>(compressed_data_cpu.chunk_ptrs()[i]),
        static_cast<int>(input_data_cpu.chunk_sizes()[i]),
        static_cast<int>(compressed_data_cpu.chunk_sizes()[i]),
        12 /* compression level */);
    if (size == 0) {
      throw std::runtime_error(
          "LZ4 CPU failed to compress chunk " + std::to_string(i) + ".");
    }
    // Set the actual compressed size
    compressed_data_cpu.chunk_sizes()[i] = static_cast<size_t>(size);
  }

  // Compute compression ratio
  size_t* compressed_sizes_host = compressed_data_cpu.chunk_sizes();
  size_t comp_bytes =
      std::accumulate(compressed_sizes_host, compressed_sizes_host + batch_size, size_t(0));
  std::cout << "comp_size: " << comp_bytes
            << ", compression ratio: " << std::fixed << std::setprecision(2)
            << (double)total_bytes / comp_bytes << std::endl;

  // Configure the GPU decompressor
  using lz4_decompressor_type =
      decltype(Algorithm<algorithm::lz4>() +
               DataType<datatype::uint8>() +
               MaxUncompChunkSize<chunk_size>() +
               Direction<direction::decompress>() +
               Warp() +
               SM<Arch>());

  // Runtime decompression parameters
  const auto block_count =
      static_cast<unsigned int>((batch_size + num_chunks_per_block - 1) / num_chunks_per_block);

  // Global scratch buffer
  // Note: lz4 decompression requires no global scratch buffer
  static_assert(lz4_decompressor_type().tmp_size_group() == 0);

  // Shared scratch buffer
  const auto decomp_shared_memory =
      static_cast<unsigned int>(lz4_decompressor_type().shmem_size_group() * num_chunks_per_block);

  // Copy compressed data to GPU
  BatchData compressed_data(compressed_data_cpu, true, lz4_decompressor_type().input_alignment());

  // Allocate and build up decompression batch on GPU
  BatchData decomp_data(input_data_cpu, false, lz4_decompressor_type().output_alignment());
  // Create an NVRTC program out of the string-defined kernel
  nvrtcProgram program;
  NVRTC_CHECK(nvrtcCreateProgram(&program,
                                 decomp_kernel,
                                 NULL /* CUDA program name */,
                                 0    /* numHeaders */,
                                 NULL /* headers */,
                                 NULL /* includeNames */));

  // Prepare compilation options
  CUdevice cuDevice;
  CU_CHECK(cuInit(0 /* flags */));
  CU_CHECK(cuDeviceGet(&cuDevice, 0 /* by default using the first device */));
  const auto gpu_architecture_option = get_device_architecture_option(cuDevice);
  std::vector<const char*> opts = {
    "--std=c++17",
    "--device-as-default-execution-space",
    "--include-path=" CUDAToolkit_INCLUDE_DIR,             // Path to the CUDA include directory
    "--include-path=" CUDAToolkit_INCLUDE_DIR "/cuda/std", // Path to standard headers
    "-dlto",
    "-rdc=true",
#ifdef NVCOMPDX_DISABLE_CUTLASS
    "-DNVCOMPDX_DISABLE_CUTLASS",
#endif // NVCOMPDX_DISABLE_CUTLASS
    gpu_architecture_option.c_str()
  };
  auto opt_convert_define = [](const auto& s1, const auto& s2) {
    return std::string("-D") + s1 + std::string("=") + s2;
  };

  // Compiler definitions supplying the placeholders used in the kernel string
  std::vector<std::string> comp_config_values = {
    opt_convert_define("ALGORITHM", "lz4"),
    opt_convert_define("DATATYPE", "uint8"),
    opt_convert_define("MAX_UNCOMP_CHUNK_SIZE", std::to_string(chunk_size)),
    opt_convert_define("ARCH", std::to_string(Arch))
  };
  for (auto& config : comp_config_values) {
    opts.push_back(config.c_str());
  }

  // Include folder paths
  std::vector<std::string> comp_include_dirs = get_comp_include_dirs();
  for (auto& d : comp_include_dirs) {
    opts.push_back(d.c_str());
  }
  // Compile kernel via NVRTC
  nvrtcResult compileResult = nvrtcCompileProgram(program,
                                                  static_cast<int>(opts.size()),
                                                  opts.data());
  if (compileResult != NVRTC_SUCCESS) {
    // Obtain the compilation log from the program if unsuccessful
    for (auto option : opts) {
      std::cout << option << std::endl;
    }
    print_nvrtc_program_log(std::cerr, program);
    std::exit(1);
  }

  // Obtain generated LTO IR from the program
  size_t lto_size;
  NVRTC_CHECK(nvrtcGetLTOIRSize(program, &lto_size));
  auto ltoir = std::make_unique<char[]>(lto_size);
  NVRTC_CHECK(nvrtcGetLTOIR(program, ltoir.get()));
  NVRTC_CHECK(nvrtcDestroyProgram(&program));
  // Create a CUDA context on the selected device
  CUcontext context;
  CU_CHECK(cuCtxCreate(&context, 0 /* flags */, cuDevice));

  // Link the generated LTO IR with the static nvCOMPDx LTO library
  nvJitLinkHandle linker;
  std::vector<const char*> lopts;
  lopts.emplace_back("-lto");
  lopts.emplace_back(gpu_architecture_option.c_str());
  NVJITLINK_CHECK(linker, nvJitLinkCreate(&linker,
                                          static_cast<uint32_t>(lopts.size()),
                                          lopts.data()));

  // Add the runtime-compiled kernel LTO IR
  NVJITLINK_CHECK(linker,
    nvJitLinkAddData(linker, NVJITLINK_INPUT_LTOIR, ltoir.get(), lto_size, "lto_online"));

  // Add the nvCOMPDx LTO library or the nvCOMPDx fatbinary
  const char* fatbin_env_ptr  = std::getenv("NVCOMPDX_EXAMPLE_NVCOMPDX_FATBIN");
  const char* library_env_ptr = std::getenv("NVCOMPDX_EXAMPLE_NVCOMPDX_LIBRARY");
  if (fatbin_env_ptr) {
    NVJITLINK_CHECK(linker, nvJitLinkAddFile(linker, NVJITLINK_INPUT_FATBIN, fatbin_env_ptr));
  } else if (library_env_ptr) {
    NVJITLINK_CHECK(linker, nvJitLinkAddFile(linker, NVJITLINK_INPUT_LIBRARY, library_env_ptr));
  } else {
#if defined(NVCOMPDX_FATBIN)
    NVJITLINK_CHECK(linker, nvJitLinkAddFile(linker, NVJITLINK_INPUT_FATBIN, NVCOMPDX_FATBIN));
#elif defined(NVCOMPDX_LIBRARY)
    NVJITLINK_CHECK(linker, nvJitLinkAddFile(linker, NVJITLINK_INPUT_LIBRARY, NVCOMPDX_LIBRARY));
#else
    std::cerr << "Please set one of the environment variables: "
              << "NVCOMPDX_EXAMPLE_NVCOMPDX_LIBRARY or "
              << "NVCOMPDX_EXAMPLE_NVCOMPDX_FATBIN," << std::endl
              << "or define NVCOMPDX_LIBRARY or NVCOMPDX_FATBIN during compilation." << std::endl;
    return 1;
#endif
  }
  // Generate the cubin from the LTO IR sources
  NVJITLINK_CHECK(linker, nvJitLinkComplete(linker));

  // Acquire the cubin
  size_t cubin_size;
  NVJITLINK_CHECK(linker, nvJitLinkGetLinkedCubinSize(linker, &cubin_size));
  auto cubin = std::make_unique<char[]>(cubin_size);
  NVJITLINK_CHECK(linker, nvJitLinkGetLinkedCubin(linker, cubin.get()));
  NVJITLINK_CHECK(linker, nvJitLinkDestroy(&linker));

  // Load the cubin and get a handle to our kernel
  CUmodule module;
  CUfunction kernel;
  CU_CHECK(cuModuleLoadDataEx(&module, cubin.get(), 0 /* numOptions */, NULL, NULL));
  CU_CHECK(cuModuleGetFunction(&kernel, module, "decomp_warp_kernel"));

  // Set dynamic shared memory needs
  CU_CHECK(cuFuncSetAttribute(kernel,
                              CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                              decomp_shared_memory));
  // Start with the actual decompression
  auto comp_chunks = compressed_data.chunk_ptrs();
  auto uncomp_chunks = decomp_data.chunk_ptrs();
  auto comp_chunk_sizes = compressed_data.chunk_sizes();
  auto decomp_chunk_sizes = decomp_data.chunk_sizes();
  void* args[] = {
    &batch_size,
    &comp_chunks,
    &uncomp_chunks,
    &comp_chunk_sizes,
    &decomp_chunk_sizes
  };
  CU_CHECK(cuLaunchKernel(kernel,
                          block_count /* gridDimX */,
                          1 /* gridDimY */,
                          1 /* gridDimZ */,
                          block_size /* blockDimX */,
                          1 /* blockDimY */,
                          1 /* blockDimZ */,
                          decomp_shared_memory,
                          NULL /* hStream */,
                          args,
                          NULL /* extra */));
  CU_CHECK(cuCtxSynchronize());

  // Validate decompressed data against input
  if (decomp_data != input_data_cpu) {
    throw std::runtime_error("Failed to validate decompressed data");
  } else {
    std::cout << "decompression validated :)" << std::endl;
  }

  return 0;
}
template<unsigned int Arch>
struct Runner {
  template<typename... Args>
  static int run(Args&&... args)
  {
    return run_nvrtc_example<Arch>(std::forward<Args>(args)...);
  }
};
int main(int argc, char* argv[])
{
  std::vector<std::string> file_names;
  do {
    if (argc < 3) {
      break;
    }
    int i = 1;
    while (i < argc) {
      const char* current_argv = argv[i++];
      if (strcmp(current_argv, "-f") == 0) {
        // parse until the next `-` argument
        while (i < argc && argv[i][0] != '-') {
          file_names.emplace_back(argv[i++]);
        }
      } else {
        std::cerr << "Unknown argument: " << current_argv << std::endl;
        return 1;
      }
    }
  } while (0);

  if (file_names.empty()) {
    std::cerr << "Must specify at least one file via '-f <file>'." << std::endl;
    return 1;
  }

  auto data = multi_file(file_names);
  return run_with_current_arch<Runner>(data);
}
```
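Once built, the example expects one or more input files passed via `-f`; assuming the binary is named after its folder, an invocation could look like `./05_lz4_cpu_and_nvrtc -f data1.bin data2.bin`. On success the program prints the chunk count, the compression ratio, and a validation message confirming that the GPU-decompressed data matches the original input.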