Quick start

This section contains a simplified and annotated version of the cuFFT LTO EA sample distributed alongside the binaries in the zip file.

The sample performs a low-pass filter of multiple signals in the frequency domain.

/* 
 * Example showing the use of LTO callbacks with CUFFT to perform 
 * R2C -> callback -> C2R.
 * 
*/

#include <vector>
#include <random>
#include <cuda_runtime_api.h>
#include <cufftXt.h>

#define ERROR_VALUE -1
#define PASS_VALUE   0

// Check CUDA API error
inline int checkErrors(cudaError_t error, int line_number) {
	if(error != cudaSuccess) {
		printf("Example failed in CUDA API on line %d with error %d\n", line_number, error);
		return ERROR_VALUE;
	}
	return PASS_VALUE;
}

// Check cuFFT API error
inline int checkErrors(cufftResult error, int line_number) {
	if(error != CUFFT_SUCCESS) {
		printf("Example failed in cuFFT API on line %d with error %d\n", line_number, error);
		return ERROR_VALUE;
	}
	return PASS_VALUE;
}
#define CHECK_ERROR(error) checkErrors(error, __LINE__)

// NOTE: Header containing the compiled LTO callback device function in a C array, generated with bin2c
#include "callback_fatbin.h"

// Struct to pass data to callback
struct cb_params {
  unsigned window_N;
  unsigned signal_size;
};

// Problem input parameters
constexpr unsigned batches              = 830;
constexpr unsigned signal_size          = 328;
constexpr unsigned window_size          =  32;
constexpr unsigned complex_signal_size  = signal_size / 2 + 1;

// Initialize the input signal with random values
void init_input_signals(unsigned batches, unsigned signal_size, float* signals) {

	std::mt19937 e2(0);
	std::uniform_real_distribution<> dist(0., 1.);

	for(unsigned batch = 0; batch < batches; ++batch) {
		for(unsigned s = 0; s < signal_size; ++s) {
			unsigned idx = batch * signal_size + s;
			signals[idx] = dist(e2);
		}
	}
}

int main() {

	// Padded array for in-place transforms
	float  input_signals[batches][2 * complex_signal_size] = {};
	float output_signals[batches][2 * complex_signal_size];
	float      reference[batches][2 * complex_signal_size];

	init_input_signals(batches, 2 * complex_signal_size, &input_signals[0][0]);

	const size_t complex_size_bytes = batches * complex_signal_size * 2 * sizeof(float);

	// Allocate and copy input from host to GPU
	float *device_signals;
	CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes));
	CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice));

	// Define a structure used to pass in the window size
	cb_params host_params;
	host_params.window_N    = window_size;
	host_params.signal_size = complex_signal_size;

	// Allocate and copy callback parameters from host to GPU
	cb_params *device_params;
	CHECK_ERROR(cudaMalloc((void **)&device_params, sizeof(cb_params)));
	CHECK_ERROR(cudaMemcpy(device_params, &host_params, sizeof(cb_params), cudaMemcpyHostToDevice));

	// Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform with load callback
	cufftHandle forward_plan, inverse_plan_cb;
	size_t work_size;

	CHECK_ERROR(cufftCreate(&forward_plan));
	CHECK_ERROR(cufftCreate(&inverse_plan_cb));

	// NOTE: LTO callbacks must be set before plan creation and cannot be unset (yet)
	size_t lto_callback_fatbin_size = sizeof(window_callback);
	CHECK_ERROR(cufftXtSetJITCallback(inverse_plan_cb, (void*)window_callback, lto_callback_fatbin_size,
				          CUFFT_CB_LD_COMPLEX, (void **)&device_params));

	CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size));
	CHECK_ERROR(cufftMakePlan1d(inverse_plan_cb, signal_size, CUFFT_C2R, batches, &work_size));

	// Transform signal forward
	printf("Transforming signal cufftExecR2C\n");
	CHECK_ERROR(cufftExecR2C(forward_plan,    (cufftReal *)device_signals, (cufftComplex *)device_signals));

	// Apply window via load callback and inverse-transform the signal
	printf("Transforming signal cufftExecC2R\n");
	CHECK_ERROR(cufftExecC2R(inverse_plan_cb, (cufftComplex *)device_signals, (cufftReal *)device_signals));

	// Copy device memory to host
	CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost));

	// Destroy CUFFT context
	CHECK_ERROR(cufftDestroy(forward_plan));
	CHECK_ERROR(cufftDestroy(inverse_plan_cb));

	// Cleanup memory
	CHECK_ERROR(cudaFree(device_signals));
	CHECK_ERROR(cudaFree(device_params));

	return PASS_VALUE;
}

Specifically, it does the following:

  1. It allocates and initializes the input: batches of signals of size signal_size are initialized with random values.

  2. It copies the input data to the GPU.

  3. It creates a forward (R2C, Real-To-Complex) plan and an inverse (C2R, Complex-To-Real) plan.

  4. After plan creation (with cufftCreate(…)) but before the planning function is called, it associates the array containing the fatbin with the callback function with the plan using the extension to the cuFFT API cufftXtSetJITCallback(…).

  5. It calls the planning function (cufftMakePlan1d(…)) for both plans.

  6. It executes the forward plan.

  7. It executes the inverse plan. This plan runs the loading callback that implements the low-pass filter as a window function.

  8. It copies the results back to the host.

  9. It destroys the plans with cufftDestroy(…), and frees the GPU resources.

The callback device function is located in a separate source file that looks like this:

/* 
 * Example showing the use of LTO callbacks with CUFFT to perform 
 * truncation with zero padding.
 * 
*/

#include <cufftXt.h>

struct cb_params {
	unsigned window_N;
	unsigned signal_size;
};

// This is the load callback routine. It filters high frequencies
// based on a truncation window specified by the user
// NOTE: unlike the non-LTO version, the callback device function
// must have the name cufftJITCallbackLoadComplex, it cannot be aliased
__device__ cufftComplex cufftJITCallbackLoadComplex(void *input,
                                                    size_t index,
                                                    void *info,
                                                    void *sharedmem) {
  
	const cb_params* params = static_cast<const cb_params*>(info);
	cufftComplex* cb_output = static_cast<cufftComplex*>(input);
	const unsigned sample   = index % params->signal_size;

	return (sample < params->window_N) ? cb_output[index] : cufftComplex{0.f, 0.f};
}

There are some restrictions when it comes to naming the LTO-callback functions in the cuFFT LTO EA. See here for more details.

Before compiling the example, we need to copy the library files and headers included in the tar ball into the CUDA Toolkit folder.

$ cp nvidia-cufft-11.1.0-Linux/opt/cufft/include/* /path/to/cuda/toolkit/include
$ cp nvidia-cufft-11.1.0-Linux/opt/cufft/lib/* /path/to/cuda/toolkit/lib64

The example can then be compiled and run like this:

$ nvcc --std=c++11 --generate-code arch=compute_50,code=lto_50 -dc -fatbin callback.cu -o callback.fatbin
$ bin2c --name window_callback --type longlong callback.fatbin > callback_fatbin.h
$ g++ -I /path/to/cuda/toolkit/include -L /path/to/cuda/toolkit/lib64 lto_ea.cpp -o lto_ea -lcufft -lcudart
$ ./lto_ea
Transforming signal cufftExecR2C
Transforming signal cufftExecC2R