When an iteration count greater than one is supplied, the sample benchmarks both modes and prints timing statistics so you can compare the overhead of manual synchronization against the host-function approach.
When run with an iteration count, timing statistics (min, max, mean, median, standard deviation) are printed for each mode.
For convenience, here's the code that is also installed in the samples directory.
6 #include <opencv2/core/version.hpp>
7 #include <opencv2/opencv.hpp>
14 #if CV_MAJOR_VERSION >= 3
15 # include <opencv2/imgcodecs.hpp>
17 # include <opencv2/highgui/highgui.hpp>
24 #include <cuda_runtime.h>
41 #define CHECK_VPI_STATUS(STMT) \
44 VPIStatus status = (STMT); \
45 if (status != VPI_SUCCESS) \
47 char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH]; \
48 vpiGetLastStatusMessage(buffer, sizeof(buffer)); \
49 std::ostringstream ss; \
50 ss << vpiStatusGetName(status) << ": " << buffer; \
51 throw std::runtime_error(ss.str()); \
55 #define CHECK_CUDA_STATUS(STMT) \
58 cudaError_t status = (STMT); \
59 if (status != cudaSuccess) \
61 std::ostringstream ss; \
62 ss << cudaGetErrorString(status); \
63 throw std::runtime_error(ss.str()); \
67 #define CHECK_NPP_STATUS(STMT) \
70 NppStatus status = (STMT); \
71 if (status != NPP_SUCCESS) \
73 std::ostringstream ss; \
75 throw std::runtime_error(ss.str()); \
79 enum class PipelineMode
81 CUDA_HOST_FUNCTION_MODE,
85 static void printBenchmarkStats(std::vector<int64_t> data)
87 size_t n = data.size();
88 std::cout << std::fixed << std::setprecision(2);
89 std::cout <<
"Input Data Size: " << n << std::endl;
90 std::cout <<
"\nStatistics (in microseconds):" << std::endl;
94 int64_t min_val = *std::min_element(data.begin(), data.end());
95 int64_t max_val = *std::max_element(data.begin(), data.end());
96 std::cout <<
" Min Value: " << min_val << std::endl;
97 std::cout <<
" Max Value: " << max_val << std::endl;
101 int64_t sum = std::accumulate(data.begin(), data.end(), int64_t{0});
102 double mean = ((double)sum / n);
103 std::cout <<
" Sum: " << (double)sum << std::endl;
104 std::cout <<
" Mean: " << mean << std::endl;
108 std::vector<int64_t> sorted_data = data;
109 std::sort(sorted_data.begin(), sorted_data.end());
114 median = (double)sorted_data[n / 2];
119 int64_t mid1 = sorted_data[n / 2 - 1];
120 int64_t mid2 = sorted_data[n / 2];
121 median = (double)(mid1 + mid2) / 2.0;
123 std::cout <<
" Median: " << median << std::endl;
127 long double variance_sum = 0.0;
128 for (int64_t val : data)
130 long double diff = (
long double)val - mean;
131 variance_sum += diff * diff;
133 double std_dev = (double)std::sqrt(variance_sum / n);
134 std::cout <<
" Standard Deviation: " << std_dev << std::endl;
154 struct CudaHostFnData
156 uint8_t *cudaGrayImg;
157 uint8_t *cudaBlurredImg;
158 uint8_t *cudaBorderedImg;
162 NppStreamContext nppCtx;
166 static void nppBlurAndBorderCallback(cudaStream_t cudaStream,
void *userData)
168 CudaHostFnData *data =
static_cast<CudaHostFnData *
>(userData);
169 data->nppCtx.hStream = cudaStream;
170 cudaStreamGetFlags(cudaStream, &data->nppCtx.nStreamFlags);
172 CHECK_NPP_STATUS(nppiFilterGaussBorder_8u_C1R_Ctx(
173 data->cudaGrayImg, data->pitchBytes, {data->width, data->height}, {0, 0}, data->cudaBlurredImg,
174 data->pitchBytes, {data->width, data->height}, NPP_MASK_SIZE_3_X_3, NPP_BORDER_REPLICATE, data->nppCtx));
176 submitBorderChange(data->cudaBlurredImg, data->cudaBorderedImg,
static_cast<uint32_t
>(data->width),
177 static_cast<uint32_t
>(data->height), data->borderPixels, cudaStream);
180 static std::vector<int64_t> runPipeline(
const std::string &filename, cv::Mat &output, PipelineMode mode,
int numIters,
181 bool writeOutputOnFinalIteration)
200 cudaStream_t cudaStream =
nullptr;
202 if (mode == PipelineMode::CUDA_HOST_FUNCTION_MODE)
208 CHECK_CUDA_STATUS(cudaStreamCreate(&cudaStream));
212 NppStreamContext nppCtx;
213 nppCtx.hStream = cudaStream;
214 CHECK_CUDA_STATUS(cudaGetDevice(&nppCtx.nCudaDeviceId));
215 cudaDeviceProp gpuProps;
216 CHECK_CUDA_STATUS(cudaGetDeviceProperties(&gpuProps, nppCtx.nCudaDeviceId));
217 nppCtx.nMultiProcessorCount = gpuProps.multiProcessorCount;
218 nppCtx.nMaxThreadsPerMultiProcessor = gpuProps.maxThreadsPerMultiProcessor;
219 nppCtx.nMaxThreadsPerBlock = gpuProps.maxThreadsPerBlock;
220 nppCtx.nSharedMemPerBlock = gpuProps.sharedMemPerBlock;
221 nppCtx.nCudaDevAttrComputeCapabilityMajor = gpuProps.major;
222 nppCtx.nCudaDevAttrComputeCapabilityMinor = gpuProps.minor;
223 if (cudaStream !=
nullptr)
225 CHECK_CUDA_STATUS(cudaStreamGetFlags(cudaStream, &nppCtx.nStreamFlags));
229 nppCtx.nStreamFlags = 0;
232 cv::Mat bgrImg, cvImage;
233 bgrImg = cv::imread(filename);
234 if (bgrImg.channels() == 3)
236 cv::cvtColor(bgrImg, cvImage, cv::COLOR_BGR2BGRA);
242 int width = cvImage.cols;
243 int height = cvImage.rows;
245 VPIImage input, gray, bordered, vpiOutput;
249 int pitchBytes =
sizeof(uint8_t) * width;
250 uint8_t *cudaGrayImg, *cudaBlurredImg, *cudaBorderedImg;
251 CHECK_CUDA_STATUS(cudaMalloc(&cudaGrayImg, pitchBytes * height));
252 CHECK_CUDA_STATUS(cudaMalloc(&cudaBlurredImg, pitchBytes * height));
253 CHECK_CUDA_STATUS(cudaMalloc(&cudaBorderedImg, pitchBytes * height));
256 VPIImageData borderedData = getImgData(width, height,
static_cast<VPIByte *
>(cudaBorderedImg));
260 std::vector<int64_t> timings;
261 timings.reserve(
static_cast<size_t>(numIters));
263 for (
int iter = 0; iter < numIters; ++iter)
265 auto start = std::chrono::high_resolution_clock::now();
267 if (mode == PipelineMode::CUDA_HOST_FUNCTION_MODE)
273 CudaHostFnData hostFnData = {cudaGrayImg, cudaBlurredImg, cudaBorderedImg, width,
274 height, pitchBytes, nppCtx, 50};
288 CHECK_NPP_STATUS(nppiFilterGaussBorder_8u_C1R_Ctx(cudaGrayImg, pitchBytes, {width, height}, {0, 0},
289 cudaBlurredImg, pitchBytes, {width, height},
290 NPP_MASK_SIZE_3_X_3, NPP_BORDER_REPLICATE, nppCtx));
292 submitBorderChange(cudaBlurredImg, cudaBorderedImg, width, height, 50, cudaStream);
300 auto end = std::chrono::high_resolution_clock::now();
301 timings.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - start).count());
303 const bool isFinalIteration = (iter == numIters - 1);
304 if (isFinalIteration && writeOutputOnFinalIteration)
309 cv::Mat cvOut(height, width, CV_8U, outDataPitch.
planes[0].
pBase);
310 output = cvOut.clone();
319 cudaFree(cudaGrayImg);
320 cudaFree(cudaBlurredImg);
321 cudaFree(cudaBorderedImg);
323 if (cudaStream !=
nullptr)
325 cudaStreamDestroy(cudaStream);
330 int main(
int argc,
char *argv[])
337 if (argc < 2 || argc > 3)
339 throw std::runtime_error(std::string(
"Usage: ") + argv[0] +
" <input image> [iteration_count]");
342 const std::string filename = argv[1];
343 int iterationCount = 1;
346 iterationCount = std::atoi(argv[2]);
347 if (iterationCount < 1)
349 throw std::runtime_error(
"iteration_count must be >= 1.");
356 cv::Mat hostFnOutput;
359 if (iterationCount > 1)
361 std::cout <<
"Benchmark: CUDA host function mode (" << iterationCount <<
" iterations)" << std::endl;
362 std::vector<int64_t> hostFnTimings =
363 runPipeline(filename, hostFnOutput, PipelineMode::CUDA_HOST_FUNCTION_MODE, iterationCount,
false);
364 std::cout <<
"--- CUDA host function mode ---" << std::endl;
365 printBenchmarkStats(hostFnTimings);
366 std::cout << std::endl;
368 std::cout <<
"Benchmark: Sync mode (" << iterationCount <<
" iterations)" << std::endl;
369 std::vector<int64_t> syncTimings =
370 runPipeline(filename, syncOutput, PipelineMode::SYNC_MODE, iterationCount,
false);
371 std::cout <<
"--- Sync mode ---" << std::endl;
372 printBenchmarkStats(syncTimings);
373 std::cout << std::endl;
376 runPipeline(filename, hostFnOutput, PipelineMode::CUDA_HOST_FUNCTION_MODE, 1,
true);
377 runPipeline(filename, syncOutput, PipelineMode::SYNC_MODE, 1,
true);
379 if (hostFnOutput.size() != syncOutput.size() || hostFnOutput.type() != syncOutput.type())
381 throw std::runtime_error(
"FAIL: CUDA host function and sync outputs differ in size or type.");
385 cv::absdiff(hostFnOutput, syncOutput, diff);
386 int numDiffPixels = cv::countNonZero(diff);
387 if (numDiffPixels != 0)
389 std::ostringstream ss;
390 ss <<
"FAIL: Outputs differ (" << numDiffPixels <<
" pixels).";
391 throw std::runtime_error(ss.str());
394 std::cout <<
"PASS: CUDA host function and sync mode outputs are identical." << std::endl;
396 cv::imwrite(
"multi_backend_cuda_hostfn_pipelined.png", hostFnOutput);
397 cv::imwrite(
"multi_backend_sync_pipelined.png", syncOutput);
399 catch (std::exception &e)
401 std::cerr << e.what() << std::endl;
413 std::_Exit(EXIT_SUCCESS);
Functions and structures for handling CUDA interoperability with VPI.
Functions and structures for dealing with VPI contexts.
Declares functions that implement Image flip algorithms.
Functions for handling OpenCV interoperability with VPI.
#define VPI_PIXEL_TYPE_INVALID
Signal format conversion errors.
Declares functions dealing with VPI streams.
unsigned char VPIByte
Definition of a byte type.
@ VPI_FLIP_VERT
Flip vertically.
VPIStatus vpiSubmitCUDAHostFunction(VPIStream stream, VPICUDAHostFunction hostFunc, void *userData)
Submits a CUDA host function to run on the stream in submission order.
VPIStatus vpiContextCreate(uint64_t flags, VPIContext *ctx)
Create a context instance.
VPIStatus vpiContextSetCurrent(VPIContext ctx)
Sets the context for the calling thread.
void vpiContextDestroy(VPIContext ctx)
Destroy a context instance as well as all resources it owns.
struct VPIContextImpl * VPIContext
A handle to a context.
VPIStatus vpiSubmitImageFlip(VPIStream stream, uint64_t backend, VPIImage input, VPIImage output, VPIFlipMode flipMode)
Flips a 2D image either horizontally, vertically or both.
VPIImageBuffer buffer
Stores the image contents.
VPIImagePlanePitchLinear planes[VPI_MAX_PLANE_COUNT]
Data of all image planes in pitch-linear layout.
VPIImageBufferPitchLinear pitch
Image stored in pitch-linear layout.
int32_t numPlanes
Number of planes.
VPIImageFormat format
Image format.
VPIPixelType pixelType
Type of each pixel within this plane.
VPIImageBufferType bufferType
Type of image buffer.
int64_t offsetBytes
Offset in bytes from pBase to the first column of the first plane row.
int32_t height
Height of this plane in pixels.
VPIByte * pBase
Pointer to the memory buffer which contains the plane data.
int32_t width
Width of this plane in pixels.
int32_t pitchBytes
Difference in bytes of beginning of one row and the beginning of the previous.
void vpiImageDestroy(VPIImage img)
Destroy an image instance.
struct VPIImageImpl * VPIImage
A handle to an image.
VPIStatus vpiImageCreateWrapper(const VPIImageData *data, const VPIImageWrapperParams *params, uint64_t flags, VPIImage *img)
Create an image object by wrapping an existing memory block.
VPIStatus vpiImageLockData(VPIImage img, VPILockMode mode, VPIImageBufferType bufType, VPIImageData *data)
Acquires the lock on an image object and returns the image contents.
VPIStatus vpiImageCreate(int32_t width, int32_t height, VPIImageFormat fmt, uint64_t flags, VPIImage *img)
Create an empty image instance with the specified flags.
VPIStatus vpiImageUnlock(VPIImage img)
Releases the lock on an image object.
@ VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR
CUDA-accessible with planes in pitch-linear memory layout.
@ VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR
Host-accessible with planes in pitch-linear memory layout.
Stores the image plane contents.
Stores information about image characteristics and content.
VPIStatus vpiImageCreateWrapperOpenCVMat(const cv::Mat &mat, VPIImageFormat fmt, uint64_t flags, VPIImage *img)
Wraps a cv::Mat in an VPIImage with the given image format.
struct VPIStreamImpl * VPIStream
A handle to a stream.
VPIStatus vpiStreamSync(VPIStream stream)
Blocks the calling thread until all submitted commands in this stream queue are done (queue is empty)...
void vpiStreamDestroy(VPIStream stream)
Destroy a stream instance and deallocate all HW resources.
VPIStatus vpiStreamCreateWrapperCUDA(CUstream cudaStream, uint64_t flags, VPIStream *stream)
Wraps an existing cudaStream_t into a VPI stream.
VPIStatus vpiStreamCreate(uint64_t flags, VPIStream *stream)
Create a stream instance.
@ VPI_BACKEND_CUDA
CUDA backend.
@ VPI_BACKEND_VIC
VIC backend.
@ VPI_BACKEND_CPU
CPU backend.
@ VPI_INTERP_NEAREST
Nearest neighbor interpolation.
@ VPI_LOCK_READ
Lock memory only for reading.