VPI - Vision Programming Interface

1.2 Release

KLT Bounding Box Tracker

Overview

This application tracks bounding boxes on an input video, draws them on each frame, and saves the result to a video file. The user can choose which backend will be used for processing.

Note
The output will be in grayscale as the algorithm currently doesn't support color inputs.
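In a nutshell, the sample wraps each grayscale frame as a VPI image and feeds it to a KLT tracker object created from the first frame and the input boxes. Below is a minimal sketch of that flow using the same Python API calls as the full sample further down; the synthetic frames and the box position are illustrative placeholders, not part of the sample itself.

import numpy as np
import vpi

# Two synthetic 8-bit grayscale frames standing in for real video frames.
frames = [np.zeros((480, 640), dtype=np.uint8) for _ in range(2)]

# One bounding box to track, set up the same way as in the full sample.
inBoxes = vpi.Array(1, vpi.Type.KLT_TRACKED_BOUNDING_BOX)
inBoxes.size = 1
with inBoxes.lock(vpi.Lock.WRITE):
    data = inBoxes.cpu().view(np.recarray)
    data[0].bbox.xform.mat3[0, 0] = 1    # identity scaling
    data[0].bbox.xform.mat3[1, 1] = 1
    data[0].bbox.xform.mat3[2, 2] = 1
    data[0].bbox.xform.mat3[0, 2] = 100  # top-left x (illustrative)
    data[0].bbox.xform.mat3[1, 2] = 80   # top-left y (illustrative)
    data[0].bbox.width = 32
    data[0].bbox.height = 32
    data[0].tracking_status = vpi.KLTTrackStatus.TRACKED
    data[0].template_status = vpi.KLTUpdateStatus.NEEDED

# The first frame is the template; each following frame is tracked against it.
klt = vpi.KLTFeatureTracker(vpi.asimage(frames[0]), inBoxes, backend=vpi.Backend.CUDA)

for frame in frames[1:]:
    outBoxes = klt(vpi.asimage(frame))  # returns the updated boxes for this frame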

Instructions

The usage is:

./vpi_sample_06_klt_tracker <backend> <input video> <input bboxes>

where

  • backend: either cpu, cuda or pva; it defines the backend that will perform the processing.
  • input video: input video file name; it accepts all video types that OpenCV's cv::VideoCapture accepts.
  • input bboxes: file with the input bounding boxes and the frame in which each first appears. The file is composed of multiple lines with the following format (see the example after this list):
       <frame> <bbox_x> <bbox_y> <bbox_width> <bbox_height>
    It's important that the lines are sorted with frames in ascending order.
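For example, a file with the following two lines starts tracking one box at frame 0 and another at frame 61. The values in the first line are illustrative; the second line is the example used in the sample's own comments:

0 319 229 50 42
61 547 337 14 11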

Here's one example:

./vpi_sample_06_klt_tracker cuda ../assets/dashcam.mp4 ../assets/dashcam_bboxes.txt

This uses the CUDA backend with one of the provided sample videos and its bounding boxes. It renders the tracked bounding boxes into klt_cuda.mp4.

Note
If using OpenCV-2.4 or older (e.g. on Ubuntu 16.04), the output file is klt_cuda.avi.

Results

Tracking Result
Note
Video output requires an HTML5-capable browser that supports H.264 MP4 video decoding.

Source Code

For convenience, here's the code that is also installed in the samples directory.

Language: Python

from __future__ import print_function

import sys
from argparse import ArgumentParser
import numpy as np
import cv2
import vpi


# Convert a colored input frame to grayscale (if needed)
# and then, if using the PVA backend, convert it to 16-bit unsigned pixels.
# The converted frame is copied before wrapping it as a VPI image so
# later drawing on the gray frame does not change the referenced VPI image.
def convertFrameImage(inputFrame, backend):
    if inputFrame.ndim == 3 and inputFrame.shape[2] == 3:
        grayFrame = cv2.cvtColor(inputFrame, cv2.COLOR_BGR2GRAY)
    else:
        grayFrame = inputFrame
    if backend == vpi.Backend.PVA:
        # PVA only supports 16-bit unsigned inputs,
        # where each element is in the 0-255 range, so
        # no rescaling is needed.
        grayFrame = grayFrame.astype(np.uint16)
    grayImage = vpi.asimage(grayFrame.copy())
    return grayFrame, grayImage


# Write the input gray frame to the output video with the
# input bounding boxes and predictions drawn on it
def writeOutput(outVideo, cvGray, inBoxes, inPreds, colors, backend):
    try:
        if cvGray.dtype == np.uint16:
            cvGray = cvGray.astype(np.uint8)
        if cvGray.dtype != np.uint8:
            raise Exception('Input frame format must be grayscale, 8-bit unsigned')
        cvGrayBGR = cv2.cvtColor(cvGray, cv2.COLOR_GRAY2BGR)

        # Tracks the number of valid bounding boxes in the current frame
        numValidBoxes = 0

        # Draw the input bounding boxes considering the input predictions
        with inBoxes.lock(vpi.Lock.READ), inPreds.lock(vpi.Lock.READ):
            # Arrays of bounding boxes (bbox) and predictions (pred)
            bbox = inBoxes.cpu().view(np.recarray)
            pred = inPreds.cpu()

            for i in range(inBoxes.size):
                if bbox[i].tracking_status == vpi.KLTTrackStatus.LOST:
                    # If the tracking status of the current bounding box is lost, skip it
                    continue

                # Gather information of the current (i) bounding box and prediction:
                # prediction scaling width, height and x, y
                predScaleWidth = pred[i][0, 0]
                predScaleHeight = pred[i][1, 1]
                predX = pred[i][0, 2]
                predY = pred[i][1, 2]

                # Bounding box scaling width, height, its x, y, and its width, height
                bboxScaleWidth = bbox[i].bbox.xform.mat3[0, 0]
                bboxScaleHeight = bbox[i].bbox.xform.mat3[1, 1]
                bboxX = bbox[i].bbox.xform.mat3[0, 2]
                bboxY = bbox[i].bbox.xform.mat3[1, 2]
                bboxWidth = bbox[i].bbox.width
                bboxHeight = bbox[i].bbox.height

                # Compute the corrected x, y by adding the bounding box and
                # prediction offsets, and the corrected width, height (w, h) by
                # multiplying the bounding box w, h by its own scaling and the
                # prediction scaling
                x = bboxX + predX
                y = bboxY + predY
                w = bboxWidth * bboxScaleWidth * predScaleWidth
                h = bboxHeight * bboxScaleHeight * predScaleHeight

                # Start point and end point of the bounding box for OpenCV drawing
                startPoint = tuple(np.array([x, y], dtype=int))
                endPoint = tuple(np.array([x, y], dtype=int) + np.array([w, h], dtype=int))

                # The color of the bounding box to be drawn
                bboxColor = tuple([ int(c) for c in colors[0, i] ])
                cv2.rectangle(cvGrayBGR, startPoint, endPoint, bboxColor, 2)

                # Increment the number of valid bounding boxes in the current frame
                numValidBoxes += 1

        print(' Valid: {:02d} boxes'.format(numValidBoxes))

        outVideo.write(cvGrayBGR)
    except Exception as e:
        print('Error while writing output video:\n', e, file=sys.stderr)
        exit(1)


# ----------------------------
# Parse command line arguments

parser = ArgumentParser()
parser.add_argument('backend', choices=['cpu','cuda','pva'],
                    help='Backend to be used for processing')

parser.add_argument('input',
                    help='Input video to be processed')

parser.add_argument('boxes',
                    help='Text file with bounding boxes description')

args = parser.parse_args()

if args.backend == 'cpu':
    backend = vpi.Backend.CPU
elif args.backend == 'cuda':
    backend = vpi.Backend.CUDA
else:
    assert args.backend == 'pva'
    backend = vpi.Backend.PVA

# -----------------------------
# Open input and output videos

inVideo = cv2.VideoCapture(args.input)

if int(cv2.__version__.split('.')[0]) >= 3:
    extOutputVideo = '.mp4'
    fourcc = cv2.VideoWriter_fourcc(*'avc1')
    inSize = (int(inVideo.get(cv2.CAP_PROP_FRAME_WIDTH)), int(inVideo.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    fps = inVideo.get(cv2.CAP_PROP_FPS)
else:
    # MP4 support with OpenCV-2.4 has issues, we'll use
    # avi/mpeg instead.
    extOutputVideo = '.avi'
    fourcc = cv2.cv.CV_FOURCC('M','P','E','G')
    inSize = (int(inVideo.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)), int(inVideo.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)))
    fps = inVideo.get(cv2.cv.CV_CAP_PROP_FPS)

outVideo = cv2.VideoWriter('klt_python'+str(sys.version_info[0])+'_'+args.backend+extOutputVideo,
                           fourcc, fps, inSize)

if not outVideo.isOpened():
    print("Error creating output video", file=sys.stderr)
    exit(1)

# -----------------------------
# Read input bounding boxes

# allBoxes is a dictionary of all bounding boxes to be tracked in the input video,
# where each value is a list of new bounding boxes to track at the frame indicated by its key
allBoxes = {}
totalNumBoxes = 0

# Array capacity 0 means the maximum number of bounding boxes is not restricted
arrayCapacity = 0

if backend == vpi.Backend.PVA:
    # PVA requires an array capacity (maximum number of bounding boxes) of 128
    arrayCapacity = 128

with open(args.boxes) as f:
    # The input file (f) should have one bounding box per line as:
    # "startFrame bboxX bboxY bboxWidth bboxHeight"; e.g.:
    # "61 547 337 14 11"
    for line in f.readlines():
        line = line.replace('\n', '').replace('\r', '')
        startFrame, x, y, w, h = [ float(v) for v in line.split(' ') ]
        bb = (x, y, w, h)
        if startFrame not in allBoxes:
            allBoxes[startFrame] = [bb]
        else:
            allBoxes[startFrame].append(bb)
        totalNumBoxes += 1
        if totalNumBoxes == arrayCapacity:
            # Stop adding boxes when their total number reaches the array capacity
            break

curFrame = 0
curNumBoxes = len(allBoxes[curFrame])

# ------------------------------------------------------------------------------
# Initialize the VPI array with all input bounding boxes (same as the C++ KLT sample)

if arrayCapacity == 0:
    arrayCapacity = totalNumBoxes

inBoxes = vpi.Array(arrayCapacity, vpi.Type.KLT_TRACKED_BOUNDING_BOX)

inBoxes.size = totalNumBoxes
with inBoxes.lock(vpi.Lock.WRITE):
    data = inBoxes.cpu().view(np.recarray)

    # Global index i over all bounding boxes data, starting at 0
    i = 0

    for f in sorted(allBoxes.keys()):
        for bb in allBoxes[f]:
            # Each bounding box bb is a tuple of (x, y, w, h) that can be unpacked
            x, y, w, h = bb

            # The scaling part of the bounding box transform is the identity,
            # meaning no scaling, and the offset part is its position x, y
            data[i].bbox.xform.mat3[0, 0] = 1
            data[i].bbox.xform.mat3[1, 1] = 1
            data[i].bbox.xform.mat3[2, 2] = 1
            data[i].bbox.xform.mat3[0, 2] = x
            data[i].bbox.xform.mat3[1, 2] = y

            # The bounding box data stores its width and height w, h
            data[i].bbox.width = w
            data[i].bbox.height = h

            # Initially all boxes have status tracked and update needed
            data[i].tracking_status = vpi.KLTTrackStatus.TRACKED
            data[i].template_status = vpi.KLTUpdateStatus.NEEDED

            # Increment the global index for the next bounding box
            i += 1

#-------------------------------------------------------------------------------
# Generate random colors for the bounding boxes, equal to the C++ KLT sample

hues = np.zeros((totalNumBoxes,), dtype=np.uint8)

if int(cv2.__version__.split('.')[0]) >= 3:
    cv2.setRNGSeed(1)
    hues = cv2.randu(hues, 0, 180)
else:
    # Random number generation differs in OpenCV-2.4
    rng = cv2.cv.RNG(1)
    hues = cv2.cv.fromarray(np.array([[ h for h in hues ]], dtype=np.uint8))
    cv2.cv.RandArr(rng, hues, cv2.cv.CV_RAND_UNI, 0, 180)
    hues = [ hues[0, i] for i in range(totalNumBoxes) ]

colors = np.array([[ [int(h), 255, 255] for h in hues ]], dtype=np.uint8)
colors = cv2.cvtColor(colors, cv2.COLOR_HSV2BGR)

#-------------------------------------------------------------------------------
# Initialize the KLT Feature Tracker algorithm

# Load the first frame
validFrame, cvFrame = inVideo.read()
if not validFrame:
    print("Error reading first input frame", file=sys.stderr)
    exit(1)

# Convert the OpenCV frame to gray, also returning the VPI image for the given backend
cvGray, imgTemplate = convertFrameImage(cvFrame, backend)

# Create the KLT Feature Tracker object using the backend specified by the user
klt = vpi.KLTFeatureTracker(imgTemplate, inBoxes, backend=backend)

#-------------------------------------------------------------------------------
# Main processing loop

while validFrame:
    print('Frame: {:04d} ; Total: {:02d} boxes ;'.format(curFrame, curNumBoxes), end='')

    # Adjust input boxes and predictions to the current number of boxes
    inPreds = klt.in_predictions()

    inPreds.size = curNumBoxes
    inBoxes.size = curNumBoxes

    # Write the current frame to the output video
    writeOutput(outVideo, cvGray, inBoxes, inPreds, colors, backend)

    # Read the next input frame
    curFrame += 1
    validFrame, cvFrame = inVideo.read()
    if not validFrame:
        break

    cvGray, imgReference = convertFrameImage(cvFrame, backend)

    outBoxes = klt(imgReference)

    if curFrame in allBoxes:
        curNumBoxes += len(allBoxes[curFrame])

outVideo.release()

# vim: ts=8:sw=4:sts=4:et:ai
Language: C++

#include <opencv2/core/version.hpp>
#if CV_MAJOR_VERSION >= 3
#    include <opencv2/imgcodecs.hpp>
#    include <opencv2/videoio.hpp>
#else
#    include <opencv2/highgui/highgui.hpp>
#endif

#include <opencv2/imgproc/imgproc.hpp>
#include <vpi/OpenCVInterop.hpp>

#include <vpi/Array.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/KLTFeatureTracker.h>

#include <cassert>
#include <cstring> // for memset
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>

#define CHECK_STATUS(STMT)                                    \
    do                                                        \
    {                                                         \
        VPIStatus status = (STMT);                            \
        if (status != VPI_SUCCESS)                            \
        {                                                     \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
            std::ostringstream ss;                            \
            ss << vpiStatusGetName(status) << ": " << buffer; \
            throw std::runtime_error(ss.str());               \
        }                                                     \
    } while (0);

// Utility to draw the bounding boxes into an image; the caller writes the
// result to the output video.
static cv::Mat WriteKLTBoxes(VPIImage img, VPIArray boxes, VPIArray preds)
{
    // Convert img into a cv::Mat
    cv::Mat out;
    {
        VPIImageData imgdata;
        CHECK_STATUS(vpiImageLock(img, VPI_LOCK_READ, &imgdata));

        int cvtype;
        switch (imgdata.format)
        {
        case VPI_IMAGE_FORMAT_U8:
            cvtype = CV_8U;
            break;

        case VPI_IMAGE_FORMAT_S8:
            cvtype = CV_8S;
            break;

        case VPI_IMAGE_FORMAT_U16:
            cvtype = CV_16UC1;
            break;

        case VPI_IMAGE_FORMAT_S16:
            cvtype = CV_16SC1;
            break;

        default:
            throw std::runtime_error("Image type not supported");
        }

        cv::Mat cvimg(imgdata.planes[0].height, imgdata.planes[0].width, cvtype, imgdata.planes[0].data,
                      imgdata.planes[0].pitchBytes);

        if (cvimg.type() == CV_16U)
        {
            cvimg.convertTo(out, CV_8U);
            cvimg = out;
            out   = cv::Mat();
        }

        cvtColor(cvimg, out, cv::COLOR_GRAY2BGR);

        CHECK_STATUS(vpiImageUnlock(img));
    }

    // Now draw the bounding boxes.
    VPIArrayData boxdata;
    CHECK_STATUS(vpiArrayLock(boxes, VPI_LOCK_READ, &boxdata));

    VPIArrayData preddata;
    CHECK_STATUS(vpiArrayLock(preds, VPI_LOCK_READ, &preddata));

    auto *pboxes = reinterpret_cast<VPIKLTTrackedBoundingBox *>(boxdata.data);
    auto *ppreds = reinterpret_cast<VPIHomographyTransform2D *>(preddata.data);

    // Use random high-saturated colors
    static std::vector<cv::Vec3b> colors;
    if ((int)colors.size() != *boxdata.sizePointer)
    {
        colors.resize(*boxdata.sizePointer);

        cv::RNG rand(1);
        for (size_t i = 0; i < colors.size(); ++i)
        {
            colors[i] = cv::Vec3b(rand.uniform(0, 180), 255, 255);
        }
        cvtColor(colors, colors, cv::COLOR_HSV2BGR);
    }

    // For each tracked bounding box...
    for (int i = 0; i < *boxdata.sizePointer; ++i)
    {
        if (pboxes[i].trackingStatus == 1)
        {
            continue;
        }

        float x, y, w, h;
        x = pboxes[i].bbox.xform.mat3[0][2] + ppreds[i].mat3[0][2];
        y = pboxes[i].bbox.xform.mat3[1][2] + ppreds[i].mat3[1][2];
        w = pboxes[i].bbox.width * pboxes[i].bbox.xform.mat3[0][0] * ppreds[i].mat3[0][0];
        h = pboxes[i].bbox.height * pboxes[i].bbox.xform.mat3[1][1] * ppreds[i].mat3[1][1];

        rectangle(out, cv::Rect(x, y, w, h), cv::Scalar(colors[i][0], colors[i][1], colors[i][2]), 2);
    }

    CHECK_STATUS(vpiArrayUnlock(preds));
    CHECK_STATUS(vpiArrayUnlock(boxes));

    return out;
}

int main(int argc, char *argv[])
{
    // OpenCV images that will be wrapped by VPIImages.
    // Define them here so that they're destroyed *after* the wrappers are destroyed
    cv::Mat cvTemplate, cvReference;

    // Arrays that will store our input bboxes and predicted transforms.
    VPIArray inputBoxList = NULL, inputPredList = NULL;

    // Other VPI objects that will be used
    VPIStream stream         = NULL;
    VPIArray outputBoxList   = NULL;
    VPIArray outputEstimList = NULL;
    VPIPayload klt           = NULL;
    VPIImage imgReference    = NULL;
    VPIImage imgTemplate     = NULL;

    int retval = 0;
    try
    {
        if (argc != 4)
        {
            throw std::runtime_error(std::string("Usage: ") + argv[0] + " <cpu|pva|cuda> <input_video> <bbox descr>");
        }

        std::string strBackend     = argv[1];
        std::string strInputVideo  = argv[2];
        std::string strInputBBoxes = argv[3];

        // Load the input video
        cv::VideoCapture invid;
        if (!invid.open(strInputVideo))
        {
            throw std::runtime_error("Can't open '" + strInputVideo + "'");
        }

        // Open the output video for writing using the input's characteristics
#if CV_MAJOR_VERSION >= 3
        int w                      = invid.get(cv::CAP_PROP_FRAME_WIDTH);
        int h                      = invid.get(cv::CAP_PROP_FRAME_HEIGHT);
        int fourcc                 = cv::VideoWriter::fourcc('a', 'v', 'c', '1');
        double fps                 = invid.get(cv::CAP_PROP_FPS);
        std::string extOutputVideo = ".mp4";
#else
        // MP4 support with OpenCV-2.4 has issues, we'll use
        // avi/mpeg instead.
        int w                      = invid.get(CV_CAP_PROP_FRAME_WIDTH);
        int h                      = invid.get(CV_CAP_PROP_FRAME_HEIGHT);
        int fourcc                 = CV_FOURCC('M', 'P', 'E', 'G');
        double fps                 = invid.get(CV_CAP_PROP_FPS);
        std::string extOutputVideo = ".avi";
#endif

        cv::VideoWriter outVideo("klt_" + strBackend + extOutputVideo, fourcc, fps, cv::Size(w, h));
        if (!outVideo.isOpened())
        {
            throw std::runtime_error("Can't create output video");
        }

        // Load the bounding boxes
        // Format is: <frame number> <bbox_x> <bbox_y> <bbox_width> <bbox_height>
        // Important assumption: bboxes must be sorted with increasing frame numbers.

        // The VPI arrays will actually wrap these vectors.
        std::vector<VPIKLTTrackedBoundingBox> bboxes;
        int32_t bboxesSize = 0;
        std::vector<VPIHomographyTransform2D> preds;
        int32_t predsSize = 0;

        // Stores how many bboxes there are in each frame. Only
        // stores an entry when the bbox count changes.
        std::map<int, size_t> bboxes_size_at_frame; // frame -> bbox count

        // PVA requires that the array capacity is 128.
        bboxes.reserve(128);
        preds.reserve(128);

        // Read bounding boxes
        {
            std::ifstream in(strInputBBoxes);
            if (!in)
            {
                throw std::runtime_error("Can't open '" + strInputBBoxes + "'");
            }

            // For each bounding box,
            int frame, x, y, w, h;
            while (in >> frame >> x >> y >> w >> h)
            {
                if (bboxes.size() == 64)
                {
                    throw std::runtime_error("Too many bounding boxes");
                }

                // Convert the axis-aligned bounding box into our tracking
                // structure.

                VPIKLTTrackedBoundingBox track = {};
                // scale
                track.bbox.xform.mat3[0][0] = 1;
                track.bbox.xform.mat3[1][1] = 1;
                // position
                track.bbox.xform.mat3[0][2] = x;
                track.bbox.xform.mat3[1][2] = y;
                // must be 1
                track.bbox.xform.mat3[2][2] = 1;

                track.bbox.width     = w;
                track.bbox.height    = h;
                track.trackingStatus = 0; // valid tracking
                track.templateStatus = 1; // must update

                bboxes.push_back(track);

                // Identity predicted transform.
                VPIHomographyTransform2D xform = {};
                xform.mat3[0][0]               = 1;
                xform.mat3[1][1]               = 1;
                xform.mat3[2][2]               = 1;
                preds.push_back(xform);

                bboxes_size_at_frame[frame] = bboxes.size();
            }

            if (!in && !in.eof())
            {
                throw std::runtime_error("Can't parse bounding boxes, stopped at bbox #" +
                                         std::to_string(bboxes.size()));
            }

            // Wrap the input arrays into VPIArray's
            VPIArrayData data = {};
            data.type         = VPI_ARRAY_TYPE_KLT_TRACKED_BOUNDING_BOX;
            data.capacity     = bboxes.capacity();
            data.sizePointer  = &bboxesSize;
            data.data         = &bboxes[0];
            CHECK_STATUS(vpiArrayCreateHostMemWrapper(&data, 0, &inputBoxList));

            data.type        = VPI_ARRAY_TYPE_HOMOGRAPHY_TRANSFORM_2D;
            data.sizePointer = &predsSize;
            data.data        = &preds[0];
            CHECK_STATUS(vpiArrayCreateHostMemWrapper(&data, 0, &inputPredList));
        }

        // Now parse the backend
        VPIBackend backend;

        if (strBackend == "cpu")
        {
            backend = VPI_BACKEND_CPU;
        }
        else if (strBackend == "cuda")
        {
            backend = VPI_BACKEND_CUDA;
        }
        else if (strBackend == "pva")
        {
            backend = VPI_BACKEND_PVA;
        }
        else
        {
            throw std::runtime_error("Backend '" + strBackend +
                                     "' not recognized, it must be either cpu, cuda or pva.");
        }

        // Create the stream for the given backend.
        CHECK_STATUS(vpiStreamCreate(backend, &stream));

        // Helper function to fetch a frame from the input
        int nextFrame   = 0;
        auto fetchFrame = [&invid, &nextFrame, backend]() {
            cv::Mat frame;
            if (!invid.read(frame))
            {
                return cv::Mat();
            }

            // We only support grayscale inputs
            if (frame.channels() == 3)
            {
                cvtColor(frame, frame, cv::COLOR_BGR2GRAY);
            }

            if (backend == VPI_BACKEND_PVA)
            {
                // PVA only supports 16-bit unsigned inputs,
                // where each element is in the 0-255 range, so
                // no rescaling is needed.
                cv::Mat aux;
                frame.convertTo(aux, CV_16U);
                frame = aux;
            }
            else
            {
                assert(frame.type() == CV_8U);
            }

            ++nextFrame;
            return frame;
        };

        // Fetch the first frame and wrap it into a VPIImage.
        // Templates will be based on this frame.
        cvTemplate = fetchFrame();
        CHECK_STATUS(vpiImageCreateOpenCVMatWrapper(cvTemplate, 0, &imgTemplate));

        // Create the reference image wrapper. Let's wrap cvTemplate for now just
        // to create the wrapper. Later we'll set it to wrap the actual reference image.
        CHECK_STATUS(vpiImageCreateOpenCVMatWrapper(cvTemplate, 0, &imgReference));

        VPIImageFormat imgFormat;
        CHECK_STATUS(vpiImageGetFormat(imgTemplate, &imgFormat));

        // Using this first frame's characteristics, create a KLT Bounding Box Tracker payload.
        // We're limiting the template dimensions to 64x64.
        CHECK_STATUS(vpiCreateKLTFeatureTracker(backend, cvTemplate.cols, cvTemplate.rows, imgFormat, NULL, &klt));

        // Parameters we'll use. No need to change them on the fly, so just define them here.
        VPIKLTFeatureTrackerParams params = {};
        CHECK_STATUS(vpiInitKLTFeatureTrackerParams(&params));

        // Output array with the estimated bbox for the current frame.
        CHECK_STATUS(vpiArrayCreate(128, VPI_ARRAY_TYPE_KLT_TRACKED_BOUNDING_BOX, 0, &outputBoxList));

        // Output array with the estimated transform of the input bbox to match the output bbox.
        CHECK_STATUS(vpiArrayCreate(128, VPI_ARRAY_TYPE_HOMOGRAPHY_TRANSFORM_2D, 0, &outputEstimList));

        size_t curNumBoxes = 0;

        do
        {
            size_t curFrame = nextFrame - 1;

            // Get the number of bounding boxes in the current frame.
            auto tmp          = --bboxes_size_at_frame.upper_bound(curFrame);
            size_t bbox_count = tmp->second;

            assert(bbox_count >= curNumBoxes && "input bounding boxes must be sorted by frame");

            // Does the current frame have new bounding boxes?
            if (curNumBoxes != bbox_count)
            {
                // Update the input array sizes; the new boxes are already there, as we populated
                // these arrays with all input bounding boxes.
                CHECK_STATUS(vpiArrayLock(inputBoxList, VPI_LOCK_READ_WRITE, NULL));
                CHECK_STATUS(vpiArraySetSize(inputBoxList, bbox_count));
                CHECK_STATUS(vpiArrayUnlock(inputBoxList));

                CHECK_STATUS(vpiArrayLock(inputPredList, VPI_LOCK_READ_WRITE, NULL));
                CHECK_STATUS(vpiArraySetSize(inputPredList, bbox_count));
                CHECK_STATUS(vpiArrayUnlock(inputPredList));

                for (size_t i = 0; i < bbox_count - curNumBoxes; ++i)
                {
                    std::cout << curFrame << " -> new " << curNumBoxes + i << std::endl;
                }
                assert(bbox_count <= bboxes.capacity());
                assert(bbox_count <= preds.capacity());

                curNumBoxes = bbox_count;
            }

            // Write this frame to the output video.
            outVideo << WriteKLTBoxes(imgTemplate, inputBoxList, inputPredList);

            // Fetch a new frame
            cvReference = fetchFrame();

            // Video ended?
            if (cvReference.data == NULL)
            {
                // Just end gracefully.
                break;
            }

            // Make the reference wrapper point to the reference frame
            CHECK_STATUS(vpiImageSetWrappedOpenCVMat(imgReference, cvReference));

            // Estimate the bounding boxes in the current frame (reference) given their
            // position in the previous frame (template).
            CHECK_STATUS(vpiSubmitKLTFeatureTracker(stream, backend, klt, imgTemplate, inputBoxList, inputPredList,
                                                    imgReference, outputBoxList, outputEstimList, &params));

            // Wait for processing to finish.
            CHECK_STATUS(vpiStreamSync(stream));

            // Now lock the output arrays to properly set up the input for the next iteration.
            VPIArrayData updatedBBoxData;
            CHECK_STATUS(vpiArrayLock(outputBoxList, VPI_LOCK_READ, &updatedBBoxData));

            VPIArrayData estimData;
            CHECK_STATUS(vpiArrayLock(outputEstimList, VPI_LOCK_READ, &estimData));

            auto *updated_bbox = reinterpret_cast<VPIKLTTrackedBoundingBox *>(updatedBBoxData.data);
            auto *estim        = reinterpret_cast<VPIHomographyTransform2D *>(estimData.data);

            // For each bounding box,
            for (size_t b = 0; b < curNumBoxes; ++b)
            {
                // Did tracking fail?
                if (updated_bbox[b].trackingStatus)
                {
                    // Do we have to update the input bbox's tracking status too?
                    if (bboxes[b].trackingStatus == 0)
                    {
                        std::cout << curFrame << " -> dropped " << b << std::endl;
                        bboxes[b].trackingStatus = 1;
                    }

                    continue;
                }

                // Must the template for this bounding box be updated?
                if (updated_bbox[b].templateStatus)
                {
                    std::cout << curFrame << " -> update " << b << std::endl;

                    // There are usually two approaches here:
                    // 1. Redefine the bounding box using a feature detector such as
                    //    \ref algo_harris_corners "Harris keypoint detector", or
                    // 2. Use updated_bbox[b], which is still valid, although tracking
                    //    errors might accumulate over time.
                    //
                    // We'll go with the second option: less robust, but simple enough
                    // to implement.
                    bboxes[b] = updated_bbox[b];

                    // Signal the input that the template for this bounding box must be updated.
                    bboxes[b].templateStatus = 1;

                    // The predicted transform is now identity, as we reset the tracking.
                    preds[b]            = VPIHomographyTransform2D{};
                    preds[b].mat3[0][0] = 1;
                    preds[b].mat3[1][1] = 1;
                    preds[b].mat3[2][2] = 1;
                }
                else
                {
                    // Inform that the template for this bounding box doesn't need to be updated.
                    bboxes[b].templateStatus = 0;

                    // We just update the input transform with the estimated one.
                    preds[b] = estim[b];
                }
            }

            // We're finished working with the output arrays.
            CHECK_STATUS(vpiArrayUnlock(outputBoxList));
            CHECK_STATUS(vpiArrayUnlock(outputEstimList));

            // Since we've updated the input arrays, tell VPI to invalidate
            // any internal buffers that might still refer to the old data.
            CHECK_STATUS(vpiArrayInvalidate(inputBoxList));
            CHECK_STATUS(vpiArrayInvalidate(inputPredList));

            // The current reference frame becomes the next iteration's template.
            std::swap(imgTemplate, imgReference);
            std::swap(cvTemplate, cvReference);
        } while (true);
    }
    catch (std::exception &e)
    {
        std::cerr << e.what() << std::endl;
        retval = 1;
    }

    vpiStreamDestroy(stream);
    vpiPayloadDestroy(klt);
    vpiArrayDestroy(inputBoxList);
    vpiArrayDestroy(inputPredList);
    vpiArrayDestroy(outputBoxList);
    vpiArrayDestroy(outputEstimList);
    vpiImageDestroy(imgReference);
    vpiImageDestroy(imgTemplate);

    return retval;
}

// vim: ts=8:sw=4:sts=4:et:ai