Overview

This application tracks bounding boxes on an input video, draws them on each frame, and saves the result in a sequence of image files. You can define which backend will be used for processing.
It serves as a simple example (or skeleton) of how the DCF Tracker algorithm can be implemented in a pipeline.
Note: The sample implements a simplistic tracking pipeline that yields poor tracking quality. For production-grade tracking quality, you must implement proper object lifetime management and bounding box refinement stages in the code section named "Custom target update".
Instructions

The command line parameters are:
./vpi_sample_19_dcf_tracker <backend> <input video> <input bboxes>
where
backend: either cuda or pva; it defines the backend that will perform the processing.
input video: input video file name, it accepts all video types that OpenCV's cv::VideoCapture accepts.
input bboxes: file with input bounding boxes and in what frame they appear. The file is composed of multiple lines with the following format:
```
  <target_id> <frame> <bbox_x> <bbox_y> <bbox_width> <bbox_height>
```
Here's one example:
C++
./vpi_sample_19_dcf_tracker cuda ../assets/pedestrians.mp4 ../assets/pedestrians_bboxes.txt
This is using the CUDA backend and one of the provided sample videos and bounding boxes. It'll render the tracked bounding boxes into a sequence of images that are then saved to disk.
Results

Simple Tracking Result
Note: Video output requires HTML5-capable browser that supports H.264 mp4 video decoding.
Source Code

For convenience, here's the code that is also installed in the samples directory.
Language: C++
 #include <opencv2/core.hpp>
 #include <opencv2/features2d.hpp>
 #include <opencv2/imgcodecs.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/videoio.hpp>
 #include <vpi/OpenCVInterop.hpp>
  
 #include <vpi/Array.h>
 #include <vpi/Image.h>
 #include <vpi/Pyramid.h>
 #include <vpi/Status.h>
 #include <vpi/Stream.h>
 #include <vpi/algo/ConvertImageFormat.h>
 #include <vpi/algo/CropScaler.h>
 #include <vpi/algo/DCFTracker.h>
  
 #include <algorithm>
 #include <cassert>
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <iostream>
 #include <list>
 #include <map>
 #include <numeric>
 #include <optional>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 #include <vector>
  
 #define CHECK_STATUS(STMT)                                    \
     do                                                        \
     {                                                         \
         VPIStatus status = (STMT);                            \
         if (status != VPI_SUCCESS)                            \
         {                                                     \
             char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
             vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
             std::ostringstream ss;                            \
             ss << vpiStatusGetName(status) << ": " << buffer; \
             throw std::runtime_error(ss.str());               \
         }                                                     \
     } while (0);
  
 namespace {
  
 // Information about the target track.
 struct TrackInfo
 {
     int idTarget;
     cv::Scalar color;
     bool enabled; // whether target is lost or not.
 };
  
 // idTarget -> track info.
 // Pointers to the mapped TrackInfo objects are stored in the targets' 'userData';
 // std::map keeps element addresses stable when other entries are inserted.
 using TargetTrackInfoMap = std::map<int, TrackInfo>;
  
 // Stores information about a detected target.
 struct DetectedTargetInfo
 {
     int idTarget;
  
     VPIAxisAlignedBoundingBoxF32 bbox;
  
     bool lostTrack() const
     {
         return bbox.width == 0 || bbox.height == 0;
     }
 };
  
 // frame index -> info of a target detected in that frame.
 // It's a multimap because several targets can be listed for the same frame.
 using DetectedTargetInfoMap = std::multimap<int, DetectedTargetInfo>;
  
 VPIBackend ParseBackend(const std::string &str)
 {
     if (str == "cuda")
     {
         return VPI_BACKEND_CUDA;
     }
     else if (str == "pva")
     {
         return VPI_BACKEND_PVA;
     }
     else
     {
         throw std::runtime_error("Backend '" + str + "' not recognized, it must be either cuda or pva.");
     }
 }
  
 // Opens the video given by its file name.
 cv::VideoCapture ParseVideo(const std::string &fname)
 {
     cv::VideoCapture video;
     if (!video.open(fname))
     {
         throw std::runtime_error("Can't open '" + fname + "'");
     }
     return video;
 }
  
 // Parse the target bounding boxes at the frame they show up.
 DetectedTargetInfoMap ParseTargetInfoAtFrame(const std::string &fname)
 {
     std::ifstream in(fname);
     if (!in)
     {
         throw std::runtime_error("Can't open '" + fname + "'");
     }
  
     DetectedTargetInfoMap out;
  
     // For each bounding box,
     int frame;
     DetectedTargetInfo tinfo;
     while (in >> tinfo.idTarget >> frame >> tinfo.bbox.left >> tinfo.bbox.top >> tinfo.bbox.width >> tinfo.bbox.height)
     {
         out.emplace(frame, tinfo);
     }
  
     return out;
 }
  
 // Returns random high-saturated colors.
 cv::Scalar GetRandomColor(cv::RNG &rng)
 {
     std::vector<cv::Vec3b> color = {cv::Vec3b{(unsigned char)rng.uniform(0, 180), 255, 255}};
     cvtColor(color, color, cv::COLOR_HSV2BGR);
     return cv::Scalar(color[0][0], color[0][1], color[0][2], 255);
 }
  
 // Adds to 'targets' the new targets found in frame 'idxFrame'.
 bool AddNewTargetsFromFrame(int idxFrame, DetectedTargetInfoMap &tgtInfos, TargetTrackInfoMap &trackInfo,
                             VPIArrayData &targets)
 {
     // Tries to add the new target in the slot of an existing target
     // whose tracking was lost. If these no such targets, will append to the
     // end of the array.
  
     auto *pTarget        = static_cast<VPIDCFTrackedBoundingBox *>(targets.buffer.aos.data);
     const auto *tgtBegin = pTarget;
  
     static cv::RNG rng(1);
  
     // For all new targets in 'idxFrame',
     auto tgtInfoRange = tgtInfos.equal_range(idxFrame);
     for (auto it = tgtInfoRange.first; it != tgtInfoRange.second; ++it)
     {
         // If info indicates the target's track has finished,
         if (it->second.lostTrack())
         {
             // skip it, we're only adding new targets here.
             continue;
         }
  
         // If the corresponding target is enabled (i.e. is being tracked)
         auto itTrackInfo = trackInfo.find(it->second.idTarget);
         if (itTrackInfo != trackInfo.end() && itTrackInfo->second.enabled)
         {
             // also skip it
             continue;
         }
  
         // @note: when an array is allocated, its content is filled with zeroes,
         // up to its capacity. This implies that the state of the targets is lost.
         static_assert(VPI_TRACKING_STATE_LOST == 0, "Unexpected value for lost state");
  
         // Search for the first target whose tracking was lost.
         while (pTarget->state != VPI_TRACKING_STATE_LOST && pTarget < tgtBegin + targets.buffer.aos.capacity)
         {
             ++pTarget;
         }
  
         assert(pTarget < tgtBegin + targets.buffer.aos.capacity);
  
         pTarget->bbox     = it->second.bbox;
         pTarget->state    = VPI_TRACKING_STATE_NEW;
         pTarget->seqIndex = 0;
         // Reasonable defaults.
         pTarget->filterLR               = 0.075;
         pTarget->filterChannelWeightsLR = 0.1;
  
         // Is it the first time we're seeing this target?
         if (itTrackInfo == trackInfo.end())
         {
             // Create a track info for it.
             TrackInfo tinfo;
             tinfo.idTarget = it->second.idTarget;
             tinfo.color    = GetRandomColor(rng);
             tinfo.enabled  = true;
             itTrackInfo    = trackInfo.emplace(tinfo.idTarget, tinfo).first;
         }
         else
         {
             // It's now enabled.
             itTrackInfo->second.enabled = true;
         }
  
         pTarget->userData = &itTrackInfo->second;
  
         ++pTarget;
     }
  
     // Update the array size only if we've appended targets to the end of the array.
     *targets.buffer.aos.sizePointer = std::max<int32_t>(*targets.buffer.aos.sizePointer, pTarget - tgtBegin);
  
     assert(*targets.buffer.aos.sizePointer >= 0);
  
     return true;
 }
  
 // Mark as lost the targets whose bounding box falls outside the frame area, or are deemed lost by the detector.
 bool DetectTrackingLost(int idxFrame, DetectedTargetInfoMap &tgtInfos, VPIArrayData &targets, cv::Size frameSize)
 {
     auto tgtInfoRange = tgtInfos.equal_range(idxFrame);
  
     // This is a simplistic method that isn't reliable in a robust tracker.
     // A robust method needs to be implemented by the user.
  
     bool atLeastOneLost = false;
  
     // For all targets, back to front so that we can easily reduce array size if needed.
     for (auto *pBeginTarget = static_cast<VPIDCFTrackedBoundingBox *>(targets.buffer.aos.data),
               *pTarget      = pBeginTarget + *targets.buffer.aos.sizePointer - 1;
          pTarget >= pBeginTarget; --pTarget)
     {
         bool trackingLost = false;
  
         // Is it a valid target but its bounding box isn't entirely inside the frame,
         if (pTarget->state != VPI_TRACKING_STATE_LOST && (pTarget->bbox.left < 0 || pTarget->bbox.top < 0 ||
                                                           pTarget->bbox.left + pTarget->bbox.width > frameSize.width ||
                                                           pTarget->bbox.top + pTarget->bbox.height > frameSize.height))
         {
             // Consider its tracking to be lost.
             trackingLost = true;
         }
         else
         {
             // Go through all target infos in current frame
             for (auto itInfo = tgtInfoRange.first; itInfo != tgtInfoRange.second; ++itInfo)
             {
                 // Is it the info of the current target, and the tracking is lost?
                 if (pTarget->state != VPI_TRACKING_STATE_LOST &&
                     static_cast<const TrackInfo *>(pTarget->userData)->idTarget == itInfo->second.idTarget &&
                     itInfo->second.lostTrack())
                 {
                     // Flag it,
                     trackingLost = true;
                     break;
                 }
             }
         }
  
         if (trackingLost)
         {
             atLeastOneLost = true;
  
             // Update the target state to reflect it.
             pTarget->state                                       = VPI_TRACKING_STATE_LOST;
             static_cast<TrackInfo *>(pTarget->userData)->enabled = false;
  
             assert(*targets.buffer.aos.sizePointer >= 1);
  
             // If the target is at the end of the target array,
             if (pTarget == pBeginTarget + *targets.buffer.aos.sizePointer - 1)
             {
                 // We can reduce the array size to improve tracking processing times.
                 *targets.buffer.aos.sizePointer = -1;
             }
         }
     }
  
     return atLeastOneLost;
 }
  
 // Update target's bounding box with input from detector output.
 bool RefineTracksAtFrame(int idxFrame, DetectedTargetInfoMap &tgtInfos, VPIArrayData &targets)
 {
     auto tgtInfoRange = tgtInfos.equal_range(idxFrame);
  
     bool atLeastOneUpdated = false;
  
     for (auto *pBeginTarget = static_cast<VPIDCFTrackedBoundingBox *>(targets.buffer.aos.data), *pTarget = pBeginTarget;
          pTarget < pBeginTarget + *targets.buffer.aos.sizePointer; ++pTarget)
     {
         // If tracking is lost,
         if (pTarget->state == VPI_TRACKING_STATE_LOST)
         {
             // there's nothing to refine.
             continue;
         }
  
         bool found = false;
  
         // For all targets in 'idxFrame',
         for (auto itInfo = tgtInfoRange.first; itInfo != tgtInfoRange.second; ++itInfo)
         {
             // If info indicates the tracking is lost,
             if (itInfo->second.lostTrack())
             {
                 // skip it, we're only updating existing targets.
                 continue;
             }
  
             if ((pTarget->state == VPI_TRACKING_STATE_TRACKED || pTarget->state == VPI_TRACKING_STATE_SHADOW_TRACKED) &&
                 static_cast<const TrackInfo *>(pTarget->userData)->idTarget == itInfo->second.idTarget)
             {
                 pTarget->bbox = itInfo->second.bbox;
                 found         = true;
                 break;
             }
         }
  
         if (found)
         {
             atLeastOneUpdated = true;
             pTarget->state    = VPI_TRACKING_STATE_TRACKED;
         }
         else if (pTarget->state == VPI_TRACKING_STATE_TRACKED)
         {
             pTarget->state = VPI_TRACKING_STATE_SHADOW_TRACKED;
         }
     }
  
     return atLeastOneUpdated;
 }
  
 void DrawTargets(cv::Mat &frame, VPIArray targets)
 {
     VPIArrayData tgtData;
     CHECK_STATUS(vpiArrayLockData(targets, VPI_LOCK_READ, VPI_ARRAY_BUFFER_HOST_AOS, &tgtData));
  
     auto *ptgt  = static_cast<VPIDCFTrackedBoundingBox *>(tgtData.buffer.aos.data);
     int numObjs = *tgtData.buffer.aos.sizePointer;
  
     for (int o = 0; o < numObjs; ++o, ++ptgt)
     {
         // Only draw objects that are not lost
         if (ptgt->state == VPI_TRACKING_STATE_LOST)
         {
             continue;
         }
  
         auto &tinfo = *static_cast<TrackInfo *>(ptgt->userData);
  
         rectangle(frame,
                   cv::Rect{(int)ptgt->bbox.left, (int)ptgt->bbox.top, (int)ptgt->bbox.width, (int)ptgt->bbox.height},
                   tinfo.color);
     }
  
     CHECK_STATUS(vpiArrayUnlock(targets));
 }
  
 void WriteToDisk(const cv::Mat &img, std::string name, int idx)
 {
     char buf[128];
     snprintf(buf, sizeof(buf) - 1, "%s_%03d.jpg", name.c_str(), idx);
     buf[sizeof(buf) - 1] = '\0';
  
     imwrite(buf, img);
 }
  
 void PreprocessFrame(VPIStream stream, const cv::Mat &in, VPIImage &wrapper, VPIImage out)
 {
     // Pre-process current frame
     if (wrapper == NULL)
     {
         CHECK_STATUS(vpiImageCreateWrapperOpenCVMat(in, 0, &wrapper));
     }
     else
     {
         CHECK_STATUS(vpiImageSetWrappedOpenCVMat(wrapper, in));
     }
  
     CHECK_STATUS(vpiSubmitConvertImageFormat(stream, VPI_BACKEND_CUDA, wrapper, out, NULL));
 }
  
 } // namespace
  
 int main(int argc, char *argv[])
 {
     VPIPayload cropScale = NULL;
     VPIPayload dcf       = NULL;
     VPIStream stream     = NULL;
     VPIArray inTargets = NULL, outTargets = NULL;
     VPIImage tgtPatches      = NULL;
     VPIImage frame           = NULL;
     VPIImage wrappedOCVFrame = NULL;
  
     int retval = 0;
     try
     {
         // Command line argument processing
         // --------------------------------
         if (argc != 4)
         {
             throw std::runtime_error(std::string("Usage: ") + argv[0] + " <pva|cuda> <input_video> <bbox descr>");
         }
  
         VPIBackend backend                      = ParseBackend(argv[1]);
         cv::VideoCapture invid                  = ParseVideo(argv[2]);
         DetectedTargetInfoMap targetInfoAtFrame = ParseTargetInfoAtFrame(argv[3]);
  
         TargetTrackInfoMap trackInfo;
  
         // Allocate all VPI resources needed
         // ---------------------------------
  
         const int maxTrackedTargets = targetInfoAtFrame.size();
  
         // Create the CropScale payload
         CHECK_STATUS(vpiCreateCropScaler(backend,
                                          1, // max number of sequences (only processing one video)
                                          maxTrackedTargets, &cropScale));
  
         // Configure and create the DCFTracker payload
         VPIDCFTrackerCreationParams dcfInitParams;
         CHECK_STATUS(vpiInitDCFTrackerCreationParams(&dcfInitParams));
  
         VPIPayload dcf;
         CHECK_STATUS(vpiCreateDCFTracker(backend,
                                          1, // max number of sequences
                                          maxTrackedTargets, &dcfInitParams, &dcf));
  
         // Optionally user can retrieve the internal array that stores the channel weights and
         // the maximum correlation response for each tracked target. These can be used, together
         // with the correlation map, to decide whether tracking was lost or not.
         // This is not being done in this sample, though.
         /*
         VPIArray channelWeights;
         int32_t numFeatureChannels
         CHECK_STATUS(vpiDCFTrackerGetChannelWeights(dcf, &channelWeights, &numFeatureChannels));
         */
  
         // Create target arrays
         VPIArray inTargets, outTargets;
         CHECK_STATUS(vpiArrayCreate(maxTrackedTargets, VPI_ARRAY_TYPE_DCF_TRACKED_BOUNDING_BOX, 0, &inTargets));
         CHECK_STATUS(vpiArrayCreate(maxTrackedTargets, VPI_ARRAY_TYPE_DCF_TRACKED_BOUNDING_BOX, 0, &outTargets));
  
         // Create image that stores the targets' patches
         const int tgtPatchSize              = dcfInitParams.featurePatchSize * dcfInitParams.hogCellSize;
         const VPIImageFormat tgtPatchFormat = backend == VPI_BACKEND_PVA
                                                   ? VPI_IMAGE_FORMAT_RGB8p
                                                   : VPI_IMAGE_FORMAT_RGBA8; // use supported patch-format for backend
  
         CHECK_STATUS(vpiImageCreate(tgtPatchSize, tgtPatchSize * maxTrackedTargets, tgtPatchFormat, 0, &tgtPatches));
  
         // Create stream for processing
         CHECK_STATUS(vpiStreamCreate(0, &stream));
  
         // Create the image that stores the input frame
         CHECK_STATUS(vpiImageCreate(invid.get(cv::CAP_PROP_FRAME_WIDTH), invid.get(cv::CAP_PROP_FRAME_HEIGHT),
                                     VPI_IMAGE_FORMAT_RGBA8, 0, &frame));
  
         // Target tracking
         // ---------------
  
         int curFrame = 0;
  
         // Populate the targets array with targets found the first frame.
         VPIArrayData tgtData;
         CHECK_STATUS(vpiArrayLockData(inTargets, VPI_LOCK_READ_WRITE, VPI_ARRAY_BUFFER_HOST_AOS, &tgtData));
         try
         {
             AddNewTargetsFromFrame(curFrame, targetInfoAtFrame, trackInfo, tgtData);
         }
         catch (...)
         {
             CHECK_STATUS(vpiArrayUnlock(inTargets));
             throw;
         }
         CHECK_STATUS(vpiArrayUnlock(inTargets));
  
         // For each input frame,
         cv::Mat cvFrame;
         while (invid.read(cvFrame))
         {
             printf("Frame %d\n", curFrame);
  
             // Transform the opencv frame (cvFrame) into a suitable VPIImage (frame).
             PreprocessFrame(stream, cvFrame, wrappedOCVFrame, frame);
  
             // Crop the targets from the current frame using their bbox from previous iteration,
             // then rescale them into tgtPatches.
             CHECK_STATUS(vpiSubmitCropScalerBatch(stream, 0, cropScale, &frame, 1, inTargets, tgtPatchSize,
                                                   tgtPatchSize, tgtPatches));
  
             // If we're in the first frame,
             VPIArray targets;
             if (curFrame == 0)
             {
                 // The targets are simply the ones found.
                 targets = inTargets;
             }
             else
             {
                 // Localize and refine current targets' bbox in the current frame.
                 CHECK_STATUS(vpiSubmitDCFTrackerLocalizeBatch(stream, 0, dcf, NULL, 0, // process all sequences
                                                               NULL, // feature masking window (not needed)
                                                               tgtPatches, inTargets, outTargets,
                                                               NULL,   // outCorrelationResponses (not needed)
                                                               NULL,   // outMaxCorrelationResponses (not needed)
                                                               NULL)); // algorithm knobs (use defaults)
                 targets = outTargets;
  
                 // Custom target update
                 // --------------------
  
                 // At this point some other additional processing can be done,
                 // such as target lifetime management and bounding box refinement.
                 // It typically uses information from 'outMaxCorrelationResponses',
                 // 'outCorrelationResponses' and/or 'channelWeights'.
                 // Since this processing is usually time consuming,
                 // it is usually performed once every few frames.
  
                 // Since we're updating the target arrays on host, we need to synchronize the stream.
                 CHECK_STATUS(vpiStreamSync(stream));
  
                 // Target patches must be updated if the corresponding target is new, or its bounding
                 // box was refined.
                 bool mustUpdateTargetPatches = false;
  
                 VPIArrayData tgtData;
                 CHECK_STATUS(vpiArrayLockData(targets, VPI_LOCK_READ_WRITE, VPI_ARRAY_BUFFER_HOST_AOS, &tgtData));
                 try
                 {
                     // These functions update the target array based on the
                     // output of a object detector on the current frame. This
                     // detector is responsible for associating the detected
                     // objects with the corresponding existing target (if
                     // possible).
                     //
                     // Based on this information, the functions update the
                     // object lifetime (whether the object is new or tracking
                     // was lost), and also update its bounding box and state.
                     //
                     // For this sample application, the detection and
                     // reassociation is hardcoded in 'targetInfoAtFrame'. For
                     // production-level quality, a robust and generic solution
                     // needs to be implemented by the user.
  
                     // Note: Target update implemented in three separate
                     // functions for exposition purposes only.
  
                     // Detect whether a target tracking was lost and update tgtData accordingly.
                     DetectTrackingLost(curFrame, targetInfoAtFrame, tgtData, cv::Size{cvFrame.cols, cvFrame.rows});
  
                     // Target bounding box refinement
                     mustUpdateTargetPatches |= RefineTracksAtFrame(curFrame, targetInfoAtFrame, tgtData);
  
                     // Detect whether new targets are found in the current frame.
                     mustUpdateTargetPatches |= AddNewTargetsFromFrame(curFrame, targetInfoAtFrame, trackInfo, tgtData);
                 }
                 catch (...)
                 {
                     CHECK_STATUS(vpiArrayUnlock(targets));
                     throw;
                 }
                 CHECK_STATUS(vpiArrayUnlock(targets));
  
                 if (mustUpdateTargetPatches)
                 {
                     // Crop+rescale updated targets and copy them into tgtPatches.
                     CHECK_STATUS(vpiSubmitCropScalerBatch(stream, 0, cropScale, &frame, 1, targets, tgtPatchSize,
                                                           tgtPatchSize, tgtPatches));
                 }
             }
  
             // Update the targets' internal metadata given their new bounding box.
             CHECK_STATUS(vpiSubmitDCFTrackerUpdateBatch(stream, 0, dcf, nullptr, 0, // process all sequences
                                                         NULL,                       // featureMaskingWindow (not needed)
                                                         NULL,                       // modelMaskingWindow (not needed)
                                                         tgtPatches, targets,
                                                         NULL)); // algorithm knobs (use defaults)
  
             // Wait for frame processing to finish
             CHECK_STATUS(vpiStreamSync(stream));
  
             // Write frame to disk
             DrawTargets(cvFrame, targets);
             WriteToDisk(cvFrame, "frame", curFrame);
  
             // Ping-pong the target arrays:
             // Updated targets in this iteration will be input to the next iteration (inTargets),
             // while current input will store the updated targets.
             std::swap(inTargets, targets);
             ++curFrame;
         }
     }
     catch (std::exception &e)
     {
         std::cerr << e.what() << std::endl;
         retval = 1;
     }
  
     // Destroy all VPI resources
     // -------------------------
  
     vpiStreamDestroy(stream);
  
     vpiPayloadDestroy(cropScale);
     vpiPayloadDestroy(dcf);
     vpiArrayDestroy(inTargets);
     vpiArrayDestroy(outTargets);
     vpiImageDestroy(tgtPatches);
     vpiImageDestroy(frame);
     vpiImageDestroy(wrappedOCVFrame);
  
     return retval;
 }
VPI - Vision Programming Interface

3.2 Release

Overview

Instructions

Results

Source Code