Accelerating Bilinear Interpolation via the Sampler APIs

This tutorial introduces how to use the Sampler APIs to accelerate bilinear interpolation computations through PVA’s Decoupled Lookup Table Unit (DLUT). As mentioned in ROI Gather using GSDF Tutorial, one component of the ROI Align Layer is to extract feature map data pertaining to an image Region of Interest (ROI) at indices with an integer and fractional component. Because ROI Align supports fractional offsets for pixel coordinates, the extracted feature map data is obtained via bilinear interpolation. PVA’s DLUT hardware can accelerate this exact type of operation. The DLUT provides high-throughput lookup functionality with the ability to operate in parallel with VPU processing. DLUT features pertinent to the ROI Align layer include:

1D/2D Lookups
Fixed-point or Fixed-point + Fractional Lookup Indices (containing a configurable number of fractional bits)
Post-lookup interpolation
Configurable horizontal and vertical index offsets to support translation between global and local coordinates

This tutorial shows how to set up a Sampler task for lookup with bilinear interpolation, have the VPU trigger the Sampler task, and wait for its completion. Input to this tutorial is an FW x FH feature map, and a list of ROI coordinates relative to a source input image.

Device Code

Since we do not know the ROI coordinates until runtime, the cuPVA SequenceDataFlow APIs are used to facilitate transfers of the feature map data for each ROI. To use the SQDF APIs, we must first initialize the sequence dataflow. The dataflow holds the sequence data transfer information in a specified location in VMEM that the PVA DMA engine accesses. At the beginning of CUPVA_VPU_MAIN we initialize the SQDF dataflow via a call to cupvaSQDFOpen().
```
CUPVA_VPU_MAIN()
{
```
```
    int32_t sqdf_feature_map_vmem_buf = cupvaGetVmemAddress(feature_map_vmem_buf);
    cupvaSQDFOpen(vpu_cfg_tbl);
```

The DLUT unit is set up via a configuration variable of type CupvaSampler, which is configured as follows:

        CupvaSamplerInput2D const input = {
            .data           = feature_map_fixed_pt_vmem_buf,
            .type           = SAMPLER_INPUT_TYPE_S32,
            .width          = (uint32_t)(batch_w - 1),
            .height         = (uint32_t)(batch_h - 1),
            .linePitch      = (uint32_t)batch_w,
            .outOfRangeMode = SAMPLER_OUT_OF_RANGE_CONSTANT,
        };

        CupvaSamplerIndices2D const indices = {
            .data               = indices_buf,
            .type               = SAMPLER_INDEX_TYPE_U32,
            .width              = NUM_DLUT_PIXELS_PER_BATCH,
            .height             = 1U,
            .fractionalBits     = INDEX_QBIT,
            .fractionalHandling = SAMPLER_FRAC_HANDLING_INTERPOLATE,
            .interleaving       = SAMPLER_INTERLEAVING_ELEMENTS,
        };

        CupvaSamplerOutput const output = {
            .data      = sampled_points,
            .transMode = TRANS_MODE_NONE,
        };

        cupvaSamplerSetup(&sampler, &input, &indices, &output);

Here, we wait for the completion of the transfer of the ROI coordinates buffer.

    cupvaRasterDataFlowOpen(coords_trig, &roi_coords[0]);
    cupvaRasterDataFlowAcquire(coords_trig);
    cupvaRasterDataFlowRelease(coords_trig);
    cupvaRasterDataFlowClose(coords_trig);

The ROI coordinates are defined in reference to an input image. These coordinates must be scaled down to the dimensions of the feature map. This is done by dividing the x and y coordinates by the corresponding scaling factor (the ratio of the image dimension to the feature map dimension).

    float scaling_factor_x = ((float)(IMAGE_WIDTH)) / ((float)(FEATURE_MAP_WIDTH));
    float scaling_factor_y = ((float)(IMAGE_HEIGHT)) / ((float)(FEATURE_MAP_HEIGHT));

    for (int32_t i = 0; i < NUM_ROIS; i++)
    {
        roi_feature_map_coords[(i * ROI_INFO_ENTRY_SIZE) + 1] =
            roi_coords[(i * ROI_INFO_ENTRY_SIZE) + 1] / scaling_factor_x;

        roi_feature_map_coords[(i * ROI_INFO_ENTRY_SIZE) + 2] =
            roi_coords[(i * ROI_INFO_ENTRY_SIZE) + 2] / scaling_factor_y;

        roi_feature_map_coords[(i * ROI_INFO_ENTRY_SIZE) + 3] =
            roi_coords[(i * ROI_INFO_ENTRY_SIZE) + 3] / scaling_factor_x;

        roi_feature_map_coords[(i * ROI_INFO_ENTRY_SIZE) + 4] =
            roi_coords[(i * ROI_INFO_ENTRY_SIZE) + 4] / scaling_factor_y;
    }

ROI_INFO_ENTRY_SIZE refers to the number of data items required to describe a single ROI. This number is 5 since each ROI is described by its batch_id and coordinate bounding box values of top-left x, top-left y, bottom-right x, and bottom-right y. roi_feature_map_coords is the name of the buffer that we are writing the scaled ROI info entries to.

We process the ROIs from the feature map in batches in the following loop:

    for (int32_t i = 0; i < NUM_ROIS; i += ROIS_PER_BATCH)
    {

We determine the bounding box dimensions needed to encompass the ROIs in the current batch via the following function:

        ROI2D batch_coord;
        roiAlignGetBatchROIDimensions(i, roi_feature_map_coords, &batch_coord, ROIS_PER_BATCH);

        int32_t top_left_x_int = (int32_t)batch_coord.top_left_x;
        int32_t top_left_y_int = (int32_t)batch_coord.top_left_y;

        int32_t bot_right_x_int = (int32_t)(batch_coord.bot_right_x + 1);
        int32_t bot_right_y_int = (int32_t)(batch_coord.bot_right_y + 1);

        int32_t batch_w = bot_right_x_int - top_left_x_int;
        int32_t batch_h = bot_right_y_int - top_left_y_int;

By iterating through the coordinates for each ROI in the batch, the min/max boundaries are determined such that with one DMA fetch, we are able to fetch a datablock that holds all the feature map ROIs defined for the batch.

The starting DRAM address of the batch block is calculated here:

        uint64_t src_addr_dram = feature_map_dram_buf.base + feature_map_dram_buf.offset +
                                 (((top_left_y_int * FEATURE_MAP_WIDTH) + top_left_x_int) * sizeof(float));

This address is updated for the SQDF transfer parameters.

The SQDF transfer parameters are then updated with the batch bounding box information and the transfer is triggered.

        cupvaSQDFUpdateAddr(vpu_cfg_tbl, 0, src_addr_dram, MEMTYPE_DRAM, FEATURE_MAP_WIDTH * sizeof(float),
                             sqdf_feature_map_vmem_buf, MEMTYPE_VMEM, batch_w * sizeof(float));
        cupvaSQDFUpdateTileSize(vpu_cfg_tbl, 0, batch_w * sizeof(float), batch_h);

        cupvaSQDFFlushAndTrig(vpu_cfg_tbl);
        cupvaSQDFSync(vpu_cfg_tbl);

The DLUT unit operates on fixed-point data so we must convert the input feature map data we just transferred to VMEM to a fixed-point Qformat.

        for (int y = 0; y < batch_h; y++)
        {
            for (int x = 0; x < batch_w; x++) chess_prepare_for_pipelining
            {
                feature_map_fixed_pt_vmem_buf[(y * batch_w) + x] =
                    (int32_t)(feature_map_vmem_buf[(y * batch_w) + x] * (1 << PIXEL_QBIT));
            }
        }

As explained in ROI Gather using GSDF Tutorial, each ROI is sub-divided into bins. Within each bin, we sample points that are pooled into the output feature vector for further analysis by another part of the neural network. In the following function, we calculate the x,y coordinates of each sampling point from each bin.
```
        roiAlignConvertROICoordsToDLUTSamplingCoords(i, roi_feature_map_coords, indices_buf, ROIS_PER_BATCH,
                                                     batch_coord.top_left_x, batch_coord.top_left_y);
```

Now that we have computed the sampling indices for the feature map we must configure the Sampler task via the following Sampler setup APIs:

        CupvaSamplerInput2D const input = {
            .data           = feature_map_fixed_pt_vmem_buf,
            .type           = SAMPLER_INPUT_TYPE_S32,
            .width          = (uint32_t)(batch_w - 1),
            .height         = (uint32_t)(batch_h - 1),
            .linePitch      = (uint32_t)batch_w,
            .outOfRangeMode = SAMPLER_OUT_OF_RANGE_CONSTANT,
        };

        CupvaSamplerIndices2D const indices = {
            .data               = indices_buf,
            .type               = SAMPLER_INDEX_TYPE_U32,
            .width              = NUM_DLUT_PIXELS_PER_BATCH,
            .height             = 1U,
            .fractionalBits     = INDEX_QBIT,
            .fractionalHandling = SAMPLER_FRAC_HANDLING_INTERPOLATE,
            .interleaving       = SAMPLER_INTERLEAVING_ELEMENTS,
        };

        CupvaSamplerOutput const output = {
            .data      = sampled_points,
            .transMode = TRANS_MODE_NONE,
        };

        cupvaSamplerSetup(&sampler, &input, &indices, &output);

These APIs enable the programmer to set a variety of parameters pertaining to how the lookup task should be performed. For this use case, we are performing 2D lookups with fractional coordinates. Bilinear interpolation is used to retrieve data at fractional offsets; it is selected via the “fractionalHandling” field (SAMPLER_FRAC_HANDLING_INTERPOLATE) that is passed to the cupvaSamplerSetup API. The DLUT unit operates on fixed-point indices in Qformat, so we set the number of fractional bits via the “fractionalBits” field in the CupvaSamplerIndices2D struct. The “data” member in the CupvaSamplerIndices2D struct is used to tell the DLUT unit the source address of the lookup indices that we computed in the prior stage.

The Sampler task is started via a call to cupvaSamplerStart with the CupvaSampler typed variable we just initialized being passed as an argument.
```
        cupvaSamplerStart(&sampler);
```
The DLUT unit runs concurrently with a VPU task. We must check for completion of the Sampler task we started in the previous step before trying to use its output.
```
        cupvaSamplerWait();
```

Now that the sampling points have been retrieved we perform average pooling on them to produce the output feature vector.

        roiAlign4x4AvePoolQ(sampled_points, POOL_WIDTH, POOL_HEIGHT, NUM_SAMPLES_PER_BIN, ROIS_PER_BATCH, PIXEL_QBIT,
                            pooled_roi_vmem_buf);

Once the pooling operation is completed we trigger the transfer of the output feature vector for the batch.

        cupvaSQDFTrig(pooledFeatureMapTrig);
        cupvaSQDFSync(pooledFeatureMapTrig);

Once all of the ROIs from the feature map have been processed we close the SQDF.
```
    }

    cupvaSQDFClose(vpu_cfg_tbl);
```

Host Code

C++

This example loads the input feature map into DRAM as follows:

        roiAlignTaskParams params;
        params.feature_map_device = (void *)mem::Alloc(FEATURE_MAP_HEIGHT * FEATURE_MAP_WIDTH * sizeof(float));
        params.feature_map_host   = (void *)mem::GetHostPointer(params.feature_map_device);

        float *feature_map_data_h = (float *)params.feature_map_host;
        if (ReadCSVFloatBuffer(featureMapData.c_str(), assetsDirectory, feature_map_data_h,
                               FEATURE_MAP_WIDTH * FEATURE_MAP_HEIGHT) < 0)
        {
            return 0;
        }

From this feature map we are gathering ROIs and transferring them to VMEM via the cuPVA Sequence DataFlow (SQDF) APIs. The output feature vectors produced by processing the feature map ROIs are transferred to an output buffer (in DRAM) via a second SQDF, while the cuPVA Raster DataFlow (RDF) APIs are used to bring the ROI coordinates into VMEM.

We allocate memory in DRAM to store the top-left and bottom-right (x,y) coordinates of each feature map ROI to be fetched.

        float *coords_d = (float *)mem::Alloc(NUM_ROIS * ROI_INFO_ENTRY_SIZE * sizeof(float));
        float *coords_h = (float *)mem::GetHostPointer(coords_d);
        params.coords_d = coords_d;

        if (ReadCSVFloatBuffer(roiBufferData.c_str(), assetsDirectory, coords_h, NUM_ROIS * ROI_INFO_ENTRY_SIZE) < 0)
        {
            return 0;
        }

We allocate memory in DRAM to store the output feature vectors.

        void *pooled_output_d = (void *)mem::Alloc(NUM_ROIS * POOL_WIDTH * POOL_HEIGHT * sizeof(float));
        void *pooled_output_h = (void *)mem::GetHostPointer(pooled_output_d);

        memset(pooled_output_h, 0, NUM_ROIS * POOL_WIDTH * POOL_HEIGHT * sizeof(float));

        params.pooled_output_d = pooled_output_d;

Syncpoints and stream objects are created.

        SyncObj sync = SyncObj::Create(true);
        Fence fence{sync};
        CmdRequestFences f{fence};

        Stream s = Stream::Create(PVA0, VPU0);

The Executable and CmdProgram objects are created similar to the previous tutorials.
```
    auto prog = CmdProgram::Create(exec);
```

RDFs are used to describe pre-determined data transfer patterns. In the following code, the RDF is set up to bring the complete set of coordinates into VMEM in a single transfer.

    auto coords_trig            = prog["coords_trig"];
    float *coords_v             = prog["roi_coords"].ptr<float>();
    RasterDataFlow &coordsInput = prog.addDataFlowHead<RasterDataFlow>();
    coordsInput.handler(coords_trig)
        .src(params.coords_d, NUM_ROIS * ROI_INFO_ENTRY_SIZE, 1, NUM_ROIS * ROI_INFO_ENTRY_SIZE)
        .tileBuffer(coords_v)
        .tile(NUM_ROIS * ROI_INFO_ENTRY_SIZE, 1);

In a real application, the location of the ROIs is not known at compile time. Therefore, we must use the SequenceDataFlow (SQDF) APIs as they facilitate setting src/dst addresses of DMA transfers at runtime. To use the SQDF APIs we first declare the SQDF head.
```
    SequenceDataFlow &inputDataFlow = prog.addDataFlowHead<SequenceDataFlow>();
```
We then declare the sequence dataflow handle that the VPU uses to maintain the addresses and trigger the SQDF transfers.
```
    auto vpu_cfg_tbl = prog["vpu_cfg_tbl"];
    inputDataFlow.handler(vpu_cfg_tbl);
```
We now set the basic parameters of the SQDF transfer.
```
    inputDataFlow.addTransfer()
        .src(params.feature_map_device, FEATURE_MAP_WIDTH * sizeof(float))
        .dst(feature_map_vmem_buf, FEATURE_MAP_WIDTH * sizeof(float))
        .tile(FEATURE_MAP_WIDTH * sizeof(float), FEATURE_MAP_HEIGHT)
        .mode(TransferModeType::CONTINUOUS);
```
We set the “src” parameter to be the origin of the input feature map in DRAM. The source of the transfer is updated dynamically at runtime via cuPVA SQDF APIs that modify the relevant fields in the SQDF transfer table. In contrast to the prior tutorial, the width and height vary from ROI to ROI. Therefore, in addition to the source address, the tile width, height, and line pitch are also updated at runtime.

We configure a SQDF DataFlow for transferring the output feature vectors from VMEM to DRAM.

    auto pooledFeatureMapTrig    = prog["pooledFeatureMapTrig"];
    SequenceDataFlow &outputStream = prog.addDataFlowHead<SequenceDataFlow>().handler(pooledFeatureMapTrig);

    int tile_offset = POOL_HEIGHT * POOL_WIDTH * ROIS_PER_BATCH;

    outputStream.addTransfer()
        .tile(POOL_WIDTH * POOL_HEIGHT * sizeof(float), ROIS_PER_BATCH)
        .src(pooled_roi_vmem_buf, POOL_WIDTH * POOL_HEIGHT * sizeof(float))
        .srcDim1(NUM_ROIS / ROIS_PER_BATCH, 0)
        .dst(params.pooled_output_d, POOL_WIDTH * POOL_HEIGHT * sizeof(float))
        .dstDim1(NUM_ROIS / ROIS_PER_BATCH, tile_offset * sizeof(float))
        .mode(TransferModeType::TILE);

The dataflows are now ready to be compiled on the host, a required step as mentioned in previous tutorials.
```
    prog.compileDataFlows();
```

The program is submitted to the VPU, and we wait for its completion here.

        s.submit({&prog, &f});

        fence.wait();

We verify the output with the following verification check.

        int errNum = 0;

        float *pooled_output = (float *)pooled_output_h;

        float pooled_output_ref[OUTPUT_BUF_SIZE];
        if (ReadCSVFloatBuffer(outputDataRef.c_str(), assetsDirectory, pooled_output_ref, OUTPUT_BUF_SIZE) < 0)
        {
            return 0;
        }

        for (int n = 0; n < OUTPUT_BUF_SIZE; n++)
        {
            if (std::fabs(pooled_output[n] - pooled_output_ref[n]) > 0.01)
            {
                errNum = 1;
                std::cout << "\nMismatch at num " << n << " abs(opt-ref) -- " << pooled_output[n] << "\t"
                          << pooled_output_ref[n] << std::endl;

                goto end;
            }
        }

    end:
        if (errNum == 0)
        {
            printf("Test Pass\n");
        }
        else
        {
            printf("Test Fail\n");
        }

We then delete the allocated resources for cleanup.

        mem::Free(params.feature_map_device);
        mem::Free(params.pooled_output_d);
        mem::Free(params.coords_d);

The tutorial code is run on the command-line as follows:
```
./decoupled_lookup_unit_cpp -a <Tutorial Assets Directory Path>
```
You should see “Test Pass” reported upon successful execution of the code.

C

This example loads the input feature map into DRAM as follows:

    roiAlignTaskParams gather_params;
    CHECK_ERROR_GOTO(
        CupvaMemAlloc((void **)&gather_params.feature_map_device,
                      FEATURE_MAP_HEIGHT * FEATURE_MAP_WIDTH * sizeof(float), CUPVA_READ_WRITE, CUPVA_ALLOC_DRAM),
        err, MemAllocFailed);

    CHECK_ERROR_GOTO(CupvaMemGetHostPointer((void **)&gather_params.feature_map_host, gather_params.feature_map_device),
                     err, MemAllocFailed);

    float *feature_map_ptr = (float *)gather_params.feature_map_host;

    if (ReadCSVFloatBuffer(FEATURE_MAP_DATA_FILE, assetsDirectory, feature_map_ptr,
                           FEATURE_MAP_WIDTH * FEATURE_MAP_HEIGHT) < 0)
    {
        return 0;
    }

From this feature map we are gathering ROIs and transferring them to VMEM via the cuPVA Sequence DataFlow (SQDF) APIs. The output feature vectors produced by processing the feature map ROIs are transferred to an output buffer (in DRAM) via a second SQDF, while the cuPVA Raster DataFlow (RDF) APIs are used to bring the ROI coordinates into VMEM.

We allocate memory in DRAM to store the top-left and bottom-right (x,y) coordinates of each feature map ROI to be fetched.

    CHECK_ERROR_GOTO(CupvaMemAlloc((void **)&gather_params.coords_d, (NUM_ROIS * ROI_INFO_ENTRY_SIZE * sizeof(float)),
                                   CUPVA_READ_WRITE, CUPVA_ALLOC_DRAM),
                     err, MemAllocFailed);

    float *coords_h;
    CHECK_ERROR_GOTO(CupvaMemGetHostPointer((void **)&coords_h, gather_params.coords_d), err, MemAllocFailed);

    if (ReadCSVFloatBuffer(ROI_BUF_DATA_FILE, assetsDirectory, coords_h, NUM_ROIS * ROI_INFO_ENTRY_SIZE) < 0)
    {
        return 0;
    }

The ROI coordinates have been randomly generated for this example.

We allocate memory in DRAM to store the output feature vectors.

    CHECK_ERROR_GOTO(
        CupvaMemAlloc((void **)&gather_params.pooled_output_d, NUM_ROIS * POOL_WIDTH * POOL_HEIGHT * sizeof(float),
                      CUPVA_READ_WRITE, CUPVA_ALLOC_DRAM),
        err, MemAllocFailed);

    void *pooled_output_h;
    CHECK_ERROR_GOTO(CupvaMemGetHostPointer((void **)&pooled_output_h, gather_params.pooled_output_d), err,
                     MemAllocFailed);

    memset(pooled_output_h, 0, NUM_ROIS * POOL_WIDTH * POOL_HEIGHT * sizeof(float));

Syncpoints and stream objects are created.

    cupvaSyncObj_t postSync;
    CHECK_ERROR_GOTO(CupvaSyncObjCreate(&postSync, true, CUPVA_SIGNALER_WAITER, CUPVA_SYNC_YIELD), err,
                     SyncObjCreateFailed);

    cupvaFence_t postFence;
    CHECK_ERROR_GOTO(CupvaFenceInit(&postFence, postSync), err, StreamCreateFailed);

    cupvaCmd_t cmdFenceReq;
    CHECK_ERROR_GOTO(CupvaCmdRequestFencesInit(&cmdFenceReq, &postFence, 1), err, StreamCreateFailed);

    cupvaStream_t stream;
    CHECK_ERROR_GOTO(CupvaStreamCreate(&stream, CUPVA_PVA0, CUPVA_VPU0), err, StreamCreateFailed);

    cupvaCmdStatus_t cmdstatus[2] = {NULL};

The Executable and CmdProgram objects are created similar to the previous tutorials.

    CHECK_ERROR_GOTO(CupvaExecutableCreate(&ROI_AlignExec, PVA_EXECUTABLE_DATA(decoupled_lookup_unit_dev),
                                           PVA_EXECUTABLE_SIZE(decoupled_lookup_unit_dev)),
                     err, ExecutableCreateFailed);
    CHECK_ERROR_GOTO(CupvaCmdProgramCreate(&ROI_AlignCmdProg, ROI_AlignExec), err, CmdProgramCreateFailed);

RDFs are used to describe pre-determined data transfer patterns. In the following code, the RDF is set up to bring the complete set of coordinates into VMEM in a single transfer.

    cupvaParameter_t roi_coords_hdl, coords_trig_hdl;
    CHECK_ERROR_GOTO(CupvaCmdProgramGetParameter(&ROI_AlignCmdProg, &coords_trig_hdl, "coords_trig"), err,
                     CmdProgramCreateFailed);
    CHECK_ERROR_GOTO(CupvaCmdProgramGetParameter(&ROI_AlignCmdProg, &roi_coords_hdl, "roi_coords"), err,
                     CmdProgramCreateFailed);

    cupvaDataFlow_t coordsInputDF;
    CHECK_ERROR_GOTO(CupvaCmdProgramAddDataFlowHead(&ROI_AlignCmdProg, &coordsInputDF, CUPVA_RASTER_DATAFLOW, 0, 1.0F),
                     err, SyncObjCreateFailed);

    float *roi_coords;
    CHECK_ERROR_GOTO(CupvaParameterGetDevicePointer(&roi_coords_hdl, (void const **)&roi_coords), err,
                     SyncObjCreateFailed);
    cupvaRasterDataFlowParams_t coordsInputDFParams = {};
    coordsInputDFParams.handler                     = &coords_trig_hdl;
    coordsInputDFParams.tileWidth                   = NUM_ROIS * ROI_INFO_ENTRY_SIZE;
    coordsInputDFParams.tileHeight                  = 1;
    coordsInputDFParams.bpp                         = sizeof(float);
    coordsInputDFParams.ptrSrc                      = gather_params.coords_d;
    coordsInputDFParams.srcWidth                    = NUM_ROIS * ROI_INFO_ENTRY_SIZE;
    coordsInputDFParams.srcHeight                   = 1;
    coordsInputDFParams.linePitchSrc                = NUM_ROIS * ROI_INFO_ENTRY_SIZE;
    coordsInputDFParams.ptrTileBuffer               = roi_coords;

    CHECK_ERROR_GOTO(CupvaRasterDataFlowSetParams(coordsInputDF, &coordsInputDFParams), err, SyncObjCreateFailed);

In a real application, the location of the ROIs is not known at compile time. Therefore, we must use the SequenceDataFlow (SQDF) APIs as they facilitate setting src/dst addresses of DMA transfers at runtime. To use the SQDF APIs we first declare the SQDF head.

    cupvaDataFlow_t featureMapDF;
    CHECK_ERROR_GOTO(CupvaCmdProgramAddDataFlowHead(&ROI_AlignCmdProg, &featureMapDF, CUPVA_SEQUENCE_DATAFLOW, 0, 1.0F),
                     err, SyncObjCreateFailed);
    cupvaSequenceDataFlowParams_t featureMapDFParams = {};
    cupvaSequenceDataFlowTransferParams_t featureMapDFTransferParams = {};

We then declare the sequence dataflow handle that the VPU uses to maintain the addresses and trigger the SQDF transfers.

    cupvaParameter_t vpu_cfg_tbl;
    CHECK_ERROR_GOTO(CupvaCmdProgramGetParameter(&ROI_AlignCmdProg, &vpu_cfg_tbl, "vpu_cfg_tbl"), err,
                     SyncObjCreateFailed);
    featureMapDFParams.handler = &vpu_cfg_tbl;

We now set the basic parameters of the SQDF transfer.

    featureMapDFTransferParams.ptrSrc       = gather_params.feature_map_device;
    featureMapDFTransferParams.linePitchSrc = FEATURE_MAP_PITCH * sizeof(float);
    featureMapDFTransferParams.ptrDst       = feature_map_vmem_buf;
    featureMapDFTransferParams.linePitchDst = FEATURE_MAP_PITCH * sizeof(float);
    featureMapDFTransferParams.tileWidth    = FEATURE_MAP_WIDTH * sizeof(float);
    featureMapDFTransferParams.tileHeight   = FEATURE_MAP_HEIGHT;
    featureMapDFTransferParams.transferMode = CUPVA_TRANSFER_MODE_CONTINUOUS;

    CHECK_ERROR_GOTO(CupvaSequenceDataFlowSetParams(featureMapDF, &featureMapDFParams), err, SyncObjCreateFailed);
    CHECK_ERROR_GOTO(CupvaSequenceDataFlowAddTransfer(featureMapDF, &featureMapDFTransferParams), err, SyncObjCreateFailed);

We set the “src” parameter to be the origin of the input feature map in DRAM. The source of the transfer is updated dynamically at runtime via cuPVA SQDF APIs that modify the relevant fields in the SQDF transfer table. In contrast to the prior tutorial, the width and height vary from ROI to ROI. Therefore, in addition to the source address, the tile width, height, and line pitch are also updated at runtime.

We configure an SQDF DataFlow for transferring the output feature vectors from VMEM to DRAM.

    /* Look up the device-side handle the VPU uses to trigger the pooled-output transfer. */
    cupvaParameter_t pooledFeatureMapTrig;
    CHECK_ERROR_GOTO(CupvaCmdProgramGetParameter(&ROI_AlignCmdProg, &pooledFeatureMapTrig, "pooledFeatureMapTrig"), err,
                     SyncObjCreateFailed);

    /* The output path is a sequence dataflow, like the feature-map input path. */
    cupvaDataFlow_t outputDataDF;
    CHECK_ERROR_GOTO(CupvaCmdProgramAddDataFlowHead(&ROI_AlignCmdProg, &outputDataDF, CUPVA_SEQUENCE_DATAFLOW, 0, 1.0F),
                     err, SyncObjCreateFailed);

    cupvaSequenceDataFlowParams_t outputDataDFParams = {};
    outputDataDFParams.handler                     = &pooledFeatureMapTrig;

    /*
     * One tile per batch: ROIS_PER_BATCH lines, each holding the
     * POOL_WIDTH x POOL_HEIGHT pooled floats of one ROI (width given in bytes).
     */
    cupvaSequenceDataFlowTransferParams_t outputDataDFTransferParams = {};
    outputDataDFTransferParams.tileWidth                   = POOL_WIDTH * POOL_HEIGHT * sizeof(float);
    outputDataDFTransferParams.tileHeight                  = ROIS_PER_BATCH;

    /* Source: the same VMEM buffer is used for every batch, so the source does not advance. */
    outputDataDFTransferParams.ptrSrc                      = pooled_roi_vmem_buf;
    outputDataDFTransferParams.niterSrcDim1                = (NUM_ROIS / ROIS_PER_BATCH);
    outputDataDFTransferParams.advSrcDim1                  = 0;
    outputDataDFTransferParams.linePitchSrc                = POOL_WIDTH * POOL_HEIGHT * sizeof(float);

    /*
     * Destination: step forward by one full batch of pooled output per iteration.
     * The advance is expressed in bytes so that it is in the same unit as
     * tileWidth/linePitchDst and matches the C++ variant of this tutorial
     * (tile_offset * sizeof(float)); without the sizeof(float) factor
     * consecutive batches would overlap in the output buffer.
     */
    outputDataDFTransferParams.ptrDst                      = gather_params.pooled_output_d;
    outputDataDFTransferParams.niterDstDim1                = NUM_ROIS / ROIS_PER_BATCH;
    outputDataDFTransferParams.advDstDim1                  = POOL_HEIGHT * POOL_WIDTH * ROIS_PER_BATCH * sizeof(float);
    outputDataDFTransferParams.linePitchDst                = POOL_WIDTH * POOL_HEIGHT * sizeof(float);

    CHECK_ERROR_GOTO(CupvaSequenceDataFlowSetParams(outputDataDF, &outputDataDFParams), err, SyncObjCreateFailed);
    CHECK_ERROR_GOTO(CupvaSequenceDataFlowAddTransfer(outputDataDF, &outputDataDFTransferParams), err, SyncObjCreateFailed);

The dataflows are now ready to be compiled on the host, a required step as mentioned in previous tutorials.

    CHECK_ERROR_GOTO(CupvaCmdProgramCompileDataFlows(&ROI_AlignCmdProg), err, SyncObjCreateFailed);

The program is submitted to the VPU, and we wait for its completion here.

    cupvaCmd_t const *submitCmds[2] = {&ROI_AlignCmdProg, &cmdFenceReq};
    CHECK_ERROR_GOTO(CupvaStreamSubmit(stream, submitCmds, cmdstatus, 2, CUPVA_IN_ORDER, 10000000, 10000000), err,
                     DeAllocateAllResources);

    bool fenceWaitStatus;
    CHECK_ERROR_GOTO(CupvaFenceWait(&postFence, -1, &fenceWaitStatus), err, DeAllocateAllResources);

    cupvaError_t statusCode = {CUPVA_ERROR_NONE};
    CupvaCheckCommandStatus(cmdstatus[0], &statusCode);

We verify the output with the following verification check.

    int errNum = 0;

    float *pooled_output = (float *)pooled_output_h;

    float pooled_output_ref[OUTPUT_BUF_SIZE];
    if (ReadCSVFloatBuffer(OUTPUT_DATA_FILE, assetsDirectory, pooled_output_ref, OUTPUT_BUF_SIZE) < 0)
    {
        return 0;
    }

    for (int num = 0; num < OUTPUT_BUF_SIZE; num++)
    {
        if (fabs(pooled_output[num] - pooled_output_ref[num]) > 0.01)
        {
            errNum = 1;
            printf("\nMismatch at num %d abs(opt-ref) -- %f %f", num, pooled_output[num], pooled_output_ref[num]);
        }
    }
    if (!errNum)
        printf("\nTest Pass\n");
    else
        printf("\nTest Fail\n");

We then delete the allocated resources for cleanup.

/*
 * Cleanup chain: each label is a goto target of CHECK_ERROR_GOTO; control
 * falls through every label below the one jumped to, so a jump to a label
 * runs all cleanup statements from that point to the end of the chain.
 *
 * NOTE(review): the labels are not in strict reverse order of acquisition.
 * Jumping to MemAllocFailed (used by the very first allocation) still runs
 * CupvaCmdDestroy/CupvaExecutableDestroy on objects that have not been
 * created yet at that point, and frees gather_params members that may not
 * have been assigned -- confirm these calls tolerate that, or reorder.
 */
DeAllocateAllResources: /* clean up all allocated resources */
    CupvaStreamDestroy(stream);
StreamCreateFailed: /* clean up resources allocated prior to StreamCreate */
    CupvaSyncObjDestroy(postSync);

SyncObjCreateFailed: /* clean up resources allocated prior to SyncObjCreate */
MemAllocFailed:      /* clean up resources allocated prior to MemAlloc */
    /* Release the three DRAM buffers: feature map, pooled output, ROI coordinates. */
    CupvaMemFree(gather_params.feature_map_device);
    CupvaMemFree(gather_params.pooled_output_d);
    CupvaMemFree(gather_params.coords_d);

    /* NOTE(review): reached from every label above, including paths taken before the CmdProgram was created. */
    CupvaCmdDestroy(&ROI_AlignCmdProg);
CmdProgramCreateFailed: /* clean up resources allocated prior to CmdProgramCreate */
    CupvaExecutableDestroy(ROI_AlignExec);
ExecutableCreateFailed: /* clean up resources allocated prior to ExecutableCreate */

The tutorial code is run on the command-line as follows:
```
./decoupled_lookup_unit_c -a <Tutorial Assets Directory Path>
```
You should see “Test Pass” reported upon successful execution of the code.