dynamic_dataflow.h

Fully qualified name: src/device/vpu_runtime/include/cupva_device/dynamic_dataflow.h

File members: src/device/vpu_runtime/include/cupva_device/dynamic_dataflow.h

/*
 * Copyright (c) 2020-2021 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */

#ifndef CUPVA_DEVICE_DYNAMIC_DATAFLOW_H
#define CUPVA_DEVICE_DYNAMIC_DATAFLOW_H

#include "impl/dma_common.h"

#include <cupva_types.h>

/* Maximum number of nodes per lane (4); the DDF entry points in this file assert against it. */
#define CUPVA_DDF_MAX_NODE_NUM (4U)
/* Maximum number of lanes (8); the DDF entry points in this file assert against it. */
#define CUPVA_DDF_MAX_LANE_NUM (8U)
/*
 * Metadata size selector: DDF_METADATA_SIZE when lanes == 0, DDF_PARALLEL_METADATA_SIZE otherwise.
 * Both helpers are defined outside this file. Note that `lanes` is expanded more than once.
 */
#define DDF_GENERIC_VPUC_METADATA_SIZE(nodesPerLane, lanes) \
    ((lanes) == 0 ? DDF_METADATA_SIZE(nodesPerLane) : DDF_PARALLEL_METADATA_SIZE(nodesPerLane))
/* VPUC table size selector: DDF_VPUC_TBL_SIZE when lanes == 0, DDF_PARALLEL_VPUC_TBL_SIZE otherwise. */
#define DDF_GENERIC_VPUC_SIZE(nodesPerLane, lanes) \
    ((lanes) == 0 ? DDF_VPUC_TBL_SIZE((nodesPerLane)) : DDF_PARALLEL_VPUC_TBL_SIZE((nodesPerLane), (lanes)))
/* Handler size selector: DDF_TBL_SIZE_IMPL when lanes == 0, DDF_PARALLEL_TBL_SIZE_IMPL otherwise. */
#define DDF_GENERIC_TBL_SIZE(nodesPerLane, lanes) \
    ((lanes) == 0 ? DDF_TBL_SIZE_IMPL((nodesPerLane)) : DDF_PARALLEL_TBL_SIZE_IMPL((nodesPerLane), (lanes)))
/*
 * Expands to an argument list (node count, 0 lanes, true) rather than a value, for backwards
 * compatibility with the VMEM macro. NOTE(review): the consumer of the trailing `true` is not
 * visible in this file -- confirm against the VMEM macro definition.
 */
#define DDF_TBL_SIZE(NODE_NUM) (NODE_NUM), 0U, true
/* Parallel counterpart of DDF_TBL_SIZE: expands to the argument list (node count, lane count, true). */
#define DDF_PARALLEL_TBL_SIZE(NODE_NUM, LANE_NUM) (NODE_NUM), (LANE_NUM), true
/*
 * Helper for compatibility with VMEM macros. Declares `name` as a 1-D uint32_t VMEM buffer of
 * DDF_GENERIC_TBL_SIZE words and emits two exports over it:
 *   - _M<name>_export: starts at `name`, covers the metadata words, type VMEM_TYPE_DATA;
 *   - _V<name>_export: starts at name[metadata size], covers the VPUC table words,
 *     type VMEM_TYPE_VPUC_TABLE.
 * The expansion is three statements and already ends with a semicolon.
 */
#define VMEM_DDF_HANDLER(bank, name, nodesPerLane, lanes)                                                        \
    VMEM_1D(bank, uint32_t, name, DDF_GENERIC_TBL_SIZE(nodesPerLane, lanes));                                    \
    CUPVA_EXPORT_IMPL(name, _M##name##_export,                                                                   \
                      (DDF_GENERIC_VPUC_METADATA_SIZE(nodesPerLane, lanes) * sizeof(uint32_t)), VMEM_TYPE_DATA); \
    CUPVA_EXPORT_IMPL(name[DDF_GENERIC_VPUC_METADATA_SIZE(nodesPerLane, lanes)], _V##name##_export,              \
                      (DDF_GENERIC_VPUC_SIZE(nodesPerLane, lanes) * sizeof(uint32_t)), VMEM_TYPE_VPUC_TABLE);

/*
 * Returns true when the DSTM field of a descriptor's DESCR_CNTL word equals DMA_TRANS_MODE_VMEM.
 *
 * ddfPtr   - first word of the descriptor's entries.
 * ddfPitch - word stride between consecutive entries (callers in this file pass 1 or
 *            DDF_SCRATCH_PITCH).
 */
inline bool cupvaDDFIsSrcVmem(uint32_t *ddfPtr, const uint32_t ddfPitch)
{
    // Reject pitches for which the index product below would wrap in 32 bits.
    CUPVA_VPU_ASSERT((0xFFFFFFFFU / (2U * (uint32_t)DDF_DESCR_CNTL)) >= ddfPitch);

    const uint32_t cntlIdx  = 2U * (uint32_t)DDF_DESCR_CNTL * ddfPitch;
    const uint32_t cntlWord = ddfPtr[cntlIdx];
    return (CUPVA_DMA_DESC_EXTRACT(DESCR_CNTL, DSTM, cntlWord) == (uint32_t)DMA_TRANS_MODE_VMEM);
}

/*
 * Returns true when the DDTM field of a descriptor's DESCR_CNTL word equals DMA_TRANS_MODE_VMEM.
 *
 * ddfPtr   - first word of the descriptor's entries.
 * ddfPitch - word stride between consecutive entries (callers in this file pass 1 or
 *            DDF_SCRATCH_PITCH).
 */
inline bool cupvaDDFIsDstVmem(uint32_t *ddfPtr, const uint32_t ddfPitch)
{
    // Reject pitches for which the index product below would wrap in 32 bits.
    CUPVA_VPU_ASSERT((0xFFFFFFFFU / (2U * (uint32_t)DDF_DESCR_CNTL)) >= ddfPitch);

    const uint32_t cntlIdx  = 2U * (uint32_t)DDF_DESCR_CNTL * ddfPitch;
    const uint32_t cntlWord = ddfPtr[cntlIdx];
    return (CUPVA_DMA_DESC_EXTRACT(DESCR_CNTL, DDTM, cntlWord) == (uint32_t)DMA_TRANS_MODE_VMEM);
}

/*
 * Returns a pointer to the first entry word of node `nodeIdx` in the pitched (2D) table layout.
 *
 * Each node occupies 2 * DDF_ENTRY_NUM_PER_DESC entries after DDF_ENTRY_HEAD; entries are
 * DDF_SCRATCH_PITCH words apart and the table starts DDF_HIDDEN_HEAD words past ddfBasePtr.
 */
inline uint32_t *cupvaDDFGetRawPtr2D(uint32_t *ddfBasePtr, uint8_t nodeIdx)
{
    const uint32_t firstEntry = DDF_ENTRY_HEAD + (2U * (uint32_t)nodeIdx * DDF_ENTRY_NUM_PER_DESC);
    const uint32_t wordIdx    = (firstEntry * DDF_SCRATCH_PITCH) + DDF_HIDDEN_HEAD;
    return ddfBasePtr + wordIdx;
}

inline uint32_t *cupvaDDFGetRawPtr1D(uint32_t *ddfBasePtr, uint8_t nodeIdx)
{
    uint32_t offset = POINTER_CAST(DDFTable, ddfBasePtr)->ddfHeaders.ddfTblInternalOfst;
    uint32_t idx    = (DDF_ENTRY_HEAD + (2U * (uint32_t)nodeIdx * DDF_ENTRY_NUM_PER_DESC)) + DDF_HIDDEN_HEAD;
    return &ddfBasePtr[idx + offset];
}

inline void cupvaDDFUpdateSrcAddr(uint32_t *ddfBasePtr, uint64_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *srcAddrLoPtr = ddfPtr->ddfPayload[nodeIdx].ddfSrcAddr + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    uint32_t *descCntlPtr  = ddfPtr->ddfPayload[nodeIdx].ddfDescrCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    uint32_t *transCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTransCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    *srcAddrLoPtr          = (uint32_t)(newVal & 0xFFFFFFFFUL);

    uint32_t newDescCntl = *descCntlPtr;
    newDescCntl &= (CUPVA_DESC_SRC_ADDR1_CLEAR_MASK & CUPVA_DESC_SRC_TF_CLEAR_MASK);
    newDescCntl |= (((uint32_t)(newVal >> 32) & CUPVA_DESC_SRC_ADDR1_MASK) << CUPVA_DESC_SRC_ADDR1_SHIFT);
    uint32_t srcTf = ((uint32_t)(newVal >> 40) & CUPVA_DESC_SRC_TF_MASK);
    newDescCntl |= (srcTf << CUPVA_DESC_SRC_TF_SHIFT);
    *descCntlPtr = newDescCntl;

    uint32_t newTransCntl          = *transCntlPtr;
    uint32_t transCntlUpdatedSbadr = newTransCntl & CUPVA_DESC_SBADR_CLEAR_MASK;
    transCntlUpdatedSbadr |= (((uint32_t)(newVal >> 41) & CUPVA_DESC_SBADR_MASK) << CUPVA_DESC_SBADR_SHIFT);
    newTransCntl  = (srcTf > 0U) ? transCntlUpdatedSbadr : newTransCntl;
    *transCntlPtr = newTransCntl;
}

inline void cupvaDDFUpdateSrcAddr(uint32_t *ddfBasePtr, vintx const &newSrcAddrLo, vintx const &newSrcAddrHi,
                                  uint8_t nodeIdx)
{
    vint *vSrcAddrLoPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfSrcAddr));
    vint *vDescCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfDescrCntl));
    vint *vTransCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTransCntl));

    (void)(*vSrcAddrLoPtr = extract(newSrcAddrLo));

    vintx vNewDescCntl = sign_extend(*vDescCntlPtr);
    vNewDescCntl       = vNewDescCntl & replicatew(CUPVA_DESC_SRC_ADDR1_CLEAR_MASK & CUPVA_DESC_SRC_TF_CLEAR_MASK);
    vNewDescCntl  = vNewDescCntl | (newSrcAddrHi & replicatew(CUPVA_DESC_SRC_ADDR1_MASK)) << CUPVA_DESC_SRC_ADDR1_SHIFT;
    vintx vsrcTf  = (newSrcAddrHi >> 8) & replicatew(CUPVA_DESC_SRC_TF_MASK);
    vNewDescCntl  = vNewDescCntl | (vsrcTf << CUPVA_DESC_SRC_TF_SHIFT);
    *vDescCntlPtr = extract(vNewDescCntl);

    vintx vNewTransCntl          = sign_extend(*vTransCntlPtr);
    vintx vTransCntlUpdatedSbadr = vNewTransCntl & replicatew(CUPVA_DESC_SBADR_CLEAR_MASK);
    vTransCntlUpdatedSbadr =
        vTransCntlUpdatedSbadr | (((newSrcAddrHi >> 9) & replicatew(CUPVA_DESC_SBADR_MASK)) << CUPVA_DESC_SBADR_SHIFT);

    vNewTransCntl  = vmux(vsrcTf, vTransCntlUpdatedSbadr, vNewTransCntl);
    *vTransCntlPtr = extract(vNewTransCntl);
}

inline void cupvaDDFUpdateDstAddr(uint32_t *ddfBasePtr, uint64_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *dstAddrLoPtr = ddfPtr->ddfPayload[nodeIdx].ddfDstAddr + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    uint32_t *descCntlPtr  = ddfPtr->ddfPayload[nodeIdx].ddfDescrCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    uint32_t *transCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTransCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    *dstAddrLoPtr          = (uint32_t)(newVal & 0xFFFFFFFFUL);

    uint32_t newDescCntl = *descCntlPtr;
    newDescCntl &= (CUPVA_DESC_DST_ADDR1_CLEAR_MASK & CUPVA_DESC_DST_TF_CLEAR_MASK);
    newDescCntl |= (((uint32_t)(newVal >> 32) & CUPVA_DESC_DST_ADDR1_MASK) << CUPVA_DESC_DST_ADDR1_SHIFT);
    uint32_t dstTf = ((uint32_t)(newVal >> 40) & CUPVA_DESC_DST_TF_MASK);
    newDescCntl |= (dstTf << CUPVA_DESC_DST_TF_SHIFT);
    *descCntlPtr = newDescCntl;

    uint32_t newTransCntl          = *transCntlPtr;
    uint32_t transCntlUpdatedSbadr = newTransCntl & CUPVA_DESC_SBADR_CLEAR_MASK;
    transCntlUpdatedSbadr |= (((uint32_t)(newVal >> 41) & CUPVA_DESC_SBADR_MASK) << CUPVA_DESC_SBADR_SHIFT);
    newTransCntl  = (dstTf > 0U) ? transCntlUpdatedSbadr : newTransCntl;
    *transCntlPtr = newTransCntl;
}

inline void cupvaDDFUpdateDstAddr(uint32_t *ddfBasePtr, vintx const &newDstAddrLo, vintx const &newDstAddrHi,
                                  uint8_t nodeIdx)
{
    vint *vDstAddrLoPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfDstAddr));
    vint *vDescCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfDescrCntl));
    vint *vTransCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTransCntl));

    (void)(*vDstAddrLoPtr = extract(newDstAddrLo));

    vintx vNewDescCntl = sign_extend(*vDescCntlPtr);
    vNewDescCntl       = vNewDescCntl & replicatew(CUPVA_DESC_DST_ADDR1_CLEAR_MASK & CUPVA_DESC_DST_TF_CLEAR_MASK);
    vNewDescCntl =
        vNewDescCntl | ((newDstAddrHi & replicatew(CUPVA_DESC_DST_ADDR1_MASK)) << CUPVA_DESC_DST_ADDR1_SHIFT);
    vintx vDstTf  = (newDstAddrHi >> 8) & replicatew(CUPVA_DESC_DST_TF_MASK);
    vNewDescCntl  = vNewDescCntl | (vDstTf << CUPVA_DESC_DST_TF_SHIFT);
    *vDescCntlPtr = extract(vNewDescCntl);

    vintx vNewTransCntl          = sign_extend(*vTransCntlPtr);
    vintx vTransCntlUpdatedSbadr = vNewTransCntl & replicatew(CUPVA_DESC_SBADR_CLEAR_MASK);
    vTransCntlUpdatedSbadr =
        vTransCntlUpdatedSbadr | (((newDstAddrHi >> 9) & replicatew(CUPVA_DESC_SBADR_MASK)) << CUPVA_DESC_SBADR_SHIFT);

    vNewTransCntl  = vmux(vDstTf, vTransCntlUpdatedSbadr, vNewTransCntl);
    *vTransCntlPtr = extract(vNewTransCntl);
}

inline void cupvaDDFUpdateTx(uint32_t *ddfBasePtr, uint16_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr      = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *tileCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTileCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*tileCntlPtr = (*tileCntlPtr & CUPVA_DESC_TX_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_TX_SHIFT));
}

inline void cupvaDDFUpdateTx(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTileCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTileCntl));
    (void)(*vTileCntlPtr = extract((sign_extend(*vTileCntlPtr) & replicatew(CUPVA_DESC_TX_CLEAR_MASK)) |
                                   (newVal << CUPVA_DESC_TX_SHIFT)));
}

inline void cupvaDDFUpdateTy(uint32_t *ddfBasePtr, uint16_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr      = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *tileCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTileCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*tileCntlPtr = (*tileCntlPtr & CUPVA_DESC_TY_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_TY_SHIFT));
}

inline void cupvaDDFUpdateTy(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTileCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTileCntl));
    (void)(*vTileCntlPtr = extract((sign_extend(*vTileCntlPtr) & replicatew(CUPVA_DESC_TY_CLEAR_MASK)) |
                                   (newVal << CUPVA_DESC_TY_SHIFT)));
}

inline void cupvaDDFUpdateLdid(uint32_t *ddfBasePtr, uint8_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *descrCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfDescrCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*descrCntlPtr =
               (*descrCntlPtr & CUPVA_DESC_LINK_DID_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_LINK_DID_SHIFT));
}

inline void cupvaDDFUpdateLdid(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vDescrCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfDescrCntl));
    (void)(*vDescrCntlPtr = extract((sign_extend(*vDescrCntlPtr) & replicatew(CUPVA_DESC_LINK_DID_CLEAR_MASK)) |
                                    (newVal << CUPVA_DESC_LINK_DID_SHIFT)));
}

inline void cupvaDDFUpdateDstm(uint32_t *ddfBasePtr, uint8_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *descrCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfDescrCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*descrCntlPtr = (*descrCntlPtr & CUPVA_DESC_DSTM_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_DSTM_SHIFT));
}

inline void cupvaDDFUpdateDstm(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vDescrCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfDescrCntl));
    (void)(*vDescrCntlPtr = extract((sign_extend(*vDescrCntlPtr) & replicatew(CUPVA_DESC_DSTM_CLEAR_MASK)) |
                                    (newVal << CUPVA_DESC_DSTM_SHIFT)));
}

inline void cupvaDDFUpdateDdtm(uint32_t *ddfBasePtr, uint8_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *descrCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfDescrCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*descrCntlPtr = (*descrCntlPtr & CUPVA_DESC_DDTM_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_DDTM_SHIFT));
}

inline void cupvaDDFUpdateDdtm(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vDescrCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfDescrCntl));
    (void)(*vDescrCntlPtr = extract((sign_extend(*vDescrCntlPtr) & replicatew(CUPVA_DESC_DDTM_CLEAR_MASK)) |
                                    (newVal << CUPVA_DESC_DDTM_SHIFT)));
}

inline void cupvaDDFUpdatePx(uint32_t *ddfBasePtr, uint8_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *transCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTransCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*transCntlPtr = (*transCntlPtr & CUPVA_DESC_PX_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_PX_SHIFT));
}

inline void cupvaDDFUpdatePx(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTransCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTransCntl));
    (void)(*vTransCntlPtr = extract((sign_extend(*vTransCntlPtr) & replicatew(CUPVA_DESC_PX_CLEAR_MASK)) |
                                    (newVal << CUPVA_DESC_PX_SHIFT)));
}

inline void cupvaDDFUpdatePy(uint32_t *ddfBasePtr, uint8_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *transCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTransCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*transCntlPtr = (*transCntlPtr & CUPVA_DESC_PY_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_PY_SHIFT));
}

inline void cupvaDDFUpdatePy(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTransCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTransCntl));
    (void)(*vTransCntlPtr = extract((sign_extend(*vTransCntlPtr) & replicatew(CUPVA_DESC_PY_CLEAR_MASK)) |
                                    (newVal << CUPVA_DESC_PY_SHIFT)));
}

inline void cupvaDDFUpdatePxdir(uint32_t *ddfBasePtr, uint8_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *transCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTransCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*transCntlPtr =
               (*transCntlPtr & CUPVA_DESC_PXDIR_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_PXDIR_SHIFT));
}

inline void cupvaDDFUpdatePxdir(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTransCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTransCntl));
    (void)(*vTransCntlPtr = extract((sign_extend(*vTransCntlPtr) & replicatew(CUPVA_DESC_PXDIR_CLEAR_MASK)) |
                                    (newVal << CUPVA_DESC_PXDIR_SHIFT)));
}

inline void cupvaDDFUpdatePydir(uint32_t *ddfBasePtr, uint8_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *transCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTransCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*transCntlPtr =
               (*transCntlPtr & CUPVA_DESC_PYDIR_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_PYDIR_SHIFT));
}

inline void cupvaDDFUpdatePydir(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTransCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTransCntl));
    (void)(*vTransCntlPtr = extract((sign_extend(*vTransCntlPtr) & replicatew(CUPVA_DESC_PYDIR_CLEAR_MASK)) |
                                    (newVal << CUPVA_DESC_PYDIR_SHIFT)));
}

inline void cupvaDDFUpdateItc(uint32_t *ddfBasePtr, uint8_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *transCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTransCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*transCntlPtr = (*transCntlPtr & CUPVA_DESC_ITC_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_ITC_SHIFT));
}

inline void cupvaDDFUpdateItc(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTransCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTransCntl));
    (void)(*vTransCntlPtr = extract((sign_extend(*vTransCntlPtr) & replicatew(CUPVA_DESC_ITC_CLEAR_MASK)) |
                                    (newVal << CUPVA_DESC_ITC_SHIFT)));
}

inline void cupvaDDFUpdateLpSrc(uint32_t *ddfBasePtr, uint16_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr    = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *lpCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfLpCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*lpCntlPtr = (*lpCntlPtr & CUPVA_DESC_SRC_LP_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_SRC_LP_SHIFT));
}

inline void cupvaDDFUpdateLpSrc(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vLpCntlPtr = POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfLpCntl));
    (void)(*vLpCntlPtr = extract((sign_extend(*vLpCntlPtr) & replicatew(CUPVA_DESC_SRC_LP_CLEAR_MASK)) |
                                 (newVal << CUPVA_DESC_SRC_LP_SHIFT)));
}

inline void cupvaDDFUpdateLpDst(uint32_t *ddfBasePtr, uint16_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr    = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *lpCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfLpCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*lpCntlPtr = (*lpCntlPtr & CUPVA_DESC_DST_LP_CLEAR_MASK) | ((uint32_t)newVal << CUPVA_DESC_DST_LP_SHIFT));
}

inline void cupvaDDFUpdateLpDst(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vLpCntlPtr = POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfLpCntl));
    (void)(*vLpCntlPtr = extract((sign_extend(*vLpCntlPtr) & replicatew(CUPVA_DESC_DST_LP_CLEAR_MASK)) |
                                 (newVal << CUPVA_DESC_DST_LP_SHIFT)));
}

inline void cupvaDDFUpdateLpCntl(uint32_t *ddfBasePtr, uint32_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr    = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *lpCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfLpCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*lpCntlPtr = newVal);
}

inline void cupvaDDFUpdateLpCntl(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vLpCntlPtr = POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfLpCntl));
    (void)(*vLpCntlPtr = extract(newVal));
}

inline void cupvaDDFUpdateTransCntl(uint32_t *ddfBasePtr, uint32_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr       = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *transCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTransCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*transCntlPtr = newVal);
}

inline void cupvaDDFUpdateTransCntl(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTransCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTransCntl));
    (void)(*vTransCntlPtr = extract(newVal));
}

inline void cupvaDDFUpdateTileCntl(uint32_t *ddfBasePtr, uint32_t newVal, uint8_t nodeIdx)
{
    DDFTable *ddfPtr      = POINTER_CAST(DDFTable, ddfBasePtr);
    uint32_t *tileCntlPtr = ddfPtr->ddfPayload[nodeIdx].ddfTileCntl + ddfPtr->ddfHeaders.ddfTblInternalOfst;
    (void)(*tileCntlPtr = newVal);
}

inline void cupvaDDFUpdateTileCntl(uint32_t *ddfBasePtr, vintx const &newVal, uint8_t nodeIdx)
{
    vint *vTileCntlPtr =
        POINTER_CAST(vint, (POINTER_CAST(DDFParallelTable, ddfBasePtr)->ddfPayload[nodeIdx].ddfTileCntl));
    (void)(*vTileCntlPtr = extract(newVal));
}

/*
 * Adds the value returned by cupvaGetDmaDescBase() to every odd-numbered entry slot of the
 * payload, for each lane.
 *
 * ddfPayloadPtr  - first payload entry word.
 * nodeNumPerLane - nodes per lane; asserted to be at most CUPVA_DDF_MAX_NODE_NUM.
 * laneNum        - lane count; asserted to be at most CUPVA_DDF_MAX_LANE_NUM.
 *
 * Entries are 1 word apart when laneNum == 1 and DDF_SCRATCH_PITCH words apart otherwise; the
 * lane index selects the word within an entry.
 */
inline void cupvaDDFUpdateDmaBase(uint32_t *ddfPayloadPtr, const uint8_t nodeNumPerLane, const uint8_t laneNum)
{
    CUPVA_VPU_ASSERT(nodeNumPerLane <= CUPVA_DDF_MAX_NODE_NUM);
    CUPVA_VPU_ASSERT(laneNum <= CUPVA_DDF_MAX_LANE_NUM);

    const uint16_t entryCount = (uint16_t)nodeNumPerLane * DDF_ENTRY_NUM_PER_DESC;
    const uint32_t descBase   = cupvaGetDmaDescBase();
    const uint8_t pitch       = (laneNum == 1U) ? 1U : DDF_SCRATCH_PITCH;

    for (uint8_t lane = 0U; lane < laneNum; lane++)
    {
        for (uint16_t entry = 0U; entry < entryCount; entry++)
        {
            // Slot 2*entry + 1 is the odd member of the entry pair.
            const uint32_t oddSlot = (2U * (uint32_t)entry) + 1U;
            const uint32_t wordIdx = (oddSlot * (uint32_t)pitch) + (uint32_t)lane;
            ddfPayloadPtr[wordIdx] += descBase;
        }
    }
}

/*
 * Prepares a (non-parallel) DDF table for use.
 *
 * Steps, in order:
 *   1. Copies DDF_VPUC_TBL_SIZE(nodeNumPerLane) words starting at ddfBasePtr[DDF_HIDDEN_HEAD]
 *      to the region immediately following them.
 *   2. Records that displacement in the header's ddfTblInternalOfst.
 *   3. Adds the DMA descriptor base to the copy via cupvaDDFUpdateDmaBase().
 *   4. For every node, rebases the source and destination addresses in the copy: VMEM-mode
 *      ends get vmemBufAddr, the others get extBufSymbol.base + extBufSymbol.offset.
 *
 * ddfBasePtr   - table base; must not be NULL. Its header supplies nodeNumPerLane and laneNum.
 * extBufSymbol - external buffer descriptor (base + offset).
 * vmemBufAddr  - VMEM buffer address.
 */
inline void cupvaDDFOpen(uint32_t *ddfBasePtr, ExtMemPointer extBufSymbol, uint32_t vmemBufAddr)
{
    CUPVA_VPU_ASSERT(ddfBasePtr != NULL);
    DDFParams *ddfHeaderPtr = POINTER_CAST(DDFParams, ddfBasePtr);
    CUPVA_VPU_ASSERT(ddfHeaderPtr->nodeNumPerLane <= CUPVA_DDF_MAX_NODE_NUM);
    CUPVA_VPU_ASSERT(ddfHeaderPtr->laneNum <= CUPVA_DDF_MAX_LANE_NUM);
    uint8_t laneNum        = ddfHeaderPtr->laneNum;
    uint8_t nodeNumPerLane = ddfHeaderPtr->nodeNumPerLane;
    uint64_t extBufAddr    = extBufSymbol.base + (uint64_t)extBufSymbol.offset;

    // Copy from param area to VPUC area
    // The copy moves whole dvint vectors; niter is the table size expressed in dvint units.
    // NOTE(review): assumes DDF_VPUC_TBL_SIZE is a multiple of pva_elementsof(dvint) -- confirm.
    dvint *srcPtr               = POINTER_CAST(dvint, &ddfBasePtr[DDF_HIDDEN_HEAD]);
    uint32_t const offsetToVPUC = DDF_VPUC_TBL_SIZE(nodeNumPerLane);
    uint32_t const niter        = offsetToVPUC / pva_elementsof(dvint);
    dvint *dstPtr               = srcPtr + niter;
    uint32_t *dstPtrInt         = POINTER_CAST(uint32_t, dstPtr);
    // Payload entries of the copy start DDF_ENTRY_HEAD words into it.
    uint32_t *ddfPayloadPtr     = &dstPtrInt[DDF_ENTRY_HEAD];
    for (uint32_t i = 0U; i < niter; i++) chess_loop_range(1, )
    chess_prepare_for_pipelining
    {
        dstPtr[i] = srcPtr[i];
    }

    // Internal offset to the real VPUC table
    ddfHeaderPtr->ddfTblInternalOfst = offsetToVPUC;

    cupvaDDFUpdateDmaBase(ddfPayloadPtr, nodeNumPerLane, laneNum);
    for (uint8_t i = 0U; i < nodeNumPerLane; i++)
    {
        // Each node spans 2 * DDF_ENTRY_NUM_PER_DESC consecutive words (pitch 1).
        uint32_t ddfBaseIdx = (uint32_t)i * 2U * DDF_ENTRY_NUM_PER_DESC;
        uint32_t *ddfPtr    = &ddfPayloadPtr[ddfBaseIdx];
        bool isSrcVmem      = cupvaDDFIsSrcVmem(ddfPtr, 1);
        bool isDstVmem      = cupvaDDFIsDstVmem(ddfPtr, 1);

        // vmem<->vmem or MC<->L2 transfer not supported
        uint64_t srcBase = isSrcVmem ? vmemBufAddr : extBufAddr;
        uint64_t dstBase = isDstVmem ? vmemBufAddr : extBufAddr;
        // The low 32 bits of each base must be addable to the stored address without wrapping.
        CUPVA_VPU_ASSERT((0xFFFFFFFFU - ddfPtr[2 * (int32_t)DDF_SRC_ADDR]) >= (uint32_t)(srcBase & 0xffffffffU));
        CUPVA_VPU_ASSERT((0xFFFFFFFFU - ddfPtr[2 * (int32_t)DDF_DST_ADDR]) >= (uint32_t)(dstBase & 0xffffffffU));
        ddfPtr[2 * (int32_t)DDF_SRC_ADDR] += (uint32_t)(srcBase & 0xffffffffU);
        ddfPtr[2 * (int32_t)DDF_DST_ADDR] += (uint32_t)(dstBase & 0xffffffffU);
        // Bits 32..39 of the source base are added at bit 16 of DESCR_CNTL, and bits 32..39 of
        // the destination base at bit 24.
        ddfPtr[2 * (int32_t)DDF_DESCR_CNTL] += (uint32_t)(((srcBase >> 32) & 0xffU) << 16);
        ddfPtr[2 * (int32_t)DDF_DESCR_CNTL] += (uint32_t)(((dstBase >> 32) & 0xffU) << 24);
    }
}

/*
 * Prepares a parallel (multi-lane) DDF table for use.
 *
 * Adds the DMA descriptor base via cupvaDDFUpdateDmaBase(), then for every lane and node
 * rebases the source and destination addresses in place: VMEM-mode ends get vmemBufAddr, the
 * others get extBufSymbol.base + extBufSymbol.offset. Entries are DDF_SCRATCH_PITCH words apart
 * and the lane index selects the word within an entry.
 *
 * ddfBasePtr   - table base; must not be NULL. Its header supplies nodeNumPerLane and laneNum.
 * extBufSymbol - external buffer descriptor (base + offset).
 * vmemBufAddr  - VMEM buffer address.
 */
inline void cupvaDDFParallelOpen(uint32_t *ddfBasePtr, ExtMemPointer extBufSymbol, uint32_t vmemBufAddr)
{
    CUPVA_VPU_ASSERT(ddfBasePtr != NULL);
    DDFParams *ddfHeaderPtr = POINTER_CAST(DDFParams, ddfBasePtr);
    CUPVA_VPU_ASSERT(ddfHeaderPtr->nodeNumPerLane <= CUPVA_DDF_MAX_NODE_NUM);
    CUPVA_VPU_ASSERT(ddfHeaderPtr->laneNum <= CUPVA_DDF_MAX_LANE_NUM);

    const uint8_t laneNum        = ddfHeaderPtr->laneNum;
    const uint8_t nodeNumPerLane = ddfHeaderPtr->nodeNumPerLane;
    uint32_t *const payload      = &ddfBasePtr[DDF_HIDDEN_HEAD + (DDF_ENTRY_HEAD * DDF_SCRATCH_PITCH)];
    const uint64_t extBufAddr    = extBufSymbol.base + (uint64_t)extBufSymbol.offset;

    cupvaDDFUpdateDmaBase(payload, nodeNumPerLane, laneNum);

    for (uint8_t lane = 0U; lane < laneNum; lane++)
    {
        for (uint8_t node = 0U; node < nodeNumPerLane; node++)
        {
            const uint32_t nodeOfst = ((uint32_t)node * 2U * DDF_ENTRY_NUM_PER_DESC * DDF_SCRATCH_PITCH) + (uint32_t)lane;
            uint32_t *ddfPtr        = &payload[nodeOfst];
            const bool srcIsVmem    = cupvaDDFIsSrcVmem(ddfPtr, DDF_SCRATCH_PITCH);
            const bool dstIsVmem    = cupvaDDFIsDstVmem(ddfPtr, DDF_SCRATCH_PITCH);

            // vmem<->vmem or MC<->L2 transfer not supported
            uint64_t srcBase = srcIsVmem ? vmemBufAddr : extBufAddr;
            uint64_t dstBase = dstIsVmem ? vmemBufAddr : extBufAddr;

            // The low 32 bits of each base must be addable to the stored address without wrapping.
            CUPVA_VPU_ASSERT((0xFFFFFFFFU - ddfPtr[2U * (uint32_t)DDF_SRC_ADDR * DDF_SCRATCH_PITCH]) >=
                             (uint32_t)(srcBase & 0xffffffffU));
            CUPVA_VPU_ASSERT((0xFFFFFFFFU - ddfPtr[2U * (uint32_t)DDF_DST_ADDR * DDF_SCRATCH_PITCH]) >=
                             (uint32_t)(dstBase & 0xffffffffU));

            const uint32_t srcLo = (uint32_t)(srcBase & 0xffffffffU);
            const uint32_t dstLo = (uint32_t)(dstBase & 0xffffffffU);
            // Bits 32..39 of each base, positioned at bit 16 (source) and bit 24 (destination).
            const uint32_t srcHi = (uint32_t)(((srcBase >> 32) & 0xffU) << 16);
            const uint32_t dstHi = (uint32_t)(((dstBase >> 32) & 0xffU) << 24);

            ddfPtr[2U * (uint32_t)DDF_SRC_ADDR * DDF_SCRATCH_PITCH] += srcLo;
            ddfPtr[2U * (uint32_t)DDF_DST_ADDR * DDF_SCRATCH_PITCH] += dstLo;
            ddfPtr[2U * (uint32_t)DDF_DESCR_CNTL * DDF_SCRATCH_PITCH] += srcHi;
            ddfPtr[2U * (uint32_t)DDF_DESCR_CNTL * DDF_SCRATCH_PITCH] += dstHi;
        }
    }
}

/*
 * Copies the pitched payload at ddfPayloadPtr into the table at ddfVpucTblPtr, one lane after
 * another, using a transposing vector load and a vector store driven by two address generators.
 *
 * ddfPayloadPtr  - source (read with stride vecw * DDF_SCRATCH_PITCH, then stride 1).
 * ddfVpucTblPtr  - destination (written with stride vecw, then stride singleTblLen).
 * nodeNumPerLane - nodes per lane; asserted to be at most CUPVA_DDF_MAX_NODE_NUM.
 * laneNum        - lane count; asserted to be at most CUPVA_DDF_MAX_LANE_NUM.
 *
 * NOTE(review): the exact semantics of INIT_AGEN2, lane_ofst and dvuint_load_transp are defined
 * outside this file; the description above follows the stride values only.
 */
inline void cupvaDDFFlush(uint32_t *ddfPayloadPtr, uint32_t *ddfVpucTblPtr, const uint8_t nodeNumPerLane,
                          const uint8_t laneNum)
{
    agen configIn;
    agen configOut;

    CUPVA_VPU_ASSERT(nodeNumPerLane <= CUPVA_DDF_MAX_NODE_NUM);
    CUPVA_VPU_ASSERT(laneNum <= CUPVA_DDF_MAX_LANE_NUM);
    AgenWrapper wrapper;
    wrapper.size = (int32_t)sizeof(int32_t);

    // Per-lane table length: head plus two words per entry, rounded up to a multiple of vecw.
    int32_t vecw = 16;
    int32_t singleTblLen =
        ((int32_t)DDF_ENTRY_HEAD + (2 * ((int32_t)nodeNumPerLane * (int32_t)DDF_ENTRY_NUM_PER_DESC)) + vecw - 1) /
        vecw * vecw;
    int32_t num_vecs = singleTblLen / vecw;
    int32_t niter1   = num_vecs;
    int32_t niter2   = (int32_t)laneNum;
    // Total iterations: vectors per lane times number of lanes.
    uint32_t niter   = (uint32_t)niter1 * (uint32_t)niter2;

    // Input side: inner step of vecw pitched entries, outer step of one word.
    configIn   = init(POINTER_CAST(vuint, ddfPayloadPtr));
    wrapper.n1 = (uint16_t)niter1;
    wrapper.n2 = (uint16_t)niter2;
    wrapper.s1 = (int32_t)vecw * (int32_t)DDF_SCRATCH_PITCH;
    wrapper.s2 = 1;
    INIT_AGEN2(configIn, wrapper);
    configIn.lane_ofst = 1;

    // Output side: inner step of vecw words, outer step of one per-lane table.
    configOut  = init(POINTER_CAST(vuint, ddfVpucTblPtr));
    wrapper.n1 = (uint16_t)niter1;
    wrapper.n2 = (uint16_t)niter2;
    wrapper.s1 = vecw;
    wrapper.s2 = singleTblLen;
    CUPVA_VPU_ASSERT(wrapper.s2 >= 0);
    INIT_AGEN2(configOut, wrapper);

    for (uint32_t i = 0U; i < niter; i++) chess_prepare_for_pipelining
    {
        dvintx v1 = dvuint_load_transp(configIn);
        vstore(v1, configOut);
    }
}

/*
 * Starts the dataflow identified by the `trigger` value stored in the table header.
 *
 * A chess_memory_fence() is issued between reading the header and calling cupvaDataFlowTrig()
 * -- presumably so earlier table writes are not reordered past the trigger; confirm against
 * the Chess compiler documentation.
 */
inline void cupvaDDFTrig(uint32_t *ddfBasePtr)
{
    const uint32_t trig = POINTER_CAST(DDFParams, ddfBasePtr)->trigger;

    chess_memory_fence();
    cupvaDataFlowTrig(trig);
}

/*
 * Flushes the parallel payload into the table at ddfTblInternalOfst and then starts the
 * dataflow identified by the header's `trigger` value.
 *
 * ddfBasePtr must not be NULL; the header's nodeNumPerLane and laneNum are asserted against
 * CUPVA_DDF_MAX_NODE_NUM and CUPVA_DDF_MAX_LANE_NUM. A chess_memory_fence() separates the flush
 * from the trigger.
 */
inline void cupvaDDFParallelTrig(uint32_t *ddfBasePtr)
{
    CUPVA_VPU_ASSERT(ddfBasePtr != NULL);
    DDFParams *ddfHeaderPtr = POINTER_CAST(DDFParams, ddfBasePtr);
    CUPVA_VPU_ASSERT(ddfHeaderPtr->nodeNumPerLane <= CUPVA_DDF_MAX_NODE_NUM);
    CUPVA_VPU_ASSERT(ddfHeaderPtr->laneNum <= CUPVA_DDF_MAX_LANE_NUM);

    // Read every header field up front, before the flush touches the table.
    const uint8_t lanes     = ddfHeaderPtr->laneNum;
    const uint8_t nodes     = ddfHeaderPtr->nodeNumPerLane;
    const uint32_t tblOfst  = ddfHeaderPtr->ddfTblInternalOfst;
    const uint32_t trig     = ddfHeaderPtr->trigger;
    uint32_t *const payload = &ddfBasePtr[DDF_HIDDEN_HEAD];
    uint32_t *const vpucTbl = &ddfBasePtr[tblOfst];

    cupvaDDFFlush(payload, vpucTbl, nodes, lanes);
    chess_memory_fence();
    cupvaDataFlowTrig(trig);
}

/* Calls cupvaDataFlowSync() with the `trigger` value stored in the table header. */
inline void cupvaDDFSync(uint32_t *ddfBasePtr)
{
    const uint32_t trig = POINTER_CAST(DDFParams, ddfBasePtr)->trigger;

    cupvaDataFlowSync(trig);
}

/*
 * Parallel-table counterpart of cupvaDDFSync(): calls cupvaDataFlowSync() with the `trigger`
 * value stored in the table header.
 */
inline void cupvaDDFParallelSync(uint32_t *ddfBasePtr)
{
    const uint32_t trig = POINTER_CAST(DDFParams, ddfBasePtr)->trigger;

    cupvaDataFlowSync(trig);
}

inline void cupvaDDFClose(uint32_t *ddfBasePtr)
{
    DDFParams *ddfHeaderPtr = POINTER_CAST(DDFParams, ddfBasePtr);
    uint32_t trigger        = ddfHeaderPtr->trigger;

    cupvaDDFUpdateTx(ddfBasePtr, 0U, 0U);
    cupvaDDFUpdateTy(ddfBasePtr, 0U, 0U);
    cupvaDDFUpdateLdid(ddfBasePtr, 0U, 0U);

    cupvaDataFlowTrig(trigger);
    cupvaDataFlowSync(trigger);
}

inline void cupvaDDFParallelClose(uint32_t *ddfBasePtr)
{
    CUPVA_VPU_ASSERT(ddfBasePtr != NULL);
    DDFParams *ddfHeaderPtr = POINTER_CAST(DDFParams, ddfBasePtr);
    CUPVA_VPU_ASSERT(ddfHeaderPtr->nodeNumPerLane <= CUPVA_DDF_MAX_NODE_NUM);
    CUPVA_VPU_ASSERT(ddfHeaderPtr->laneNum <= CUPVA_DDF_MAX_LANE_NUM);
    uint32_t trigger            = ddfHeaderPtr->trigger;
    uint8_t laneNum             = ddfHeaderPtr->laneNum;
    uint8_t nodeNumPerLane      = ddfHeaderPtr->nodeNumPerLane;
    uint32_t ddfTblInternalOfst = ddfHeaderPtr->ddfTblInternalOfst;

    vintx vzero = replicatew(0);
    for (uint8_t nodeIdx = 0U; nodeIdx < nodeNumPerLane; nodeIdx++)
    {
        cupvaDDFUpdateTx(ddfBasePtr, vzero, nodeIdx);
        cupvaDDFUpdateTy(ddfBasePtr, vzero, nodeIdx);
        cupvaDDFUpdateLdid(ddfBasePtr, vzero, nodeIdx);
        cupvaDDFFlush(&ddfBasePtr[DDF_HIDDEN_HEAD], &ddfBasePtr[ddfTblInternalOfst], nodeNumPerLane, laneNum);
    }

    cupvaDataFlowTrig(trigger);
    cupvaDataFlowSync(trigger);
}

#endif