device_core.h

Fully qualified name: src/device/vpu_runtime/include/cupva_device/device_core.h

File members: src/device/vpu_runtime/include/cupva_device/device_core.h

/*
 * Copyright (c) 2020-2023 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */
#ifndef CUPVA_DEVICE_CORE_H
#define CUPVA_DEVICE_CORE_H

#include "impl/dma_common.h"

#include <cupva_types.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>

// Build-mode dependent definitions.
// Non-native builds: AgenCFG is an alias for dvuint and CUPVA_VPU_ASSERT(x) expands to a no-op
// (the asserted expression is not evaluated).
// Native builds: CUPVA_VPU_ASSERT(x) forwards to the standard assert().
// NOTE(review): AgenCFG is only typedef'd in the non-native branch here; presumably native builds
// obtain it from another header - confirm.
#if CUPVA_BUILD_MODE != CUPVA_NATIVE
typedef dvuint AgenCFG;
#    define CUPVA_VPU_ASSERT(x) ((void)0)
#else
#    include <assert.h>
#    define CUPVA_VPU_ASSERT(x) assert(x)
#endif

// Evaluates x and explicitly discards its value.
#define IGNORE_RETURN(x) ((void)(x))
// Casts the pointer expression _P to `_T *` through an intermediate `void *`.
// _P is parenthesised so that a compound argument (e.g. `p + 1`) is converted as a whole; without the
// parentheses the `(void *)` cast would bind to the first operand only.
#define POINTER_CAST(_T, _P) ((_T *)((void *)(_P)))
// Same as POINTER_CAST, but yields a pointer to const `_T` through `void const *`.
#define CONST_POINTER_CAST(_T, _P) ((_T const *)((void const *)(_P)))

// Expands to the header of a function returning int32_t whose name is given by CUPVA_VPU_MAIN_SYMBOL
// (defined outside this section). Intended use: `CUPVA_VPU_MAIN() { ... }`.
#define CUPVA_VPU_MAIN() int32_t CUPVA_VPU_MAIN_SYMBOL()

// Number of elements in a tile of tw x th with a halo of haloX / haloY elements on each side:
// (tw + 2 * haloX) * (th + 2 * haloY).
#define CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, haloY) (((tw) + 2 * (haloX)) * ((th) + 2 * (haloY)))

// Arity-specific front ends. Each accepts (and ignores) trailing arguments so that the dispatching
// macros can forward their complete argument list to whichever variant was selected.
// No halo.
#define CUPVA_BUFFER_LENGTH_IMPL_VAR(tw, th, ...) CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, 0, 0)
// Horizontal halo only.
#define CUPVA_BUFFER_LENGTH_IMPL_HALO_1D(tw, th, haloX, ...) CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, 0)
// Horizontal and vertical halo.
#define CUPVA_BUFFER_LENGTH_IMPL_HALO_2D(tw, th, haloX, haloY, ...) \
    CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, haloY)
// 2D halo with transposition: the line pitch in bytes is passed through TRANSPOSE_LINE_PITCH_ALIGNED
// (defined elsewhere), multiplied by the number of lines, and converted back to a count of `type` elements.
#define CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D(tw, th, haloX, haloY, type, mode)                                \
    (TRANSPOSE_LINE_PITCH_ALIGNED(((tw) + 2 * (haloX)) * sizeof(type), mode, sizeof(type)) * ((th) + 2 * (haloY)) / \
     sizeof(type))
// Same computation with the last two parameters in (mode, type) order, as produced by RDF_SINGLE.
#define CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D_V2(tw, th, haloX, haloY, mode, type) \
    CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D(tw, th, haloX, haloY, type, mode)

// Argument-count dispatcher: expands to its 7th argument. Callers append a list of candidate macro
// names after their own arguments so that the number of user arguments decides which name lands here.
#define CUPVA_BUFFER_LENGTH_MACRO(tw, th, haloX, haloY, type, mode, macro, ...) macro

// Circular buffer length, in elements of `type`, where the line pitch is scaled:
// the line width is tw * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 + haloX * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1
// elements and the line count is th + 2 * haloY. The pitch in bytes goes through
// CIRCULAR_BUFFER_LINE_PITCH_ALIGNED and TRANSPOSE_LINE_PITCH_ALIGNED, the total through
// CIRCULAR_BUFFER_ALIGNED_SIZE, then CIRCULAR_BUFFER_EXTRA_BYTES is added before dividing by sizeof(type).
// The helper macros and constants are defined outside this section. Extra trailing arguments are ignored,
// which lets callers append a default `mode`.
#define CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_RM(type, tw, th, haloX, haloY, mode, ...)                                  \
    ((CIRCULAR_BUFFER_ALIGNED_SIZE(TRANSPOSE_LINE_PITCH_ALIGNED(CIRCULAR_BUFFER_LINE_PITCH_ALIGNED(                  \
                                                                    ((tw) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 +     \
                                                                     (haloX) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1) * \
                                                                    sizeof(type)),                                   \
                                                                mode, sizeof(type)) *                                \
                                   ((th) + 2 * (haloY))) +                                                           \
      CIRCULAR_BUFFER_EXTRA_BYTES) /                                                                                 \
     sizeof(type))

// Counterpart of the macro above where the line count is scaled instead:
// the line width is tw + 2 * haloX elements and the line count is
// th * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 + haloY * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1.
#define CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_CM(type, tw, th, haloX, haloY, mode, ...)                               \
    ((CIRCULAR_BUFFER_ALIGNED_SIZE(                                                                               \
          TRANSPOSE_LINE_PITCH_ALIGNED(CIRCULAR_BUFFER_LINE_PITCH_ALIGNED((((tw) + 2 * (haloX)) * sizeof(type))), \
                                       mode, sizeof(type)) *                                                      \
          ((th) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 + (haloY) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1)) +           \
      CIRCULAR_BUFFER_EXTRA_BYTES) /                                                                              \
     sizeof(type))

// Larger of a and b. Usable in constant expressions; note that the selected argument is evaluated twice,
// so arguments must not have side effects.
#define CUPVA_MAX(a, b) ((a) > (b) ? (a) : (b))

// Single buffer length in elements. Optional arguments after (tw, th) select the implementation:
//   none                       -> CUPVA_BUFFER_LENGTH_IMPL_VAR
//   haloX                      -> CUPVA_BUFFER_LENGTH_IMPL_HALO_1D
//   haloX, haloY               -> CUPVA_BUFFER_LENGTH_IMPL_HALO_2D
//   haloX, haloY, type         -> CUPVA_BUFFER_LENGTH_IMPL_HALO_2D (type is ignored)
//   haloX, haloY, type, mode   -> CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D
// Relies on the `##__VA_ARGS__` comma-elision extension.
#define CUPVA_SINGLE_BUFFER_LENGTH(tw, th, ...)                                                   \
    CUPVA_BUFFER_LENGTH_MACRO(tw, th, ##__VA_ARGS__, CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D,  \
                              CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, \
                              CUPVA_BUFFER_LENGTH_IMPL_HALO_1D, CUPVA_BUFFER_LENGTH_IMPL_VAR)     \
    (tw, th, ##__VA_ARGS__)

// Twice the single buffer length, passed through CIRCULAR_BUFFER_ALIGNED_SIZE.
// NOTE(review): unlike RDF_DOUBLE, the value handed to CIRCULAR_BUFFER_ALIGNED_SIZE here is an element
// count rather than a byte count - confirm this is intended.
#define CUPVA_DOUBLE_BUFFER_LENGTH(tw, th, ...) \
    (CIRCULAR_BUFFER_ALIGNED_SIZE(CUPVA_SINGLE_BUFFER_LENGTH(tw, th, ##__VA_ARGS__) * 2))

// Alias for RDF_CIRCULAR_ROWMAJOR with identical arguments.
#define CUPVA_CIRCULAR_BUFFER_LENGTH(type, tw, th, haloX, haloY, ...) \
    RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__)

// Single buffer length in elements of `type`. Optional arguments after (type, tw, th):
//   none                 -> CUPVA_BUFFER_LENGTH_IMPL_VAR
//   haloX                -> CUPVA_BUFFER_LENGTH_IMPL_HALO_1D
//   haloX, haloY         -> CUPVA_BUFFER_LENGTH_IMPL_HALO_2D
//   haloX, haloY, mode   -> CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D_V2
// `type` is appended as the last argument of the selected macro; only the transpose variant uses it.
#define RDF_SINGLE(type, tw, th, ...)                                                                     \
    CUPVA_BUFFER_LENGTH_MACRO(tw, th, ##__VA_ARGS__, type, CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D_V2, \
                              CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, CUPVA_BUFFER_LENGTH_IMPL_HALO_1D,         \
                              CUPVA_BUFFER_LENGTH_IMPL_VAR)                                               \
    (tw, th, ##__VA_ARGS__, type)

// Double buffer length in elements: twice RDF_SINGLE converted to bytes, passed through
// CIRCULAR_BUFFER_ALIGNED_SIZE, then converted back to elements.
#define RDF_DOUBLE(type, tw, th, ...) \
    (CIRCULAR_BUFFER_ALIGNED_SIZE((RDF_SINGLE(type, tw, th, ##__VA_ARGS__)) * 2 * sizeof(type)) / sizeof(type))

// Circular buffer length via CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_RM. TRANS_MODE_NONE is appended so it
// becomes `mode` when the caller supplies none; a caller-supplied mode takes precedence.
#define RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ...) \
    CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_RM(type, tw, th, haloX, haloY, ##__VA_ARGS__, TRANS_MODE_NONE)

// Circular buffer length via CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_CM, with the same default-mode handling.
#define RDF_CIRCULAR_COLUMNMAJOR(type, tw, th, haloX, haloY, ...) \
    CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_CM(type, tw, th, haloX, haloY, ##__VA_ARGS__, TRANS_MODE_NONE)

// Larger of the row-major and column-major circular buffer lengths.
#define RDF_CIRCULAR(type, tw, th, haloX, haloY, ...)                           \
    CUPVA_MAX(RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__), \
              RDF_CIRCULAR_COLUMNMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__))

// Forwards to VMEM_NOEXPORT_IMPL (defined elsewhere). As the name suggests, no CUPVA_EXPORT is emitted.
#define VMEM_NOEXPORT(bank, type, name, ...) VMEM_NOEXPORT_IMPL(bank, type, name, ##__VA_ARGS__)

// Declares `name` through whichever of VMEM_VAR / VMEM_1D / VMEM_2D GET_VMEM_MACRO selects for the
// supplied argument count (GET_VMEM_MACRO is defined elsewhere), then exports it under the same name
// with the default export type.
#define VMEM_DATA_IMPL(bank, type, name, ...)                                        \
    GET_VMEM_MACRO(bank, type, name, ##__VA_ARGS__, VMEM_2D, VMEM_1D, VMEM_VAR, ...) \
    (bank, type, name, ##__VA_ARGS__);                                               \
    CUPVA_EXPORT(name, name)

// Adapter used by VMEM for the six-argument form: drops `type` and `isDDF` and forwards to VMEM_DDF_HANDLER.
#define VMEM_DDF_HANDLER_WRAPPER(bank, type, name, nodesPerLane, lanes, isDDF) \
    VMEM_DDF_HANDLER(bank, name, nodesPerLane, lanes)

// Argument-count dispatcher: expands to its 7th argument.
#define VMEM_DDF_SELECTOR(bank, type, name, h, w, isDDF, macro, ...) macro

// Applies macro NAME to the parenthesised argument list ARGS; the extra level of expansion lets the
// selected macro name and its arguments be rescanned together.
#define VMEM_INVOKE(NAME, ARGS) NAME ARGS

// Declares a VMEM variable. With zero, one or two extra arguments the call is routed to VMEM_DATA_IMPL;
// with exactly three extra arguments it is routed to VMEM_DDF_HANDLER_WRAPPER.
#define VMEM(bank, type, name, ...)                                                                        \
    VMEM_INVOKE(VMEM_INVOKE(VMEM_DDF_SELECTOR, (bank, type, name, ##__VA_ARGS__, VMEM_DDF_HANDLER_WRAPPER, \
                                                VMEM_DATA_IMPL, VMEM_DATA_IMPL, VMEM_DATA_IMPL)),          \
                (bank, type, name, ##__VA_ARGS__))

// Forwards to EXTERN_VMEM_IMPL (defined elsewhere); no CUPVA_EXPORT is emitted by this macro.
#define EXTERN_VMEM(bank, type, name, ...) EXTERN_VMEM_IMPL(bank, type, name, ##__VA_ARGS__)

// Declares an ExtMemPointer variable `name` and exports it with type VMEM_TYPE_POINTER_EX.
#define VMEM_POINTER(bank, name)         \
    VMEM_VAR(bank, ExtMemPointer, name); \
    CUPVA_EXPORT(name, name, VMEM_TYPE_POINTER_EX)

// EXTERN_VMEM_IMPL counterpart of VMEM_POINTER.
#define EXTERN_VMEM_POINTER(bank, name) EXTERN_VMEM_IMPL(bank, ExtMemPointer, name)

// Declares a VPUSurfaceData variable `name` and emits two exports: its `pointer` member under the name
// _S<name> (type VMEM_TYPE_POINTER_EX) and its `metadata` member under the name _M<name> (default type).
#define VMEM_SURFACE(bank, name)                                \
    VMEM_VAR(bank, VPUSurfaceData, name);                       \
    CUPVA_EXPORT(name.pointer, _S##name, VMEM_TYPE_POINTER_EX); \
    CUPVA_EXPORT(name.metadata, _M##name)

// EXTERN_VMEM_IMPL counterpart of VMEM_SURFACE.
#define EXTERN_VMEM_SURFACE(bank, name) EXTERN_VMEM_IMPL(bank, VPUSurfaceData, name)

// Declares a UnifiedRDFHandler variable `name` and exports its `hdl` member under `name`.
#define VMEM_RDF_UNIFIED(bank, name)         \
    VMEM_VAR(bank, UnifiedRDFHandler, name); \
    CUPVA_EXPORT(name.hdl, name)

// Same declaration logic as VMEM_DATA_IMPL, but the export uses type VMEM_TYPE_VPUC_TABLE.
#define VMEM_DMA_CONFIG(bank, type, name, ...)                                       \
    GET_VMEM_MACRO(bank, type, name, ##__VA_ARGS__, VMEM_2D, VMEM_1D, VMEM_VAR, ...) \
    (bank, type, name, ##__VA_ARGS__);                                               \
    CUPVA_EXPORT(name, name, VMEM_TYPE_VPUC_TABLE)

// EXTERN_VMEM_IMPL counterpart of VMEM_DMA_CONFIG.
#define EXTERN_VMEM_DMA_CONFIG(bank, type, name, ...) EXTERN_VMEM_IMPL(bank, type, name, ##__VA_ARGS__)

// Forwards to CUPVA_ALIGNED_IMPL (defined elsewhere).
#define CUPVA_ALIGNED(type, name, alignment) CUPVA_ALIGNED_IMPL(type, name, alignment)

// Forwards to CUPVA_EXPORT_IMPL with the symbol name <name>_export and sizeof(variable).
// An optional export type may be passed as a third argument; VMEM_TYPE_DATA is always appended after it,
// so it acts as the type when none is given.
#define CUPVA_EXPORT(variable, name, ...) \
    CUPVA_EXPORT_IMPL(variable, name##_export, sizeof(variable), ##__VA_ARGS__, VMEM_TYPE_DATA)

// Forward declarations of runtime primitives; their definitions are outside this section.
// NOTE(review): the remarks below are drawn from call sites in this header and from the names only -
// confirm against the implementations.

// Called by the raster data flow helpers in this header with a trigger word (0, or 1 << gpio).
inline void cupvaDataFlowTrig(uint32_t handler);

// Called by the raster data flow helpers in this header with the word that was previously triggered.
inline void cupvaDataFlowSync(uint32_t handler);

inline uint32_t cupvaGetVpuId(void);

inline uint32_t cupvaGetVmemBase(void);

// Used by CUPVA_GET_DESC_ATTR_ADDR as the address of descriptor 0 in an array of PvaDmaDescriptor.
inline uint32_t cupvaGetDmaDescBase(void);

inline uint32_t cupvaGetHwseqRamBase(void);

inline uint32_t cupvaGetChannelRegAddr(int8_t channel_idx, uint16_t reg_ofst);

// Used by cupvaGetL2ExtMemPointer to fill ExtMemPointer::base.
inline uint32_t cupvaGetL2Base(void);

// Used by cupvaGetL2ExtMemPointer to fill ExtMemPointer::size.
inline uint32_t cupvaGetL2Size(void);

// Fills *ptr with the values returned by cupvaGetL2Base() / cupvaGetL2Size() and the caller-supplied
// offset, then overwrites byte 7 of the storage of ptr->base with CUPVA_EXT_MEM_TYPE_L2.
inline void cupvaGetL2ExtMemPointer(ExtMemPointer *ptr, uint32_t offset)
{
    ptr->offset = offset;
    ptr->base   = cupvaGetL2Base();
    ptr->size   = cupvaGetL2Size();

    // The tag byte has to be written after `base` is assigned, otherwise the assignment would clobber it.
    uint8_t *const baseBytes = POINTER_CAST(uint8_t, &ptr->base);
    baseBytes[7]             = CUPVA_EXT_MEM_TYPE_L2;
}

// Forward declaration; the definition is outside this section.
// NOTE(review): presumably converts a VMEM pointer to its 32-bit address - confirm.
inline uint32_t cupvaGetVmemAddress(void *ptr);

// Input to the INIT_AGEN1..INIT_AGEN6 macros: up to six (count, stride) pairs plus a common scale factor.
typedef struct
{
    // Factor every computed modifier is multiplied by (the INIT_AGEN macros compute `... * size`).
    // NOTE(review): presumably the element size in bytes - confirm.
    int32_t size;

    // n1..n6 are copied verbatim to the n1..n6 fields of the initialised agen object.
    uint16_t n1;
    uint16_t n2;
    uint16_t n3;
    uint16_t n4;
    uint16_t n5;
    uint16_t n6;

    // s1..s6 are the per-level strides from which the INIT_AGEN macros derive mod1..mod6.
    int32_t s1;
    int32_t s2;
    int32_t s3;
    int32_t s4;
    int32_t s5;
    int32_t s6;
} AgenWrapper;

// Initialises level 1 of AGEN_INIT from AGEN_WRAPPER: n1 is copied and mod1 = s1 * size.
// Both arguments are used with `.` member access and are not parenthesised, so pass plain lvalue names.
#define INIT_AGEN1(AGEN_INIT, AGEN_WRAPPER)                   \
    do                                                        \
    {                                                         \
        AGEN_INIT.n1   = AGEN_WRAPPER.n1;                     \
        AGEN_INIT.mod1 = AGEN_WRAPPER.s1 * AGEN_WRAPPER.size; \
    } while (false)

// Initialises levels 1-2 of AGEN_INIT from AGEN_WRAPPER. Counts n1..n2 are copied, and
//   mod1 = s1 * size
//   mod2 = (s2 - (n1 - 1) * s1) * size
// Both arguments are used with `.` member access and are not parenthesised, so pass plain lvalue names.
#define INIT_AGEN2(AGEN_INIT, AGEN_WRAPPER)                                                                          \
    do                                                                                                               \
    {                                                                                                                \
        AGEN_INIT.n1   = AGEN_WRAPPER.n1;                                                                            \
        AGEN_INIT.n2   = AGEN_WRAPPER.n2;                                                                            \
        AGEN_INIT.mod1 = AGEN_WRAPPER.s1 * AGEN_WRAPPER.size;                                                        \
        AGEN_INIT.mod2 = (AGEN_WRAPPER.s2 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1)) * AGEN_WRAPPER.size; \
    } while (false)

// Initialises levels 1-3 of AGEN_INIT from AGEN_WRAPPER. Counts n1..n3 are copied, and for each level k
//   modk = (sk - sum over j < k of (nj - 1) * sj) * size
// Both arguments are used with `.` member access and are not parenthesised, so pass plain lvalue names.
#define INIT_AGEN3(AGEN_INIT, AGEN_WRAPPER)                                                                          \
    do                                                                                                               \
    {                                                                                                                \
        AGEN_INIT.n1   = AGEN_WRAPPER.n1;                                                                            \
        AGEN_INIT.n2   = AGEN_WRAPPER.n2;                                                                            \
        AGEN_INIT.n3   = AGEN_WRAPPER.n3;                                                                            \
        AGEN_INIT.mod1 = AGEN_WRAPPER.s1 * AGEN_WRAPPER.size;                                                        \
        AGEN_INIT.mod2 = (AGEN_WRAPPER.s2 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1)) * AGEN_WRAPPER.size; \
        AGEN_INIT.mod3 = (AGEN_WRAPPER.s3 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                     \
                          (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2)) *                                      \
                         AGEN_WRAPPER.size;                                                                          \
    } while (false)

// Initialises levels 1-4 of AGEN_INIT from AGEN_WRAPPER. Counts n1..n4 are copied, and for each level k
//   modk = (sk - sum over j < k of (nj - 1) * sj) * size
// Both arguments are used with `.` member access and are not parenthesised, so pass plain lvalue names.
#define INIT_AGEN4(AGEN_INIT, AGEN_WRAPPER)                                                                          \
    do                                                                                                               \
    {                                                                                                                \
        AGEN_INIT.n1   = AGEN_WRAPPER.n1;                                                                            \
        AGEN_INIT.n2   = AGEN_WRAPPER.n2;                                                                            \
        AGEN_INIT.n3   = AGEN_WRAPPER.n3;                                                                            \
        AGEN_INIT.n4   = AGEN_WRAPPER.n4;                                                                            \
        AGEN_INIT.mod1 = AGEN_WRAPPER.s1 * AGEN_WRAPPER.size;                                                        \
        AGEN_INIT.mod2 = (AGEN_WRAPPER.s2 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1)) * AGEN_WRAPPER.size; \
        AGEN_INIT.mod3 = (AGEN_WRAPPER.s3 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                     \
                          (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2)) *                                      \
                         AGEN_WRAPPER.size;                                                                          \
        AGEN_INIT.mod4 = (AGEN_WRAPPER.s4 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                     \
                          (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2) -                                       \
                          (((int32_t)AGEN_WRAPPER.n3 - 1) * AGEN_WRAPPER.s3)) *                                      \
                         AGEN_WRAPPER.size;                                                                          \
    } while (false)

// Initialises levels 1-5 of AGEN_INIT from AGEN_WRAPPER. Counts n1..n5 are copied, and for each level k
//   modk = (sk - sum over j < k of (nj - 1) * sj) * size
// Both arguments are used with `.` member access and are not parenthesised, so pass plain lvalue names.
#define INIT_AGEN5(AGEN_INIT, AGEN_WRAPPER)                                                                            \
    do                                                                                                                 \
    {                                                                                                                  \
        AGEN_INIT.n1   = AGEN_WRAPPER.n1;                                                                              \
        AGEN_INIT.n2   = AGEN_WRAPPER.n2;                                                                              \
        AGEN_INIT.n3   = AGEN_WRAPPER.n3;                                                                              \
        AGEN_INIT.n4   = AGEN_WRAPPER.n4;                                                                              \
        AGEN_INIT.n5   = AGEN_WRAPPER.n5;                                                                              \
        AGEN_INIT.mod1 = AGEN_WRAPPER.s1 * AGEN_WRAPPER.size;                                                          \
        AGEN_INIT.mod2 = (AGEN_WRAPPER.s2 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1)) * AGEN_WRAPPER.size;   \
        AGEN_INIT.mod3 = (AGEN_WRAPPER.s3 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                       \
                          (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2)) *                                        \
                         AGEN_WRAPPER.size;                                                                            \
        AGEN_INIT.mod4 = (AGEN_WRAPPER.s4 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                       \
                          (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2) -                                         \
                          (((int32_t)AGEN_WRAPPER.n3 - 1) * AGEN_WRAPPER.s3)) *                                        \
                         AGEN_WRAPPER.size;                                                                            \
        AGEN_INIT.mod5 =                                                                                               \
            (AGEN_WRAPPER.s5 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                                    \
             (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2) - (((int32_t)AGEN_WRAPPER.n3 - 1) * AGEN_WRAPPER.s3) - \
             (((int32_t)AGEN_WRAPPER.n4 - 1) * AGEN_WRAPPER.s4)) *                                                     \
            AGEN_WRAPPER.size;                                                                                         \
    } while (false)

// Initialises levels 1-6 of AGEN_INIT from AGEN_WRAPPER. Counts n1..n6 are copied, and for each level k
//   modk = (sk - sum over j < k of (nj - 1) * sj) * size
// Both arguments are used with `.` member access and are not parenthesised, so pass plain lvalue names.
#define INIT_AGEN6(AGEN_INIT, AGEN_WRAPPER)                                                                            \
    do                                                                                                                 \
    {                                                                                                                  \
        AGEN_INIT.n1   = AGEN_WRAPPER.n1;                                                                              \
        AGEN_INIT.n2   = AGEN_WRAPPER.n2;                                                                              \
        AGEN_INIT.n3   = AGEN_WRAPPER.n3;                                                                              \
        AGEN_INIT.n4   = AGEN_WRAPPER.n4;                                                                              \
        AGEN_INIT.n5   = AGEN_WRAPPER.n5;                                                                              \
        AGEN_INIT.n6   = AGEN_WRAPPER.n6;                                                                              \
        AGEN_INIT.mod1 = AGEN_WRAPPER.s1 * AGEN_WRAPPER.size;                                                          \
        AGEN_INIT.mod2 = (AGEN_WRAPPER.s2 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1)) * AGEN_WRAPPER.size;   \
        AGEN_INIT.mod3 = (AGEN_WRAPPER.s3 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                       \
                          (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2)) *                                        \
                         AGEN_WRAPPER.size;                                                                            \
        AGEN_INIT.mod4 = (AGEN_WRAPPER.s4 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                       \
                          (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2) -                                         \
                          (((int32_t)AGEN_WRAPPER.n3 - 1) * AGEN_WRAPPER.s3)) *                                        \
                         AGEN_WRAPPER.size;                                                                            \
        AGEN_INIT.mod5 =                                                                                               \
            (AGEN_WRAPPER.s5 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                                    \
             (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2) - (((int32_t)AGEN_WRAPPER.n3 - 1) * AGEN_WRAPPER.s3) - \
             (((int32_t)AGEN_WRAPPER.n4 - 1) * AGEN_WRAPPER.s4)) *                                                     \
            AGEN_WRAPPER.size;                                                                                         \
        AGEN_INIT.mod6 =                                                                                               \
            (AGEN_WRAPPER.s6 - (((int32_t)AGEN_WRAPPER.n1 - 1) * AGEN_WRAPPER.s1) -                                    \
             (((int32_t)AGEN_WRAPPER.n2 - 1) * AGEN_WRAPPER.s2) - (((int32_t)AGEN_WRAPPER.n3 - 1) * AGEN_WRAPPER.s3) - \
             (((int32_t)AGEN_WRAPPER.n4 - 1) * AGEN_WRAPPER.s4) -                                                      \
             (((int32_t)AGEN_WRAPPER.n5 - 1) * AGEN_WRAPPER.s5)) *                                                     \
            AGEN_WRAPPER.size;                                                                                         \
    } while (false)

// Forward declarations; the definitions are outside this section.
// NOTE(review): the remarks below are inferred from the names only - confirm against the implementations.

// Presumably replaces the base address stored in *cfg with addr.
inline void cupvaModifyAgenCfgBase(AgenCFG *cfg, void *const addr);

// Presumably returns the base address stored in *cfg.
inline void *cupvaGetAgenCfgBase(AgenCFG *cfg);

inline void cupvaPrefetchReadyWait(void);

inline void cupvaPrefetchDoneWait(void);

// Advances the agen configuration stored in *cfg by step_in_bytes: the configuration is expanded into an
// agen with init_agen_from_cfg, adjusted with adv_agen_base, and written back via extract_agen_cfg.
inline void cupvaAdvanceAgenCfg(AgenCFG *cfg, const int32_t step_in_bytes)
{
    agen unpacked = init_agen_from_cfg(*cfg);

    adv_agen_base(unpacked, step_in_bytes);

    *cfg = extract_agen_cfg(unpacked);
}

// Address of member `field` of descriptor number desc_id:
// cupvaGetDmaDescBase() + sizeof(PvaDmaDescriptor) * desc_id + offsetof(PvaDmaDescriptor, field),
// computed in uint32_t.
#define CUPVA_GET_DESC_ATTR_ADDR(desc_id, field)                                          \
    (cupvaGetDmaDescBase() + ((uint32_t)sizeof(PvaDmaDescriptor) * (uint32_t)(desc_id)) + \
     (uint32_t)offsetof(PvaDmaDescriptor, field))

// Masks `value` with the mask of the range CUPVA_ICACHE_<r>_0_<f>_RANGE and shifts it left to that range's
// position (CUPVA_FIELD_MASK / CUPVA_FIELD_SHIFT and the range constants are defined elsewhere).
#define CUPVA_ICACHE_FIELD(r, f, value)                             \
    (((value) & CUPVA_FIELD_MASK(CUPVA_ICACHE_##r##_0_##f##_RANGE)) \
     << CUPVA_FIELD_SHIFT(CUPVA_ICACHE_##r##_0_##f##_RANGE))

inline void cupvaCircularBufferMemcpy(void *cb, uint32_t size)
{
    dvint *src = (dvint *)cb;
    dvint *dst = (dvint *)((void *)&((uint8_t *)cb)[size]);
    (void)(*dst = extract(sign_extend(*src)));
}

// Executes one step of the handler's trigger program and returns the trigger word for that step.
//
// The instruction at handler.pc is read; trigCnt is advanced with mod_inc against the instruction's
// trigRpt. When the new trigCnt is 0 the instruction's jmpCnt is decremented (not below 0) and the pc
// moves by jmpOffst if jmpCnt changed, or by 1 otherwise; while trigCnt is non-zero the pc stays put.
// The pc is clamped to RDF_INST_COUNT - 1. The instruction's gpio is recorded in handler.gpioHistory.
//
// Returns 1 << gpio for the instruction that was current on entry, or 0 when its gpio is 0.
inline uint32_t _cupvaRasterDataFlowAdvProg(RasterDataFlowHandler &handler)
{
    uint8_t const pc                    = handler.pc;
    RasterDataFlowTrigInst &instruction = handler.trigProgram[pc];
    uint8_t const newTrigCnt            = (uint8_t)mod_inc((int32_t)handler.trigCnt, (int32_t)instruction.trigRpt);
    uint8_t const jumpDec               = (uint8_t)max((int32_t)instruction.jmpCnt, 1) - 1U; // Subtract 1, clamp to 0
    // If we are not repeating this trigger, update jmpCnt
    uint8_t const newJmpCnt = (newTrigCnt != 0U) ? instruction.jmpCnt : jumpDec;
    // If the updated jmpCnt is different to current jmpCnt, it means we should take the jump
    uint32_t const applyJmp = (newJmpCnt != instruction.jmpCnt) ? 1U : 0U;
    // Decide whether to 1) apply jump 2) advance PC or 3) stay with current PC
    uint32_t const jumpOrAdv = (applyJmp != 0U) ? (uint32_t)instruction.jmpOffst : 1U;
    uint32_t const pcOffset  = (newTrigCnt != 0U) ? 0U : jumpOrAdv;
    // GPIO[0] are not used for triggers so are used for 'no-trigger'
    uint32_t const setTrigger = ((instruction.gpio > 0U) ? 1U : 0U);
    uint32_t const trigger    = setTrigger << instruction.gpio;
    instruction.jmpCnt        = newJmpCnt;
    uint32_t const newPc      = pc + pcOffset;
    // Write updates to handler
    handler.pc          = (uint8_t)min(newPc, RDF_INST_COUNT - 1U); // pc - clamp to valid bounds
    handler.trigCnt     = newTrigCnt;                               // trigCnt
    handler.gpioHistory = instruction.gpio;
    return trigger;
}

// Advances the handler's trigger program by one step and issues the resulting trigger word.
inline void cupvaRasterDataFlowTrig(RasterDataFlowHandler &handler)
{
    cupvaDataFlowTrig(_cupvaRasterDataFlowAdvProg(handler));
}

// Syncs on the GPIO recorded in handler.gpioHistory and clears the record.
// A recorded value of 0 means no trigger, in which case cupvaDataFlowSync is called with 0.
inline void cupvaRasterDataFlowSync(RasterDataFlowHandler &handler)
{
    uint32_t syncWord = 0U;
    if (handler.gpioHistory > 0U)
    {
        syncWord = 1U << handler.gpioHistory;
    }

    handler.gpioHistory = 0;
    cupvaDataFlowSync(syncWord);
}

// Returns handler.linePitch converted to int32_t.
inline int32_t cupvaRasterDataFlowGetLinePitch(RasterDataFlowHandler &handler)
{
    int32_t const pitch = (int32_t)handler.linePitch;
    return pitch;
}

// Returns the RDF_TRANSFER_PROPERTIES_LAYOUT field extracted from handler.transferProperties.
inline uint8_t cupvaRasterDataFlowGetLayout(RasterDataFlowHandler &handler)
{
    uint8_t const layout = CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_LAYOUT, handler.transferProperties);
    return layout;
}

// Returns true when the RDF_TRANSFER_PROPERTIES_IS_READ field of handler.transferProperties is non-zero.
inline bool cupvaRasterDataFlowIsRead(RasterDataFlowHandler &handler)
{
    bool const isRead = (CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_IS_READ, handler.transferProperties) > 0U);
    return isRead;
}

// Returns the RDF_TRANSFER_PROPERTIES_BPPLOG2 field extracted from handler.transferProperties.
inline uint32_t cupvaRasterDataFlowGetBppLog2(RasterDataFlowHandler &handler)
{
    uint32_t const bppLog2 =
        CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_BPPLOG2, (uint32_t)handler.transferProperties);
    return bppLog2;
}

// Returns handler.cbLen.
inline int32_t cupvaRasterDataFlowGetCbLen(RasterDataFlowHandler &handler)
{
    int32_t const length = handler.cbLen;
    return length;
}

// Returns the advance for the current step and moves the handler's advance index forward.
// While ixAdv differs from nxm1 the result is adv[0] and ixAdv is incremented; when ixAdv equals nxm1
// the result is adv[1] and ixAdv is reset to 0.
inline int32_t cupvaRasterDataFlowGetAdvance(RasterDataFlowHandler &handler)
{
    uint16_t const current = handler.ixAdv;
    uint16_t const last    = handler.nxm1;

    int32_t step;
    uint16_t next;
    if (current == last)
    {
        step = handler.adv[1];
        next = (uint16_t)0U;
    }
    else
    {
        step = handler.adv[0];
        next = (uint16_t)(current + (uint16_t)1U);
    }

    handler.ixAdv = next;
    return step;
}

// Adds the handler's next advance (see cupvaRasterDataFlowGetAdvance, which also updates the handler)
// to `offset` and wraps the sum once into the range of the circular buffer length handler.cbLen.
inline int32_t cupvaRasterDataFlowGetOffset(RasterDataFlowHandler &handler, int32_t offset)
{
    int32_t wrapped = cupvaRasterDataFlowGetAdvance(handler) + offset;

    // Wrap past the end of the buffer.
    if (wrapped >= handler.cbLen)
    {
        wrapped = wrapped - handler.cbLen;
    }
    // Wrap before the start of the buffer.
    if (wrapped < 0)
    {
        wrapped = wrapped + handler.cbLen;
    }
    return wrapped;
}

// Returns (linePitch * bpp - TRANSPOSE_HORIZONTAL_OFFSET(mode, bpp)) / VMEM_SUPERBANK_WIDTH_IN_BYTES,
// computed in unsigned 32-bit arithmetic and returned as int32_t.
inline int32_t cupvaTransposeLaneOfst(RasterDataFlowHandler &handler, TranspositionMode mode, int32_t bpp)
{
    uint8_t const hOffset     = TRANSPOSE_HORIZONTAL_OFFSET((uint8_t)mode, (uint8_t)bpp);
    uint32_t const pitchBytes = (uint32_t)(handler.linePitch * bpp);
    uint32_t const lanes      = (pitchBytes - (uint32_t)hOffset) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
    return (int32_t)lanes;
}

// Declaration only; the definition is outside this section.
// NOTE(review): presumably binds vmemBuffer to the handler before Acquire/Release are used - confirm.
void cupvaRasterDataFlowOpen(UnifiedRDFHandler &handler, void *vmemBuffer);

// Returns a pointer to the current tile and advances the handler to the next tile offset.
//
// - If handler.deferSync is 0, cupvaDataFlowSync(handler.nextSync) is called here.
// - If handler.deferTrig is 0, cupvaDataFlowTrig(handler.nextTrig) is called here.
// - One dvint is read from handler.base and written to handler.cbDst.
// - handler.hdl.ixAdv and handler.tileOffset are advanced; tileOffset wraps at handler.hdl.cbLen.
//
// The returned pointer is handler.base plus the tile offset on entry, shifted left by the handler's
// bppLog2 (i.e. the offset is converted from elements to bytes).
// The statements are interleaved on purpose; do not reorder without checking the generated schedule.
inline void *cupvaRasterDataFlowAcquire(UnifiedRDFHandler &handler)
{
    // Trigger/sync values
    uint32_t const shouldDeferSync = handler.deferSync;
    uint32_t const shouldDeferTrig = handler.deferTrig;
    uint32_t const sync            = handler.nextSync;
    uint32_t const trig            = handler.nextTrig;

    // Pointers for copying CB and returning
    void *const base  = handler.base;
    void *const cbDst = handler.cbDst;
    dvintx cbCopyData;
    uint32_t const bppLog2 = cupvaRasterDataFlowGetBppLog2(handler.hdl);

    // Scalar loads for tile advancement
    uint16_t const ixAdvCurrent      = handler.hdl.ixAdv;
    uint16_t const nxm1Current       = handler.hdl.nxm1;
    int32_t const adv0               = handler.hdl.adv[0];
    int32_t const adv1               = handler.hdl.adv[1];
    uint32_t const tileOffsetCurrent = handler.tileOffset;

    if (shouldDeferSync == 0U)
    {
        cupvaDataFlowSync(sync);
    }

    // Update offset step 1 - determine adv, load registers
    int32_t const cbLen  = handler.hdl.cbLen;
    bool const isLast    = (ixAdvCurrent == nxm1Current);
    int32_t const adv    = isLast ? adv1 : adv0;
    uint16_t const ixAdv = isLast ? (uint16_t)0U : (ixAdvCurrent + (uint16_t)1U);
    void *const retVal   = (int8_t *)base + (tileOffsetCurrent << bppLog2);

    // Vector read CB head
    cbCopyData = sign_extend(*(dvint *)base);

    if (shouldDeferTrig == 0U)
    {
        cupvaDataFlowTrig(trig);
    }

    // Update offset step 2 - compute new tileOffset with wrapping
    handler.hdl.ixAdv           = ixAdv;
    int32_t const tileOffsetNew = adv + (int32_t)tileOffsetCurrent;
    int32_t tileOffsetWrapped   = (tileOffsetNew >= cbLen) ? (tileOffsetNew - cbLen) : tileOffsetNew;
    // On host side, negative advancements with CB have been converted to positive advancements, so no need to check
    // for condition tileOffsetNew < 0.
    handler.tileOffset = (uint32_t)tileOffsetWrapped;

    // Vector write CB tail
    *(dvint *)cbDst = extract(cbCopyData);

    // Return previously calculated tile offset as a pointer
    return retVal;
}

// Releases the current tile and prepares the trigger/sync words for the next Acquire.
//
// - If handler.deferSync is non-zero, cupvaDataFlowSync(handler.nextSync) is called here.
// - If handler.deferTrig is non-zero, cupvaDataFlowTrig(handler.nextTrig) is called here.
// - The trigger program in handler.hdl advances by one step, using the same pc / trigCnt / jmpCnt
//   rules as _cupvaRasterDataFlowAdvProg.
// - handler.nextTrig becomes 1 << gpio of the instruction that was current on entry (0 when gpio is 0);
//   handler.nextSync becomes the value handler.nextTrig held on entry.
// The statements are interleaved on purpose; do not reorder without checking the generated schedule.
inline void cupvaRasterDataFlowRelease(UnifiedRDFHandler &handler)
{
    // Load data to registers
    uint8_t const pc               = handler.hdl.pc;
    uint32_t const trigCnt         = handler.hdl.trigCnt;
    uint32_t const sync            = handler.nextSync;
    uint32_t const trig            = handler.nextTrig;
    uint32_t const shouldDeferTrig = handler.deferTrig;
    uint32_t const shouldDeferSync = handler.deferSync;

    // Next GPIO calculation part 1 - calculate PC jump
    RasterDataFlowTrigInst &instruction = handler.hdl.trigProgram[pc];
    uint32_t const trigRpt              = instruction.trigRpt;
    uint32_t const jmpCnt               = instruction.jmpCnt;
    int32_t const jmpOffset             = instruction.jmpOffst;
    uint32_t const newTrigCnt           = (uint32_t)mod_inc((int32_t)trigCnt, (int32_t)trigRpt);
    uint32_t const jumpDec              = max(jmpCnt, 1U) - 1U; // Subtract 1, clamp to 0

    if (shouldDeferSync != 0U)
    {
        cupvaDataFlowSync(sync);
    }

    // Next GPIO calculation part 2 - calculate next trigger
    // If we are not repeating this trigger, update jmpCnt
    uint32_t const newJmpCnt = (newTrigCnt != 0U) ? jmpCnt : jumpDec;
    // If the updated jmpCnt is different to current jmpCnt, it means we should take the jump
    uint32_t const applyJmp = (newJmpCnt != jmpCnt) ? 1U : 0U;
    // Decide whether to 1) apply jump 2) advance PC or 3) stay with current PC
    uint32_t const jumpOrAdv = (applyJmp != 0U) ? (uint32_t)jmpOffset : 1U;
    uint32_t const pcOffset  = (newTrigCnt != 0U) ? 0U : jumpOrAdv;
    // GPIO[0] are not used for triggers so are used for 'no-trigger'
    uint32_t const setNextTrig = ((instruction.gpio > 0U) ? 1U : 0U);
    uint32_t const nextTrig    = setNextTrig << instruction.gpio;

    if (shouldDeferTrig != 0U)
    {
        cupvaDataFlowTrig(trig);
    }

    // Write updates to handler
    instruction.jmpCnt   = (uint8_t)newJmpCnt;
    uint32_t const newPc = pc + pcOffset;
    handler.hdl.pc       = (uint8_t)min(newPc, RDF_INST_COUNT - 1U); // pc - clamp to valid bounds
    handler.hdl.trigCnt  = (uint8_t)newTrigCnt;                      // trigCnt
    handler.nextTrig     = nextTrig;
    // The trigger word in effect for this release is what the next sync has to wait on.
    handler.nextSync     = trig;
}

// Performs a final cupvaDataFlowSync on the handler's pending sync word (handler.nextSync).
inline void cupvaRasterDataFlowClose(UnifiedRDFHandler &handler)
{
    uint32_t const pendingSync = handler.nextSync;
    cupvaDataFlowSync(pendingSync);
}

// UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
inline int32_t cupvaRasterDataFlowGetLinePitch(UnifiedRDFHandler &handler)
{
    int32_t const pitch = cupvaRasterDataFlowGetLinePitch(handler.hdl);
    return pitch;
}

// UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
inline int32_t cupvaRasterDataFlowGetCbLen(UnifiedRDFHandler &handler)
{
    int32_t const length = cupvaRasterDataFlowGetCbLen(handler.hdl);
    return length;
}

// UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
inline int32_t cupvaTransposeLaneOfst(UnifiedRDFHandler &handler, TranspositionMode mode, int32_t bpp)
{
    int32_t const laneOffset = cupvaTransposeLaneOfst(handler.hdl, mode, bpp);
    return laneOffset;
}

// UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
inline bool cupvaRasterDataFlowIsRead(UnifiedRDFHandler &handler)
{
    bool const isRead = cupvaRasterDataFlowIsRead(handler.hdl);
    return isRead;
}

// UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
inline uint32_t cupvaRasterDataFlowGetBppLog2(UnifiedRDFHandler &handler)
{
    uint32_t const bppLog2 = cupvaRasterDataFlowGetBppLog2(handler.hdl);
    return bppLog2;
}

// Constants describing the block-linear (BL) GOB layout, used by _cupvaBlocklinearOffset2D.

// GOB width
#define BL_GOBW UINT32_C(64)
// GOB width in log2
#define BL_GOBW_LOG2 UINT32_C(6)
// GOB height
#define BL_GOBH UINT32_C(8)
// GOB height in log2
#define BL_GOBH_LOG2 UINT32_C(3)
// GOB size
#define BL_GOB_SZ (BL_GOBW * BL_GOBH)
// GOB size in log2
#define BL_GOB_SZ_LOG2 ((uint32_t)BL_GOBW_LOG2 + (uint32_t)BL_GOBH_LOG2)
// GOB sector width
#define BL_GOB_SECW UINT32_C(16)
// GOB sector width in log2
#define BL_GOB_SECW_LOG2 UINT32_C(4)
// GOB sector height
#define BL_GOB_SECH UINT32_C(2)
// GOB sector size
#define BL_GOB_SEC_SZ (BL_GOB_SECW * BL_GOB_SECH)
// GOB packet mask
#define BL_GOB_PACK_MASK (BL_GOBW >> 1)
// GOB packet stride
#define BL_GOB_PACK_STRIDE UINT32_C(8)
// GOB sub-packet vertical mask
#define BL_GOB_SUBPACK_VER_MASK UINT32_C(6)
// GOB sub-packet vertical stride
#define BL_GOB_SUBPACK_VER_STRIDE UINT32_C(32)
// GOB sub-packet horizontal mask
#define BL_GOB_SUBPACK_HOR_MASK (BL_GOB_SEC_SZ >> 1)
// GOB sub-packet horizontal stride
#define BL_GOB_SUBPACK_HOR_STRIDE UINT32_C(2)
// GOB sector vertical mask
#define BL_GOB_SEC_VER_MASK (BL_GOB_SECH - UINT32_C(1))
// GOB sector vertical stride
#define BL_GOB_SEC_VER_STRIDE UINT32_C(16)
// GOB sector horizontal mask
#define BL_GOB_SEC_HOR_MASK (BL_GOB_SECW - UINT32_C(1))
// Swizzle bit which should be set on non-native builds
#define BL_SWIZZLE_BIT 39U
// Addresses from cupvaSurfaceAddress2D will fill metadata starting from this bit
#define BL_ADDR_METADATA_SHIFT 40U
// Address bits 13:9 (written in the `high : low` form consumed by CUPVA_FIELD_MASK / CUPVA_FIELD_SHIFT)
#define SBADR_IOVA_RANGE \
    UINT32_C(13)         \
        : UINT32_C(9)
// Mask covering address bits 13:9, already shifted into position
#define BL_SBADR_IOVA_MASK (CUPVA_FIELD_MASK(SBADR_IOVA_RANGE) << CUPVA_FIELD_SHIFT(SBADR_IOVA_RANGE))
// Flag to indicate that the address is a BL address
#define BL_METADATA_FLAG_BIT 8U
#define BL_METADATA_FLAG (1UL << BL_METADATA_FLAG_BIT)
// NOTE(review): presumably the largest supported log2 block height (in GOBs) - confirm.
#define BL_MAX_BLOCKHEIGHT_LOG2 (5)

// Per-lane block-linear offset of the coordinates (xCoords, yCoords) for a surface whose
// line pitch is one scalar value shared by all lanes.
//
// The result is the sum of three parts:
//   - the base of the GOB containing the coordinate (block row, block column, GOB within block),
//   - the offset inside that GOB (sector, packet and sub-packet terms),
//   - the swizzle bit, which is only set for QNX/L4T builds with CUPVA_PVA_GEN_NUMBER <= 2.
//
// NOTE(review): the comments below assume vmaddwhw(a, b, c, 0, 1) evaluates a * b + c per lane,
// mirroring the scalar arithmetic in cupvaSurfaceAddress2D - confirm against the intrinsic docs.
//
// xCoords / yCoords : per-lane coordinates
// linePitch         : surface line pitch; shifted right by BL_GOBW_LOG2 to get the width in GOBs
// blockHeightLog2   : log2 of the block height in GOBs
inline vintx _cupvaBlocklinearOffset2D(vintx const &xCoords, vintx const &yCoords, uint32_t const linePitch,
                                       uint8_t const blockHeightLog2)
{
    // Block geometry derived from the GOB constants and the requested block height.
    int32_t const log2BlockSize  = (int32_t)BL_GOB_SZ_LOG2 + (int32_t)blockHeightLog2;
    int32_t const log2BlockLines = (int32_t)BL_GOBH_LOG2 + (int32_t)blockHeightLog2;
    uint32_t const blockLines    = (uint32_t)BL_GOBH << (uint32_t)blockHeightLog2;
    uint32_t const gobsPerLine   = linePitch >> (uint32_t)BL_GOBW_LOG2;

    vintx col = xCoords;
    vintx row = yCoords;

    // GOB index inside the block (vertical) scaled by the GOB size, plus the block column
    // scaled by the block size.
    vintx gobInBlock  = ((row & (blockLines - 1U)) >> (int32_t)BL_GOBH_LOG2) << (int32_t)BL_GOB_SZ_LOG2;
    vintx blockColumn = (col >> (int32_t)BL_GOBW_LOG2) << log2BlockSize;
    // Add the block-row term: (row / blockLines) * blockSize * gobsPerLine.
    vintx base =
        vmaddwhw((row >> log2BlockLines) << log2BlockSize, (int32_t)gobsPerLine, gobInBlock + blockColumn, 0, 1);

    // Offset inside the GOB, accumulated term by term: sector, packet, then the two sub-packet terms.
    vintx inGob =
        vmaddwhw(row & BL_GOB_SEC_VER_MASK, (int32_t)BL_GOB_SEC_VER_STRIDE, (col & BL_GOB_SEC_HOR_MASK), 0, 1);
    inGob = vmaddwhw(col & BL_GOB_PACK_MASK, (int32_t)BL_GOB_PACK_STRIDE, inGob, 0, 1);
    inGob = vmaddwhw(row & BL_GOB_SUBPACK_VER_MASK, (int32_t)BL_GOB_SUBPACK_VER_STRIDE, inGob, 0, 1);
    inGob = vmaddwhw(col & BL_GOB_SUBPACK_HOR_MASK, (int32_t)BL_GOB_SUBPACK_HOR_STRIDE, inGob, 0, 1);

#if ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
    vintx swizzle = (replicatew(1) << (int32_t)BL_SWIZZLE_BIT);
#else
    vintx swizzle = replicatew(0);
#endif
    return ((swizzle + base) + inGob);
}

// Per-lane block-linear offset of the coordinates (xCoords, yCoords) where every lane
// carries its own line pitch (vLinePitch).
//
// The result is the sum of three parts:
//   - the base of the GOB containing the coordinate (block row, block column, GOB within block),
//   - the offset inside that GOB (sector, packet and sub-packet terms),
//   - the swizzle bit, which is only set for QNX/L4T builds with CUPVA_PVA_GEN_NUMBER <= 2.
//
// NOTE(review): the comments below assume vmaddwhw(a, b, c, 0, 1) evaluates a * b + c per lane,
// mirroring the scalar arithmetic in cupvaSurfaceAddress2D - confirm against the intrinsic docs.
//
// xCoords / yCoords : per-lane coordinates
// vLinePitch        : per-lane line pitch; shifted right by BL_GOBW_LOG2 to get the width in GOBs
// blockHeightLog2   : log2 of the block height in GOBs
inline vintx _cupvaBlocklinearOffset2D(vintx const &xCoords, vintx const &yCoords, vintx const &vLinePitch,
                                       uint8_t const blockHeightLog2)
{
    // Block geometry derived from the GOB constants and the requested block height.
    int32_t const log2BlockSize  = (int32_t)BL_GOB_SZ_LOG2 + (int32_t)blockHeightLog2;
    int32_t const log2BlockLines = (int32_t)BL_GOBH_LOG2 + (int32_t)blockHeightLog2;
    uint32_t const blockLines    = (uint32_t)BL_GOBH << (uint32_t)blockHeightLog2;
    vintx const gobsPerLine      = vLinePitch >> (uint32_t)BL_GOBW_LOG2;

    vintx col = xCoords;
    vintx row = yCoords;

    // GOB index inside the block (vertical) scaled by the GOB size, plus the block column
    // scaled by the block size.
    vintx gobInBlock  = ((row & (blockLines - 1U)) >> (int32_t)BL_GOBH_LOG2) << (int32_t)BL_GOB_SZ_LOG2;
    vintx blockColumn = (col >> (int32_t)BL_GOBW_LOG2) << log2BlockSize;
    // Add the block-row term: (row / blockLines) * blockSize * gobsPerLine (per lane).
    vintx base = vmaddwhw((row >> log2BlockLines) << log2BlockSize, gobsPerLine, gobInBlock + blockColumn, 0, 1);

    // Offset inside the GOB, accumulated term by term: sector, packet, then the two sub-packet terms.
    vintx inGob =
        vmaddwhw(row & BL_GOB_SEC_VER_MASK, (int32_t)BL_GOB_SEC_VER_STRIDE, (col & BL_GOB_SEC_HOR_MASK), 0, 1);
    inGob = vmaddwhw(col & BL_GOB_PACK_MASK, (int32_t)BL_GOB_PACK_STRIDE, inGob, 0, 1);
    inGob = vmaddwhw(row & BL_GOB_SUBPACK_VER_MASK, (int32_t)BL_GOB_SUBPACK_VER_STRIDE, inGob, 0, 1);
    inGob = vmaddwhw(col & BL_GOB_SUBPACK_HOR_MASK, (int32_t)BL_GOB_SUBPACK_HOR_STRIDE, inGob, 0, 1);

#if ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
    vintx swizzle = (replicatew(1) << (int32_t)BL_SWIZZLE_BIT);
#else
    vintx swizzle = replicatew(0);
#endif
    return ((swizzle + base) + inGob);
}
// Computes the 64-bit address of the element at (x, y) of a surface.
//
// Both a block-linear (BL) and a pitch-linear (PL) offset are evaluated unconditionally; the BL
// offset is selected when surfData.metadata.format == 1U, the PL offset otherwise. The returned
// value is (pointer.base + pointer.offset) + selected offset.
//
// The BL offset is composed of:
//   - metadata: address bits 13:9 of the surface base plus the BL flag (bit 8), moved up so the
//     flag lands on bit BL_ADDR_METADATA_SHIFT,
//   - the swizzle bit (QNX/L4T builds with CUPVA_PVA_GEN_NUMBER <= 2 only),
//   - the base of the GOB containing (x, y),
//   - the offset of (x, y) inside that GOB.
// The PL offset is y * linePitch + x.
//
// All CUPVA_VPU_ASSERT checks are active only in native builds; elsewhere the macro expands to
// ((void)0) and its argument is never evaluated.
//
// surfData : surface base pointer/offset and layout metadata (linePitch, blockHeightLog2, format)
// x, y     : coordinates of the element
// returns  : address of the element
inline uint64_t cupvaSurfaceAddress2D(VPUSurfaceData const &surfData, uint32_t const x, uint32_t const y)
{
    CUPVA_VPU_ASSERT(surfData.metadata.linePitch >= 0);
    CUPVA_VPU_ASSERT(surfData.metadata.blockHeightLog2 >= 0 &&
                     surfData.metadata.blockHeightLog2 <= BL_MAX_BLOCKHEIGHT_LOG2);
    uint32_t const widthInGobs      = surfData.metadata.linePitch >> BL_GOBW_LOG2;
    uint8_t const blockHeightLog2   = (uint8_t)(surfData.metadata.blockHeightLog2 & 0x7FU);
    uint8_t const blockSizeLog2     = BL_GOB_SZ_LOG2 + blockHeightLog2;
    uint32_t const linesPerBlock    = ((uint32_t)BL_GOBH) << blockHeightLog2;
    uint8_t const linesPerBlockLog2 = BL_GOBH_LOG2 + blockHeightLog2;
    CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - ((((y >> (uint32_t)linesPerBlockLog2) << (uint32_t)blockSizeLog2) * widthInGobs) +
                                       ((x >> (uint32_t)BL_GOBW_LOG2) << (uint32_t)blockSizeLog2))) >=
                     ((y & (linesPerBlock - 1U)) >> BL_GOBH_LOG2));
    // Base of the GOB holding (x, y): block row * blocks per row, plus block column, plus the
    // GOB index inside the block.
    uint64_t const gobBase =
        ((((uint64_t)y >> (uint64_t)linesPerBlockLog2) << (uint64_t)blockSizeLog2) * (uint64_t)widthInGobs) +
        (((uint64_t)x >> (uint64_t)BL_GOBW_LOG2) << (uint64_t)blockSizeLog2) +
        ((((uint64_t)y & ((uint64_t)linesPerBlock - 1UL)) >> (uint64_t)BL_GOBH_LOG2) << (uint64_t)BL_GOB_SZ_LOG2);

    CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFF) - surfData.pointer.base) >= surfData.pointer.offset);
    ExtMemIova va;
    va.val = surfData.pointer.base + surfData.pointer.offset;
    // blBaddr needs bits 13:9, plus set bit 8 to indicate BL mode, to be shifted to bit 40
    uint32_t const blMetadata       = (va.iova.addrLo & BL_SBADR_IOVA_MASK) | BL_METADATA_FLAG;
    uint64_t const blMetadataPacked = (uint64_t)blMetadata << (BL_ADDR_METADATA_SHIFT - BL_METADATA_FLAG_BIT);

    // Offset of (x, y) inside its GOB: packet, sub-packet and sector contributions.
    uint32_t const packOffset    = (x & BL_GOB_PACK_MASK) * BL_GOB_PACK_STRIDE;
    uint32_t const subpackOffset = ((y & BL_GOB_SUBPACK_VER_MASK) * BL_GOB_SUBPACK_VER_STRIDE) +
                                   ((x & BL_GOB_SUBPACK_HOR_MASK) * BL_GOB_SUBPACK_HOR_STRIDE);
    uint32_t const secOffset = ((y & BL_GOB_SEC_VER_MASK) * BL_GOB_SEC_VER_STRIDE) + (x & BL_GOB_SEC_HOR_MASK);
    CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - packOffset) >= subpackOffset);
    CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - packOffset - subpackOffset) >= secOffset);
    uint64_t const gobOffset = (uint64_t)packOffset + (uint64_t)subpackOffset + (uint64_t)secOffset;
    uint64_t const swizzleBit =
        ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
            ? (1ULL << BL_SWIZZLE_BIT)
            : 0ULL;
    uint64_t const BLOffset = blMetadataPacked + swizzleBit + gobBase + gobOffset;
    // A line pitch of 0 is accepted by the first assertion, so it must not be used as a divisor
    // here; y * 0 cannot overflow, hence the check is trivially satisfied in that case.
    CUPVA_VPU_ASSERT((surfData.metadata.linePitch == 0) ||
                     (((0xFFFFFFFFFFFFFFFFUL) / surfData.metadata.linePitch) >= y));
    // Check the same 64-bit product that is computed for PLOffset below, not a narrower one.
    CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFFUL) - ((uint64_t)surfData.metadata.linePitch * (uint64_t)y)) >= x);
    uint64_t const PLOffset = ((uint64_t)y * (uint64_t)surfData.metadata.linePitch) + (uint64_t)x;
    uint64_t const offset   = (surfData.metadata.format == 1U) ? BLOffset : PLOffset;

    return (va.val + offset);
}

// Vector variant of cupvaSurfaceAddress2D: computes one surface address per lane.
//
// coords.hi supplies the x coordinates and coords.lo the y coordinates. Both the block-linear
// (BL) and pitch-linear (PL) offsets are computed for every lane and vmux picks between them
// based on surfData.metadata.format replicated across lanes.
// NOTE(review): presumably vmux returns the BL offsets when the replicated format value is
// non-zero and the PL offsets otherwise - confirm against the intrinsic documentation.
//
// surfData : surface base pointer/offset and layout metadata (linePitch, blockHeightLog2, format)
// coords   : per-lane coordinates (hi = x, lo = y)
// returns  : per-lane address, i.e. replicated (pointer.base + pointer.offset) + selected offset
inline vintx cupvaSurfaceAddress2D(VPUSurfaceData const &surfData, dvintx coords)
{
    CUPVA_VPU_ASSERT(surfData.metadata.linePitch >= 0);
    CUPVA_VPU_ASSERT(surfData.metadata.blockHeightLog2 >= 0 &&
                     surfData.metadata.blockHeightLog2 <= BL_MAX_BLOCKHEIGHT_LOG2);
    CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFF) - surfData.pointer.base) >= surfData.pointer.offset);

    vintx x = coords.hi;
    vintx y = coords.lo;

    ExtMemIova surfaceBase;
    surfaceBase.val = surfData.pointer.base + surfData.pointer.offset;

    // blBaddr needs bits 13:9, plus set bit 8 to indicate BL mode, to be shifted to bit 40
    uint32_t const blFlags = (surfaceBase.iova.addrLo & BL_SBADR_IOVA_MASK) | BL_METADATA_FLAG;
    vintx blFlagsPacked    = replicatew(blFlags) << (int32_t)(BL_ADDR_METADATA_SHIFT - BL_METADATA_FLAG_BIT);

    // Block-linear candidate: layout offset plus the packed metadata bits.
    vintx blOffsets = _cupvaBlocklinearOffset2D(x, y, surfData.metadata.linePitch, surfData.metadata.blockHeightLog2);
    blOffsets       = blOffsets + blFlagsPacked;

    // Pitch-linear candidate, built from y, the replicated line pitch and x.
    vintx plOffsets = vmaddwhw(y, replicatew(surfData.metadata.linePitch), x, 0, 1);

    vintx chosenOffsets = vmux(replicatew((int32_t)surfData.metadata.format), blOffsets, plOffsets);

    // Rebuild the 64-bit base address in every lane from its low and high 32-bit halves.
    vintx baseLanes = replicatew(surfaceBase.iova.addrLo) | (replicatew((int32_t)surfaceBase.iova.addrHi) << 32);

    return (baseLanes + chosenOffsets);
}

// Declaration only; the definition is not part of this header.
// NOTE(review): judging by the name, presumably turns error reporting for floating-point NaN
// results on or off according to `enable` - confirm against the implementation.
void cupvaFloatingPointNANErrorEnabled(bool enable);

// Declaration only; the definition is not part of this header.
// NOTE(review): presumably requests an instruction-cache prefetch starting at `addr_in_words`
// (an address expressed in words, per the parameter name) covering `size` units - confirm the
// units of `size` against the implementation.
void cupvaICachePrefetch(uintptr_t addr_in_words, uint32_t size);

#endif // CUPVA_DEVICE_CORE_H