device_core.h

Fully qualified name: src/device/vpu_runtime/include/cupva_device/device_core.h

File members: src/device/vpu_runtime/include/cupva_device/device_core.h

/*
 * Copyright (c) 2020-2023 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */
 #ifndef CUPVA_DEVICE_CORE_H
 #define CUPVA_DEVICE_CORE_H

 #include "impl/dma_common.h"

 #include <cupva_types.h>
 #include <limits.h>
 #include <stddef.h>
 #include <stdint.h>

 // Build-mode dependent definitions:
 //  - non-native builds: AgenCFG is an alias of dvuint and CUPVA_VPU_ASSERT(x) expands to a no-op
 //    that does not evaluate x;
 //  - native builds: CUPVA_VPU_ASSERT(x) forwards to the standard assert(). AgenCFG is not defined in
 //    this branch (presumably provided by another header -- TODO confirm).
 #if CUPVA_BUILD_MODE != CUPVA_NATIVE
 typedef dvuint AgenCFG;
 #    define CUPVA_VPU_ASSERT(x) ((void)0)
 #else
 #    include <assert.h>
 #    define CUPVA_VPU_ASSERT(x) assert(x)
 #endif

 // Evaluates x exactly once and explicitly discards its value.
 #define IGNORE_RETURN(x) ((void)(x))
 // Reinterprets the pointer expression ptr_ as a pointer to type_, going through void *.
 // The pointer argument is parenthesised so that compound expressions such as `p + 1` are cast as a
 // whole instead of having the cast bind to their first operand only.
 #define POINTER_CAST(type_, ptr_) ((type_ *)((void *)(ptr_)))
 // Same as POINTER_CAST, but yields a pointer to const type_ and goes through void const *.
 #define CONST_POINTER_CAST(type_, ptr_) ((type_ const *)((void const *)(ptr_)))

 // Expands to the signature of the VPU entry point: an int32_t function named by CUPVA_VPU_MAIN_SYMBOL.
 #define CUPVA_VPU_MAIN() int32_t CUPVA_VPU_MAIN_SYMBOL()

 // Element count of a buffer of (tw + 2 * haloX) by (th + 2 * haloY) elements.
 #define CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, haloY) (((tw) + 2 * (haloX)) * ((th) + 2 * (haloY)))

 // Per-arity implementations used by the length dispatchers. Each accepts (and ignores) trailing
 // arguments so that the same argument list can be forwarded to whichever one is selected.
 // No halo:
 #define CUPVA_BUFFER_LENGTH_IMPL_VAR(tw, th, ...) CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, 0, 0)
 // Halo in X only:
 #define CUPVA_BUFFER_LENGTH_IMPL_HALO_1D(tw, th, haloX, ...) CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, 0)
 // Halo in X and Y:
 #define CUPVA_BUFFER_LENGTH_IMPL_HALO_2D(tw, th, haloX, haloY, ...) \
     CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, haloY)
 // Halo in X and Y with the line pitch (in bytes) passed through TRANSPOSE_LINE_PITCH_ALIGNED for the
 // given mode; the byte total is divided by sizeof(type) to give a length in elements.
 #define CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D(tw, th, haloX, haloY, type, mode)                                \
     (TRANSPOSE_LINE_PITCH_ALIGNED(((tw) + 2 * (haloX)) * sizeof(type), mode, sizeof(type)) * ((th) + 2 * (haloY)) / \
      sizeof(type))
 // Same computation as above, with the last two parameters in (mode, type) order.
 #define CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D_V2(tw, th, haloX, haloY, mode, type) \
     CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D(tw, th, haloX, haloY, type, mode)

 // Argument-count dispatcher: expands to its 7th argument. Callers append a list of implementation
 // macros after their own arguments so that the number of optional arguments decides which macro
 // lands in the 7th position.
 #define CUPVA_BUFFER_LENGTH_MACRO(tw, th, haloX, haloY, type, mode, macro, ...) macro

 // Circular buffer length, in elements of `type`, for the variant used by RDF_CIRCULAR_ROWMAJOR.
 // The line pitch in bytes is built from tw * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 +
 // haloX * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1, passed through CIRCULAR_BUFFER_LINE_PITCH_ALIGNED and
 // TRANSPOSE_LINE_PITCH_ALIGNED, multiplied by (th + 2 * haloY) lines, passed through
 // CIRCULAR_BUFFER_ALIGNED_SIZE, increased by CIRCULAR_BUFFER_EXTRA_BYTES and divided by sizeof(type).
 // Trailing arguments after `mode` are ignored.
 #define CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_RM(type, tw, th, haloX, haloY, mode, ...)                                  \
     ((CIRCULAR_BUFFER_ALIGNED_SIZE(TRANSPOSE_LINE_PITCH_ALIGNED(CIRCULAR_BUFFER_LINE_PITCH_ALIGNED(                  \
                                                                     ((tw) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 +     \
                                                                      (haloX) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1) * \
                                                                     sizeof(type)),                                   \
                                                                 mode, sizeof(type)) *                                \
                                    ((th) + 2 * (haloY))) +                                                           \
       CIRCULAR_BUFFER_EXTRA_BYTES) /                                                                                 \
      sizeof(type))

 // Circular buffer length, in elements of `type`, for the variant used by RDF_CIRCULAR_COLUMNMAJOR.
 // Here the line pitch is built from (tw + 2 * haloX) and the FACTOR0/FACTOR1 scaling is applied to the
 // line count (th, haloY) instead. Trailing arguments after `mode` are ignored.
 #define CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_CM(type, tw, th, haloX, haloY, mode, ...)                               \
     ((CIRCULAR_BUFFER_ALIGNED_SIZE(                                                                               \
           TRANSPOSE_LINE_PITCH_ALIGNED(CIRCULAR_BUFFER_LINE_PITCH_ALIGNED((((tw) + 2 * (haloX)) * sizeof(type))), \
                                        mode, sizeof(type)) *                                                      \
           ((th) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 + (haloY) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1)) +           \
       CIRCULAR_BUFFER_EXTRA_BYTES) /                                                                              \
      sizeof(type))

 // Yields the larger of the two operands; when they compare equal the second one is returned.
 // The selected operand is evaluated twice, so arguments should be free of side effects.
 #define CUPVA_MAX(lhs, rhs) (((lhs) > (rhs)) ? (lhs) : (rhs))

 // Length of a single buffer. The number of optional arguments selects the implementation through
 // CUPVA_BUFFER_LENGTH_MACRO:
 //   (tw, th)                            -> CUPVA_BUFFER_LENGTH_IMPL_VAR
 //   (tw, th, haloX)                     -> CUPVA_BUFFER_LENGTH_IMPL_HALO_1D
 //   (tw, th, haloX, haloY)              -> CUPVA_BUFFER_LENGTH_IMPL_HALO_2D
 //   (tw, th, haloX, haloY, type)        -> CUPVA_BUFFER_LENGTH_IMPL_HALO_2D (type is ignored)
 //   (tw, th, haloX, haloY, type, mode)  -> CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D
 #define CUPVA_SINGLE_BUFFER_LENGTH(tw, th, ...)                                                   \
     CUPVA_BUFFER_LENGTH_MACRO(tw, th, ##__VA_ARGS__, CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D,  \
                               CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, \
                               CUPVA_BUFFER_LENGTH_IMPL_HALO_1D, CUPVA_BUFFER_LENGTH_IMPL_VAR)     \
     (tw, th, ##__VA_ARGS__)

 // Twice the single-buffer length, passed through CIRCULAR_BUFFER_ALIGNED_SIZE.
 // NOTE(review): unlike RDF_DOUBLE, the value handed to CIRCULAR_BUFFER_ALIGNED_SIZE here is an element
 // count rather than a byte count -- confirm this is intended.
 #define CUPVA_DOUBLE_BUFFER_LENGTH(tw, th, ...) \
     (CIRCULAR_BUFFER_ALIGNED_SIZE(CUPVA_SINGLE_BUFFER_LENGTH(tw, th, ##__VA_ARGS__) * 2))

 // Alias of RDF_CIRCULAR_ROWMAJOR.
 #define CUPVA_CIRCULAR_BUFFER_LENGTH(type, tw, th, haloX, haloY, ...) \
     RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__)

 // Length, in elements, of a single buffer; `type` comes first and is appended after the optional
 // arguments before dispatching through CUPVA_BUFFER_LENGTH_MACRO:
 //   (type, tw, th)                      -> CUPVA_BUFFER_LENGTH_IMPL_VAR
 //   (type, tw, th, haloX)               -> CUPVA_BUFFER_LENGTH_IMPL_HALO_1D
 //   (type, tw, th, haloX, haloY)        -> CUPVA_BUFFER_LENGTH_IMPL_HALO_2D
 //   (type, tw, th, haloX, haloY, mode)  -> CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D_V2
 #define RDF_SINGLE(type, tw, th, ...)                                                                     \
     CUPVA_BUFFER_LENGTH_MACRO(tw, th, ##__VA_ARGS__, type, CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D_V2, \
                               CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, CUPVA_BUFFER_LENGTH_IMPL_HALO_1D,         \
                               CUPVA_BUFFER_LENGTH_IMPL_VAR)                                               \
     (tw, th, ##__VA_ARGS__, type)

 // Length, in elements, of a double buffer: twice RDF_SINGLE converted to bytes, passed through
 // CIRCULAR_BUFFER_ALIGNED_SIZE, then converted back to elements.
 #define RDF_DOUBLE(type, tw, th, ...) \
     (CIRCULAR_BUFFER_ALIGNED_SIZE((RDF_SINGLE(type, tw, th, ##__VA_ARGS__)) * 2 * sizeof(type)) / sizeof(type))

 // Circular buffer length using the _RM implementation. TRANS_MODE_NONE is appended so that it fills
 // the `mode` parameter when the caller does not pass one; if a mode is passed, the appended value
 // falls into the ignored trailing arguments.
 #define RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ...) \
     CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_RM(type, tw, th, haloX, haloY, ##__VA_ARGS__, TRANS_MODE_NONE)

 // Circular buffer length using the _CM implementation; same TRANS_MODE_NONE defaulting as above.
 #define RDF_CIRCULAR_COLUMNMAJOR(type, tw, th, haloX, haloY, ...) \
     CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_CM(type, tw, th, haloX, haloY, ##__VA_ARGS__, TRANS_MODE_NONE)

 // Larger of the row-major and column-major circular buffer lengths.
 #define RDF_CIRCULAR(type, tw, th, haloX, haloY, ...)                           \
     CUPVA_MAX(RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__), \
               RDF_CIRCULAR_COLUMNMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__))

 // Forwards to VMEM_NOEXPORT_IMPL (defined elsewhere); no CUPVA_EXPORT is emitted by this macro.
 #define VMEM_NOEXPORT(bank, type, name, ...) VMEM_NOEXPORT_IMPL(bank, type, name, ##__VA_ARGS__)

 // Declares `name` through one of VMEM_2D / VMEM_1D / VMEM_VAR (chosen by GET_VMEM_MACRO, defined
 // elsewhere) and then exports it under the same name with the default export type.
 #define VMEM_DATA_IMPL(bank, type, name, ...)                                        \
     GET_VMEM_MACRO(bank, type, name, ##__VA_ARGS__, VMEM_2D, VMEM_1D, VMEM_VAR, ...) \
     (bank, type, name, ##__VA_ARGS__);                                               \
     CUPVA_EXPORT(name, name)

 // Adapter for the three-optional-argument form of VMEM: drops `type` and `isDDF` and forwards the
 // rest to VMEM_DDF_HANDLER (defined elsewhere).
 #define VMEM_DDF_HANDLER_WRAPPER(bank, type, name, nodesPerLane, lanes, isDDF) \
     VMEM_DDF_HANDLER(bank, name, nodesPerLane, lanes)

 // Argument-count dispatcher: expands to its 7th argument.
 #define VMEM_DDF_SELECTOR(bank, type, name, h, w, isDDF, macro, ...) macro

 // Applies macro NAME to the parenthesised argument list ARGS after both have been expanded.
 #define VMEM_INVOKE(NAME, ARGS) NAME ARGS

 // Declares a VMEM variable. With exactly three optional arguments the declaration is routed to
 // VMEM_DDF_HANDLER_WRAPPER; with zero, one or two optional arguments it is routed to VMEM_DATA_IMPL.
 #define VMEM(bank, type, name, ...)                                                                        \
     VMEM_INVOKE(VMEM_INVOKE(VMEM_DDF_SELECTOR, (bank, type, name, ##__VA_ARGS__, VMEM_DDF_HANDLER_WRAPPER, \
                                                 VMEM_DATA_IMPL, VMEM_DATA_IMPL, VMEM_DATA_IMPL)),          \
                 (bank, type, name, ##__VA_ARGS__))

 // Forwards to EXTERN_VMEM_IMPL (defined elsewhere).
 #define EXTERN_VMEM(bank, type, name, ...) EXTERN_VMEM_IMPL(bank, type, name, ##__VA_ARGS__)

 // Declares an ExtMemPointer variable in `bank` and exports it with type VMEM_TYPE_POINTER_EX.
 #define VMEM_POINTER(bank, name)         \
     VMEM_VAR(bank, ExtMemPointer, name); \
     CUPVA_EXPORT(name, name, VMEM_TYPE_POINTER_EX)

 // Extern counterpart for an ExtMemPointer variable; forwards to EXTERN_VMEM_IMPL.
 #define EXTERN_VMEM_POINTER(bank, name) EXTERN_VMEM_IMPL(bank, ExtMemPointer, name)

 // Declares a VPUSurfaceData variable in `bank` and exports two of its members separately:
 // name.pointer as _S<name> (type VMEM_TYPE_POINTER_EX) and name.metadata as _M<name> (default type).
 #define VMEM_SURFACE(bank, name)                                \
     VMEM_VAR(bank, VPUSurfaceData, name);                       \
     CUPVA_EXPORT(name.pointer, _S##name, VMEM_TYPE_POINTER_EX); \
     CUPVA_EXPORT(name.metadata, _M##name)

 // Extern counterpart for a VPUSurfaceData variable; forwards to EXTERN_VMEM_IMPL.
 #define EXTERN_VMEM_SURFACE(bank, name) EXTERN_VMEM_IMPL(bank, VPUSurfaceData, name)

 // Declares a UnifiedRDFHandler variable in `bank` and exports only its `hdl` member under `name`.
 #define VMEM_RDF_UNIFIED(bank, name)         \
     VMEM_VAR(bank, UnifiedRDFHandler, name); \
     CUPVA_EXPORT(name.hdl, name)

 // Like VMEM_DATA_IMPL, but the variable is exported with type VMEM_TYPE_VPUC_TABLE.
 #define VMEM_DMA_CONFIG(bank, type, name, ...)                                       \
     GET_VMEM_MACRO(bank, type, name, ##__VA_ARGS__, VMEM_2D, VMEM_1D, VMEM_VAR, ...) \
     (bank, type, name, ##__VA_ARGS__);                                               \
     CUPVA_EXPORT(name, name, VMEM_TYPE_VPUC_TABLE)

 // Extern counterpart of VMEM_DMA_CONFIG; forwards to EXTERN_VMEM_IMPL.
 #define EXTERN_VMEM_DMA_CONFIG(bank, type, name, ...) EXTERN_VMEM_IMPL(bank, type, name, ##__VA_ARGS__)

 // Forwards to CUPVA_ALIGNED_IMPL (defined elsewhere).
 #define CUPVA_ALIGNED(type, name, alignment) CUPVA_ALIGNED_IMPL(type, name, alignment)

 // Exports `variable` under the symbol <name>_export together with sizeof(variable). Any optional
 // argument is passed ahead of a trailing VMEM_TYPE_DATA, so VMEM_TYPE_DATA acts as the fallback when
 // no explicit type is given (presumably CUPVA_EXPORT_IMPL uses the first type it receives -- confirm).
 #define CUPVA_EXPORT(variable, name, ...) \
     CUPVA_EXPORT_IMPL(variable, name##_export, sizeof(variable), ##__VA_ARGS__, VMEM_TYPE_DATA)

 // Declarations only; the definitions are not part of this section of the file.

 // Called in this header with a trigger value of the form (1 << gpio), or 0 for "no trigger".
 inline void cupvaDataFlowTrig(uint32_t handler);

 // Called in this header with the same kind of value as cupvaDataFlowTrig.
 inline void cupvaDataFlowSync(uint32_t handler);

 // Presumably returns the ID of the executing VPU (inferred from the name -- confirm).
 inline uint32_t cupvaGetVpuId(void);

 // Presumably returns the VMEM base address (inferred from the name -- confirm).
 inline uint32_t cupvaGetVmemBase(void);

 // Base address used by CUPVA_GET_DESC_ATTR_ADDR to locate PvaDmaDescriptor entries.
 inline uint32_t cupvaGetDmaDescBase(void);

 // Presumably returns the hardware sequencer RAM base address (inferred from the name -- confirm).
 inline uint32_t cupvaGetHwseqRamBase(void);

 // Presumably returns the address of the register at reg_ofst for DMA channel channel_idx
 // (inferred from the names -- confirm).
 inline uint32_t cupvaGetChannelRegAddr(int8_t channel_idx, uint16_t reg_ofst);

 // Value stored into ExtMemPointer::base by cupvaGetL2ExtMemPointer.
 inline uint32_t cupvaGetL2Base(void);

 // Value stored into ExtMemPointer::size by cupvaGetL2ExtMemPointer.
 inline uint32_t cupvaGetL2Size(void);

 // Fills *ptr so that it refers to L2 memory at the given offset.
 //
 // base and size are taken from cupvaGetL2Base() and cupvaGetL2Size(); afterwards the byte at index 7
 // of the base field is overwritten with CUPVA_EXT_MEM_TYPE_L2.
 //
 // ptr    - descriptor to fill; must be a valid pointer.
 // offset - stored unchanged in ptr->offset.
 inline void cupvaGetL2ExtMemPointer(ExtMemPointer *ptr, uint32_t offset)
 {
     ptr->base   = cupvaGetL2Base();
     ptr->size   = cupvaGetL2Size();
     ptr->offset = offset;

     // Tag the descriptor last: writing it before base is assigned would lose the tag.
     uint8_t *const baseBytes = POINTER_CAST(uint8_t, &ptr->base);
     baseBytes[7]             = CUPVA_EXT_MEM_TYPE_L2;
 }

 // Declaration only; defined elsewhere. Presumably converts a pointer into VMEM to its 32-bit VMEM
 // address (inferred from the name -- confirm).
 inline uint32_t cupvaGetVmemAddress(void *ptr);

 // Description of an address generator with up to six loop levels, consumed by the INIT_AGEN1..6
 // macros in this header.
 typedef struct
 {
     // Multiplier applied to every modifier computed by INIT_AGENk
     // (presumably the element size in bytes -- confirm).
     int32_t size;

     // Per-level values copied unchanged into the n1..n6 fields of the initialised agen
     // (presumably iteration counts, since INIT_AGENk uses (n - 1) * s -- confirm).
     uint16_t n1;
     uint16_t n2;
     uint16_t n3;
     uint16_t n4;
     uint16_t n5;
     uint16_t n6;

     // Per-level strides, in units of `size`. INIT_AGENk turns level k into
     // modk = (sk - sum over j < k of (nj - 1) * sj) * size.
     int32_t s1;
     int32_t s2;
     int32_t s3;
     int32_t s4;
     int32_t s5;
     int32_t s6;
 } AgenWrapper;

 // Initialises level 1 of AGEN_INIT from AGEN_WRAPPER (an AgenWrapper-like object):
 //   n1   = wrapper.n1
 //   mod1 = wrapper.s1 * wrapper.size
 // Both arguments are parenthesised so that expressions such as `*ptr` can be passed; each argument
 // is evaluated more than once, so it must be free of side effects.
 #define INIT_AGEN1(AGEN_INIT, AGEN_WRAPPER) \
     do \
     { \
         (AGEN_INIT).n1   = (AGEN_WRAPPER).n1; \
         (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
     } while (false)

 // Initialises levels 1-2 of AGEN_INIT from AGEN_WRAPPER (an AgenWrapper-like object):
 //   nk   = wrapper.nk
 //   mod1 = s1 * size
 //   mod2 = (s2 - (n1 - 1) * s1) * size
 // Both arguments are parenthesised so that expressions such as `*ptr` can be passed; each argument
 // is evaluated more than once, so it must be free of side effects.
 #define INIT_AGEN2(AGEN_INIT, AGEN_WRAPPER) \
     do \
     { \
         (AGEN_INIT).n1   = (AGEN_WRAPPER).n1; \
         (AGEN_INIT).n2   = (AGEN_WRAPPER).n2; \
         (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod2 = ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * \
                            (AGEN_WRAPPER).size; \
     } while (false)

 // Initialises levels 1-3 of AGEN_INIT from AGEN_WRAPPER (an AgenWrapper-like object):
 //   nk   = wrapper.nk
 //   mod1 = s1 * size
 //   modk = (sk - sum over j < k of (nj - 1) * sj) * size   for k = 2..3
 // Both arguments are parenthesised so that expressions such as `*ptr` can be passed; each argument
 // is evaluated more than once, so it must be free of side effects.
 #define INIT_AGEN3(AGEN_INIT, AGEN_WRAPPER) \
     do \
     { \
         (AGEN_INIT).n1   = (AGEN_WRAPPER).n1; \
         (AGEN_INIT).n2   = (AGEN_WRAPPER).n2; \
         (AGEN_INIT).n3   = (AGEN_WRAPPER).n3; \
         (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod2 = ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod3 = ((AGEN_WRAPPER).s3 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2)) * \
                            (AGEN_WRAPPER).size; \
     } while (false)

 // Initialises levels 1-4 of AGEN_INIT from AGEN_WRAPPER (an AgenWrapper-like object):
 //   nk   = wrapper.nk
 //   mod1 = s1 * size
 //   modk = (sk - sum over j < k of (nj - 1) * sj) * size   for k = 2..4
 // Both arguments are parenthesised so that expressions such as `*ptr` can be passed; each argument
 // is evaluated more than once, so it must be free of side effects.
 #define INIT_AGEN4(AGEN_INIT, AGEN_WRAPPER) \
     do \
     { \
         (AGEN_INIT).n1   = (AGEN_WRAPPER).n1; \
         (AGEN_INIT).n2   = (AGEN_WRAPPER).n2; \
         (AGEN_INIT).n3   = (AGEN_WRAPPER).n3; \
         (AGEN_INIT).n4   = (AGEN_WRAPPER).n4; \
         (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod2 = ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod3 = ((AGEN_WRAPPER).s3 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod4 = ((AGEN_WRAPPER).s4 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
                             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3)) * \
                            (AGEN_WRAPPER).size; \
     } while (false)

 // Initialises levels 1-5 of AGEN_INIT from AGEN_WRAPPER (an AgenWrapper-like object):
 //   nk   = wrapper.nk
 //   mod1 = s1 * size
 //   modk = (sk - sum over j < k of (nj - 1) * sj) * size   for k = 2..5
 // Both arguments are parenthesised so that expressions such as `*ptr` can be passed; each argument
 // is evaluated more than once, so it must be free of side effects.
 #define INIT_AGEN5(AGEN_INIT, AGEN_WRAPPER) \
     do \
     { \
         (AGEN_INIT).n1   = (AGEN_WRAPPER).n1; \
         (AGEN_INIT).n2   = (AGEN_WRAPPER).n2; \
         (AGEN_INIT).n3   = (AGEN_WRAPPER).n3; \
         (AGEN_INIT).n4   = (AGEN_WRAPPER).n4; \
         (AGEN_INIT).n5   = (AGEN_WRAPPER).n5; \
         (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod2 = ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod3 = ((AGEN_WRAPPER).s3 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod4 = ((AGEN_WRAPPER).s4 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
                             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod5 = ((AGEN_WRAPPER).s5 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
                             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3) - \
                             (((int32_t)(AGEN_WRAPPER).n4 - 1) * (AGEN_WRAPPER).s4)) * \
                            (AGEN_WRAPPER).size; \
     } while (false)

 // Initialises levels 1-6 of AGEN_INIT from AGEN_WRAPPER (an AgenWrapper-like object):
 //   nk   = wrapper.nk
 //   mod1 = s1 * size
 //   modk = (sk - sum over j < k of (nj - 1) * sj) * size   for k = 2..6
 // Both arguments are parenthesised so that expressions such as `*ptr` can be passed; each argument
 // is evaluated more than once, so it must be free of side effects.
 #define INIT_AGEN6(AGEN_INIT, AGEN_WRAPPER) \
     do \
     { \
         (AGEN_INIT).n1   = (AGEN_WRAPPER).n1; \
         (AGEN_INIT).n2   = (AGEN_WRAPPER).n2; \
         (AGEN_INIT).n3   = (AGEN_WRAPPER).n3; \
         (AGEN_INIT).n4   = (AGEN_WRAPPER).n4; \
         (AGEN_INIT).n5   = (AGEN_WRAPPER).n5; \
         (AGEN_INIT).n6   = (AGEN_WRAPPER).n6; \
         (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod2 = ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod3 = ((AGEN_WRAPPER).s3 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod4 = ((AGEN_WRAPPER).s4 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
                             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod5 = ((AGEN_WRAPPER).s5 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
                             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3) - \
                             (((int32_t)(AGEN_WRAPPER).n4 - 1) * (AGEN_WRAPPER).s4)) * \
                            (AGEN_WRAPPER).size; \
         (AGEN_INIT).mod6 = ((AGEN_WRAPPER).s6 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
                             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
                             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3) - \
                             (((int32_t)(AGEN_WRAPPER).n4 - 1) * (AGEN_WRAPPER).s4) - \
                             (((int32_t)(AGEN_WRAPPER).n5 - 1) * (AGEN_WRAPPER).s5)) * \
                            (AGEN_WRAPPER).size; \
     } while (false)

 // Declarations only; the definitions are not part of this section of the file.

 // Presumably replaces the base address stored in *cfg with addr (inferred from the name -- confirm).
 inline void cupvaModifyAgenCfgBase(AgenCFG *cfg, void *const addr);

 // Presumably returns the base address stored in *cfg (inferred from the name -- confirm).
 inline void *cupvaGetAgenCfgBase(AgenCFG *cfg);

 // Presumably blocks until a prefetch can be issued (inferred from the name -- confirm).
 inline void cupvaPrefetchReadyWait(void);

 // Presumably blocks until an issued prefetch has completed (inferred from the name -- confirm).
 inline void cupvaPrefetchDoneWait(void);

 // Advances the address generator configuration stored in *cfg by step_in_bytes.
 //
 // The configuration is expanded into an agen with init_agen_from_cfg, its base is moved with
 // adv_agen_base, and the result is written back to *cfg through extract_agen_cfg.
 inline void cupvaAdvanceAgenCfg(AgenCFG *cfg, const int32_t step_in_bytes)
 {
     agen working = init_agen_from_cfg(*cfg);
     adv_agen_base(working, step_in_bytes);
     *cfg = extract_agen_cfg(working);
 }

 // Address of member `field` of the desc_id-th PvaDmaDescriptor, counted from cupvaGetDmaDescBase():
 //   base + sizeof(PvaDmaDescriptor) * desc_id + offsetof(PvaDmaDescriptor, field)
 // All terms are computed as uint32_t.
 #define CUPVA_GET_DESC_ATTR_ADDR(desc_id, field)                                          \
     (cupvaGetDmaDescBase() + ((uint32_t)sizeof(PvaDmaDescriptor) * (uint32_t)(desc_id)) + \
      (uint32_t)offsetof(PvaDmaDescriptor, field))

 // Masks `value` with the field mask of CUPVA_ICACHE_<r>_0_<f>_RANGE and shifts it into the position
 // given by the same range (CUPVA_FIELD_MASK / CUPVA_FIELD_SHIFT are defined elsewhere).
 #define CUPVA_ICACHE_FIELD(r, f, value)                             \
     (((value) & CUPVA_FIELD_MASK(CUPVA_ICACHE_##r##_0_##f##_RANGE)) \
      << CUPVA_FIELD_SHIFT(CUPVA_ICACHE_##r##_0_##f##_RANGE))

 inline void cupvaCircularBufferMemcpy(void *cb, uint32_t size)
 {
     dvint *src = (dvint *)cb;
     dvint *dst = (dvint *)((void *)&((uint8_t *)cb)[size]);
     (void)(*dst = extract(sign_extend(*src)));
 }

 // Executes one step of the handler's trigger program and returns the trigger value for that step.
 //
 // The instruction at handler.pc is read; its repeat counter (handler.trigCnt) is advanced with
 // mod_inc against instruction.trigRpt. When the repeat counter comes back to 0 the instruction's
 // jmpCnt is decremented (clamped at 0) and the PC either jumps by jmpOffst (if jmpCnt changed) or
 // advances by 1. The new PC is clamped to RDF_INST_COUNT - 1. The instruction's gpio is recorded in
 // handler.gpioHistory.
 //
 // Returns (1 << instruction.gpio), or 0 when instruction.gpio is 0.
 inline uint32_t _cupvaRasterDataFlowAdvProg(RasterDataFlowHandler &handler)
 {
     uint8_t const pc                    = handler.pc;
     RasterDataFlowTrigInst &instruction = handler.trigProgram[pc];
     uint8_t const newTrigCnt            = (uint8_t)mod_inc((int32_t)handler.trigCnt, (int32_t)instruction.trigRpt);
     uint8_t const jumpDec               = (uint8_t)max((int32_t)instruction.jmpCnt, 1) - 1U; // Subtract 1, clamp to 0
     // If we are not repeating this trigger, update jmpCnt
     uint8_t const newJmpCnt = (newTrigCnt != 0U) ? instruction.jmpCnt : jumpDec;
     // If the updated jmpCnt is different to current jmpCnt, it means we should take the jump
     uint32_t const applyJmp = (newJmpCnt != instruction.jmpCnt) ? 1U : 0U;
     // Decide whether to 1) apply jump 2) advance PC or 3) stay with current PC
     uint32_t const jumpOrAdv = (applyJmp != 0U) ? (uint32_t)instruction.jmpOffst : 1U;
     uint32_t const pcOffset  = (newTrigCnt != 0U) ? 0U : jumpOrAdv;
     // GPIO[0] are not used for triggers so are used for 'no-trigger'
     uint32_t const setTrigger = ((instruction.gpio > 0U) ? 1U : 0U);
     uint32_t const trigger    = setTrigger << instruction.gpio;
     instruction.jmpCnt        = newJmpCnt;
     uint32_t const newPc      = pc + pcOffset;
     // Write updates to handler
     handler.pc          = (uint8_t)min(newPc, RDF_INST_COUNT - 1U); // pc - clamp to valid bounds
     handler.trigCnt     = newTrigCnt;                               // trigCnt
     handler.gpioHistory = instruction.gpio;
     return trigger;
 }

 // Advances the handler's trigger program by one step and issues the resulting trigger value.
 inline void cupvaRasterDataFlowTrig(RasterDataFlowHandler &handler)
 {
     uint32_t const nextTrigger = _cupvaRasterDataFlowAdvProg(handler);
     cupvaDataFlowTrig(nextTrigger);
 }

 // Syncs on the GPIO recorded by the most recent trigger step and clears that record.
 //
 // The value passed to cupvaDataFlowSync is (1 << gpioHistory), or 0 when gpioHistory is 0.
 inline void cupvaRasterDataFlowSync(RasterDataFlowHandler &handler)
 {
     uint32_t syncValue = 0U;
     if (handler.gpioHistory > 0U)
     {
         syncValue = 1U << handler.gpioHistory;
     }
     // Clear the record before syncing, as the original sequence does.
     handler.gpioHistory = 0;
     cupvaDataFlowSync(syncValue);
 }

 // Returns handler.linePitch converted to int32_t.
 inline int32_t cupvaRasterDataFlowGetLinePitch(RasterDataFlowHandler &handler)
 {
     int32_t const pitch = (int32_t)handler.linePitch;
     return pitch;
 }

 // Returns the RDF_TRANSFER_PROPERTIES_LAYOUT field of handler.transferProperties.
 inline uint8_t cupvaRasterDataFlowGetLayout(RasterDataFlowHandler &handler)
 {
     uint8_t const layout = CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_LAYOUT, handler.transferProperties);
     return layout;
 }

 // Returns true when the RDF_TRANSFER_PROPERTIES_IS_READ field of handler.transferProperties is
 // greater than zero.
 inline bool cupvaRasterDataFlowIsRead(RasterDataFlowHandler &handler)
 {
     bool const isRead = (CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_IS_READ, handler.transferProperties) > 0U);
     return isRead;
 }

 // Returns the RDF_TRANSFER_PROPERTIES_BPPLOG2 field of handler.transferProperties
 // (the properties word is widened to uint32_t before extraction).
 inline uint32_t cupvaRasterDataFlowGetBppLog2(RasterDataFlowHandler &handler)
 {
     uint32_t const bppLog2 =
         CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_BPPLOG2, (uint32_t)handler.transferProperties);
     return bppLog2;
 }

 // Returns handler.cbLen.
 inline int32_t cupvaRasterDataFlowGetCbLen(RasterDataFlowHandler &handler)
 {
     int32_t const length = handler.cbLen;
     return length;
 }

 // Returns the advancement for the current step and moves the handler's step index forward.
 //
 // While handler.ixAdv differs from handler.nxm1 the result is handler.adv[0] and ixAdv is
 // incremented; when they are equal the result is handler.adv[1] and ixAdv is reset to 0.
 inline int32_t cupvaRasterDataFlowGetAdvance(RasterDataFlowHandler &handler)
 {
     uint16_t const current = handler.ixAdv;
     uint16_t const last    = handler.nxm1;
     int32_t step;

     if (current == last)
     {
         handler.ixAdv = (uint16_t)0U;
         step          = handler.adv[1];
     }
     else
     {
         handler.ixAdv = (uint16_t)(current + 1U);
         step          = handler.adv[0];
     }

     return step;
 }

 // Adds the next advancement (see cupvaRasterDataFlowGetAdvance) to `offset` and wraps the result
 // once into the range given by handler.cbLen: cbLen is subtracted when the sum reaches or exceeds
 // it, and added when the value is negative.
 inline int32_t cupvaRasterDataFlowGetOffset(RasterDataFlowHandler &handler, int32_t offset)
 {
     int32_t wrapped = cupvaRasterDataFlowGetAdvance(handler) + offset;

     if (wrapped >= handler.cbLen)
     {
         wrapped = wrapped - handler.cbLen;
     }
     if (wrapped < 0)
     {
         wrapped = wrapped + handler.cbLen;
     }

     return wrapped;
 }

 // Returns (linePitch * bpp - TRANSPOSE_HORIZONTAL_OFFSET(mode, bpp)) / VMEM_SUPERBANK_WIDTH_IN_BYTES,
 // computed in unsigned 32-bit arithmetic and converted to int32_t.
 inline int32_t cupvaTransposeLaneOfst(RasterDataFlowHandler &handler, TranspositionMode mode, int32_t bpp)
 {
     uint32_t const pitchBytes  = (uint32_t)(handler.linePitch * bpp);
     uint8_t const horizontalOf = TRANSPOSE_HORIZONTAL_OFFSET((uint8_t)mode, (uint8_t)bpp);
     uint32_t const lanes       = (pitchBytes - (uint32_t)horizontalOf) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
     return (int32_t)lanes;
 }

 // Declaration only; defined elsewhere (not inline). Presumably prepares `handler` for use with the
 // VMEM buffer `vmemBuffer` before the first Acquire (inferred from the names -- confirm).
 void cupvaRasterDataFlowOpen(UnifiedRDFHandler &handler, void *vmemBuffer);

 // Acquires the current tile of a unified raster data flow and returns a pointer to it.
 //
 // In order, the function:
 //   - syncs on handler.nextSync unless handler.deferSync is set;
 //   - computes the returned pointer as handler.base + (handler.tileOffset << bppLog2);
 //   - issues handler.nextTrig unless handler.deferTrig is set;
 //   - advances handler.hdl.ixAdv and handler.tileOffset (wrapping once at hdl.cbLen);
 //   - copies one dvint from handler.base to handler.cbDst.
 // When a defer flag is set, the corresponding sync/trigger is issued by cupvaRasterDataFlowRelease.
 inline void *cupvaRasterDataFlowAcquire(UnifiedRDFHandler &handler)
 {
     // Trigger/sync values
     uint32_t const shouldDeferSync = handler.deferSync;
     uint32_t const shouldDeferTrig = handler.deferTrig;
     uint32_t const sync            = handler.nextSync;
     uint32_t const trig            = handler.nextTrig;

     // Pointers for copying CB and returning
     void *const base  = handler.base;
     void *const cbDst = handler.cbDst;
     dvintx cbCopyData;
     uint32_t const bppLog2 = cupvaRasterDataFlowGetBppLog2(handler.hdl);

     // Scalar loads for tile advancement
     uint16_t const ixAdvCurrent      = handler.hdl.ixAdv;
     uint16_t const nxm1Current       = handler.hdl.nxm1;
     int32_t const adv0               = handler.hdl.adv[0];
     int32_t const adv1               = handler.hdl.adv[1];
     uint32_t const tileOffsetCurrent = handler.tileOffset;

     if (shouldDeferSync == 0U)
     {
         cupvaDataFlowSync(sync);
     }

     // Update offset step 1 - determine adv, load registers
     int32_t const cbLen  = handler.hdl.cbLen;
     bool const isLast    = (ixAdvCurrent == nxm1Current);
     int32_t const adv    = isLast ? adv1 : adv0;
     uint16_t const ixAdv = isLast ? (uint16_t)0U : (ixAdvCurrent + (uint16_t)1U);
     // tileOffset is shifted by bppLog2 to turn it into a byte offset from base
     void *const retVal   = (int8_t *)base + (tileOffsetCurrent << bppLog2);

     // Vector read CB head
     cbCopyData = sign_extend(*(dvint *)base);

     if (shouldDeferTrig == 0U)
     {
         cupvaDataFlowTrig(trig);
     }

     // Update offset step 2 - compute new tileOffset with wrapping
     handler.hdl.ixAdv           = ixAdv;
     int32_t const tileOffsetNew = adv + (int32_t)tileOffsetCurrent;
     int32_t tileOffsetWrapped   = (tileOffsetNew >= cbLen) ? (tileOffsetNew - cbLen) : tileOffsetNew;
     // On host side, negative advancements with CB have been converted to positive advancements, so no need to check
     // for condition tileOffsetNew < 0.
     handler.tileOffset = (uint32_t)tileOffsetWrapped;

     // Vector write CB tail
     *(dvint *)cbDst = extract(cbCopyData);

     // Return previously calculated tile offset as a pointer
     return retVal;
 }

 // Releases the current tile of a unified raster data flow and prepares the next trigger/sync values.
 //
 // The trigger program step is the same computation as in _cupvaRasterDataFlowAdvProg, carried out
 // on handler.hdl. If handler.deferSync / handler.deferTrig are set, the sync / trigger that
 // cupvaRasterDataFlowAcquire skipped is issued here. On exit handler.nextTrig holds the trigger for
 // the instruction that was at the PC on entry, and handler.nextSync holds the previous
 // handler.nextTrig.
 inline void cupvaRasterDataFlowRelease(UnifiedRDFHandler &handler)
 {
     // Load data to registers
     uint8_t const pc               = handler.hdl.pc;
     uint32_t const trigCnt         = handler.hdl.trigCnt;
     uint32_t const sync            = handler.nextSync;
     uint32_t const trig            = handler.nextTrig;
     uint32_t const shouldDeferTrig = handler.deferTrig;
     uint32_t const shouldDeferSync = handler.deferSync;

     // Next GPIO calculation part 1 - calculate PC jump
     RasterDataFlowTrigInst &instruction = handler.hdl.trigProgram[pc];
     uint32_t const trigRpt              = instruction.trigRpt;
     uint32_t const jmpCnt               = instruction.jmpCnt;
     int32_t const jmpOffset             = instruction.jmpOffst;
     uint32_t const newTrigCnt           = (uint32_t)mod_inc((int32_t)trigCnt, (int32_t)trigRpt);
     uint32_t const jumpDec              = max(jmpCnt, 1U) - 1U; // Subtract 1, clamp to 0

     if (shouldDeferSync != 0U)
     {
         cupvaDataFlowSync(sync);
     }

     // Next GPIO calculation part 2 - calculate next trigger
     // If we are not repeating this trigger, update jmpCnt
     uint32_t const newJmpCnt = (newTrigCnt != 0U) ? jmpCnt : jumpDec;
     // If the updated jmpCnt is different to current jmpCnt, it means we should take the jump
     uint32_t const applyJmp = (newJmpCnt != jmpCnt) ? 1U : 0U;
     // Decide whether to 1) apply jump 2) advance PC or 3) stay with current PC
     uint32_t const jumpOrAdv = (applyJmp != 0U) ? (uint32_t)jmpOffset : 1U;
     uint32_t const pcOffset  = (newTrigCnt != 0U) ? 0U : jumpOrAdv;
     // GPIO[0] are not used for triggers so are used for 'no-trigger'
     uint32_t const setNextTrig = ((instruction.gpio > 0U) ? 1U : 0U);
     uint32_t const nextTrig    = setNextTrig << instruction.gpio;

     if (shouldDeferTrig != 0U)
     {
         cupvaDataFlowTrig(trig);
     }

     // Write updates to handler
     instruction.jmpCnt   = (uint8_t)newJmpCnt;
     uint32_t const newPc = pc + pcOffset;
     handler.hdl.pc       = (uint8_t)min(newPc, RDF_INST_COUNT - 1U); // pc - clamp to valid bounds
     handler.hdl.trigCnt  = (uint8_t)newTrigCnt;                      // trigCnt
     handler.nextTrig     = nextTrig;
     // The trigger value loaded on entry becomes the value the next sync waits on
     handler.nextSync     = trig;
 }

 // Closes the data flow by syncing on the handler's pending sync value (handler.nextSync).
 inline void cupvaRasterDataFlowClose(UnifiedRDFHandler &handler)
 {
     uint32_t const pendingSync = handler.nextSync;
     cupvaDataFlowSync(pendingSync);
 }

 // UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
 inline int32_t cupvaRasterDataFlowGetLinePitch(UnifiedRDFHandler &handler)
 {
     int32_t const pitch = cupvaRasterDataFlowGetLinePitch(handler.hdl);
     return pitch;
 }

 // UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
 inline int32_t cupvaRasterDataFlowGetCbLen(UnifiedRDFHandler &handler)
 {
     int32_t const length = cupvaRasterDataFlowGetCbLen(handler.hdl);
     return length;
 }

 // UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl,
 // passing mode and bpp through unchanged.
 inline int32_t cupvaTransposeLaneOfst(UnifiedRDFHandler &handler, TranspositionMode mode, int32_t bpp)
 {
     int32_t const laneOffset = cupvaTransposeLaneOfst(handler.hdl, mode, bpp);
     return laneOffset;
 }

 // UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
 inline bool cupvaRasterDataFlowIsRead(UnifiedRDFHandler &handler)
 {
     bool const isRead = cupvaRasterDataFlowIsRead(handler.hdl);
     return isRead;
 }

 // UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl.
 inline uint32_t cupvaRasterDataFlowGetBppLog2(UnifiedRDFHandler &handler)
 {
     uint32_t const bppLog2 = cupvaRasterDataFlowGetBppLog2(handler.hdl);
     return bppLog2;
 }

 // Constants used by the block-linear (BL) offset computation in _cupvaBlocklinearOffset2D.

 // GOB width
 #define BL_GOBW UINT32_C(64)
 // GOB width in log2
 #define BL_GOBW_LOG2 UINT32_C(6)
 // GOB height
 #define BL_GOBH UINT32_C(8)
 // GOB height in log2
 #define BL_GOBH_LOG2 UINT32_C(3)
 // GOB size (width * height = 512)
 #define BL_GOB_SZ (BL_GOBW * BL_GOBH)
 // GOB size in log2 (6 + 3 = 9)
 #define BL_GOB_SZ_LOG2 ((uint32_t)BL_GOBW_LOG2 + (uint32_t)BL_GOBH_LOG2)
 // GOB sector width
 #define BL_GOB_SECW UINT32_C(16)
 // GOB sector width in log2
 #define BL_GOB_SECW_LOG2 UINT32_C(4)
 // GOB sector height
 #define BL_GOB_SECH UINT32_C(2)
 // GOB sector size (width * height = 32)
 #define BL_GOB_SEC_SZ (BL_GOB_SECW * BL_GOB_SECH)
 // GOB packet mask (64 >> 1 = 32, applied to the x coordinate)
 #define BL_GOB_PACK_MASK (BL_GOBW >> 1)
 // GOB packet stride
 #define BL_GOB_PACK_STRIDE UINT32_C(8)
 // GOB sub-packet vertical mask (applied to the y coordinate)
 #define BL_GOB_SUBPACK_VER_MASK UINT32_C(6)
 // GOB sub-packet vertical stride
 #define BL_GOB_SUBPACK_VER_STRIDE UINT32_C(32)
 // GOB sub-packet horizontal mask (32 >> 1 = 16, applied to the x coordinate)
 #define BL_GOB_SUBPACK_HOR_MASK (BL_GOB_SEC_SZ >> 1)
 // GOB sub-packet horizontal stride
 #define BL_GOB_SUBPACK_HOR_STRIDE UINT32_C(2)
 // GOB sector vertical mask (2 - 1 = 1, applied to the y coordinate)
 #define BL_GOB_SEC_VER_MASK (BL_GOB_SECH - UINT32_C(1))
 // GOB sector vertical stride
 #define BL_GOB_SEC_VER_STRIDE UINT32_C(16)
 // GOB sector horizontal mask (16 - 1 = 15, applied to the x coordinate)
 #define BL_GOB_SEC_HOR_MASK (BL_GOB_SECW - UINT32_C(1))
 // Swizzle bit which should be set on non-native builds
 #define BL_SWIZZLE_BIT 39U
 // Addresses from cupvaSurfaceAddress2D will fill metadata starting from this bit
 #define BL_ADDR_METADATA_SHIFT 40U
 // Address bits 13:9, written in the high:low form consumed by CUPVA_FIELD_MASK / CUPVA_FIELD_SHIFT
 #define SBADR_IOVA_RANGE \
     UINT32_C(13)         \
         : UINT32_C(9)
 // Mask covering address bits 13:9, already shifted into position
 #define BL_SBADR_IOVA_MASK (CUPVA_FIELD_MASK(SBADR_IOVA_RANGE) << CUPVA_FIELD_SHIFT(SBADR_IOVA_RANGE))
 // Flag to indicate that the address is a BL address
 #define BL_METADATA_FLAG_BIT 8U
 #define BL_METADATA_FLAG (1UL << BL_METADATA_FLAG_BIT)
 // Presumably the largest supported block height, in log2 of GOBs (inferred from the name -- confirm)
 #define BL_MAX_BLOCKHEIGHT_LOG2 (5)

 /**
  * Computes block-linear offsets for vectors of 2D coordinates, using a scalar line pitch.
  *
  * The result is the sum of three terms: the swizzle bit (build dependent), a GOB base
  * term and an offset inside the GOB. The same terms appear in scalar form in
  * cupvaSurfaceAddress2D(VPUSurfaceData const &, uint32_t, uint32_t). The surface base
  * address and the BL metadata bits are NOT added here.
  *
  * NOTE(review): vmaddwhw(a, b, c, 0, 1) is used as if it computes a * b + c, judging by
  * the matching scalar formula; the meaning of the trailing (0, 1) arguments is not
  * visible in this header - confirm against the intrinsic documentation.
  *
  * @param xCoords         x coordinates.
  * @param yCoords         y coordinates.
  * @param linePitch       Line pitch; shifted right by BL_GOBW_LOG2 to get the width in GOBs.
  * @param blockHeightLog2 log2 of the block height in GOBs (a block spans
  *                        BL_GOBH << blockHeightLog2 lines).
  * @return swizzleBit + GOB base + in-GOB offset.
  */
 inline vintx _cupvaBlocklinearOffset2D(vintx const &xCoords, vintx const &yCoords, uint32_t const linePitch,
                                        uint8_t const blockHeightLog2)
 {
     uint32_t const widthInGobs      = linePitch >> (uint32_t)BL_GOBW_LOG2;
     int32_t const blockSizeLog2     = (int32_t)BL_GOB_SZ_LOG2 + (int32_t)blockHeightLog2;
     uint32_t const linesPerBlock    = (uint32_t)BL_GOBH << (uint32_t)blockHeightLog2;
     int32_t const linesPerBlockLog2 = (int32_t)BL_GOBH_LOG2 + (int32_t)blockHeightLog2;
     vintx x                         = xCoords;
     vintx y                         = yCoords;

     // GOB row inside the block (scaled by the GOB size) plus GOB column (scaled by the block size).
     vintx vgobBase = (((y & (linesPerBlock - 1U)) >> (int32_t)BL_GOBH_LOG2) << (int32_t)BL_GOB_SZ_LOG2) +
                      ((x >> (int32_t)BL_GOBW_LOG2) << blockSizeLog2);
     // Add the block-row term: (block row << blockSizeLog2) * widthInGobs.
     vgobBase = vmaddwhw((y >> linesPerBlockLog2) << blockSizeLog2, (int32_t)widthInGobs, vgobBase, 0, 1);
     // Offset inside the GOB, accumulated from masked x/y bits times their strides:
     // sector term first, then packet, sub-packet vertical and sub-packet horizontal.
     vintx vgobOffset =
         vmaddwhw(y & BL_GOB_SEC_VER_MASK, (int32_t)BL_GOB_SEC_VER_STRIDE, (x & BL_GOB_SEC_HOR_MASK), 0, 1);
     vgobOffset = vmaddwhw(x & BL_GOB_PACK_MASK, (int32_t)BL_GOB_PACK_STRIDE, vgobOffset, 0, 1);
     vgobOffset = vmaddwhw(y & BL_GOB_SUBPACK_VER_MASK, (int32_t)BL_GOB_SUBPACK_VER_STRIDE, vgobOffset, 0, 1);
     vgobOffset = vmaddwhw(x & BL_GOB_SUBPACK_HOR_MASK, (int32_t)BL_GOB_SUBPACK_HOR_STRIDE, vgobOffset, 0, 1);

     // The swizzle bit is only set for QNX/L4T builds targeting PVA generation <= 2.
 #if ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
     vintx swizzleBit = (replicatew(1) << (int32_t)BL_SWIZZLE_BIT);
 #else
     vintx swizzleBit = replicatew(0);
 #endif
     return (swizzleBit + vgobBase + vgobOffset);
 }

 /**
  * Computes block-linear offsets for vectors of 2D coordinates, using a vector line pitch.
  *
  * Same arithmetic as the overload taking a scalar linePitch, except that the width in
  * GOBs is itself a vector (vLinePitch >> BL_GOBW_LOG2) and is passed as a vector operand
  * to vmaddwhw. The surface base address and the BL metadata bits are NOT added here.
  *
  * NOTE(review): vmaddwhw(a, b, c, 0, 1) is used as if it computes a * b + c, judging by
  * the matching scalar formula in cupvaSurfaceAddress2D; the meaning of the trailing
  * (0, 1) arguments is not visible in this header - confirm against the intrinsic
  * documentation.
  *
  * @param xCoords         x coordinates.
  * @param yCoords         y coordinates.
  * @param vLinePitch      Line pitch values; shifted right by BL_GOBW_LOG2 to get widths in GOBs.
  * @param blockHeightLog2 log2 of the block height in GOBs (a block spans
  *                        BL_GOBH << blockHeightLog2 lines).
  * @return swizzleBit + GOB base + in-GOB offset.
  */
 inline vintx _cupvaBlocklinearOffset2D(vintx const &xCoords, vintx const &yCoords, vintx const &vLinePitch,
                                        uint8_t const blockHeightLog2)
 {
     vintx const widthInGobs         = vLinePitch >> (uint32_t)BL_GOBW_LOG2;
     int32_t const blockSizeLog2     = (int32_t)BL_GOB_SZ_LOG2 + (int32_t)blockHeightLog2;
     uint32_t const linesPerBlock    = (uint32_t)BL_GOBH << (uint32_t)blockHeightLog2;
     int32_t const linesPerBlockLog2 = (int32_t)BL_GOBH_LOG2 + (int32_t)blockHeightLog2;
     vintx x                         = xCoords;
     vintx y                         = yCoords;

     // GOB row inside the block (scaled by the GOB size) plus GOB column (scaled by the block size).
     vintx vgobBase = (((y & (linesPerBlock - 1U)) >> (int32_t)BL_GOBH_LOG2) << (int32_t)BL_GOB_SZ_LOG2) +
                      ((x >> (int32_t)BL_GOBW_LOG2) << blockSizeLog2);
     // Add the block-row term: (block row << blockSizeLog2) * widthInGobs.
     vgobBase = vmaddwhw((y >> linesPerBlockLog2) << blockSizeLog2, widthInGobs, vgobBase, 0, 1);
     // Offset inside the GOB, accumulated from masked x/y bits times their strides:
     // sector term first, then packet, sub-packet vertical and sub-packet horizontal.
     vintx vgobOffset =
         vmaddwhw(y & BL_GOB_SEC_VER_MASK, (int32_t)BL_GOB_SEC_VER_STRIDE, (x & BL_GOB_SEC_HOR_MASK), 0, 1);
     vgobOffset = vmaddwhw(x & BL_GOB_PACK_MASK, (int32_t)BL_GOB_PACK_STRIDE, vgobOffset, 0, 1);
     vgobOffset = vmaddwhw(y & BL_GOB_SUBPACK_VER_MASK, (int32_t)BL_GOB_SUBPACK_VER_STRIDE, vgobOffset, 0, 1);
     vgobOffset = vmaddwhw(x & BL_GOB_SUBPACK_HOR_MASK, (int32_t)BL_GOB_SUBPACK_HOR_STRIDE, vgobOffset, 0, 1);

     // The swizzle bit is only set for QNX/L4T builds targeting PVA generation <= 2.
 #if ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
     vintx swizzleBit = (replicatew(1) << (int32_t)BL_SWIZZLE_BIT);
 #else
     vintx swizzleBit = replicatew(0);
 #endif
     return (swizzleBit + vgobBase + vgobOffset);
 }
 /**
  * Computes the 64-bit address of coordinate (x, y) within a surface.
  *
  * The surface base is surfData.pointer.base + surfData.pointer.offset. The offset added
  * to it depends on surfData.metadata.format:
  *  - format == 1: block-linear offset = packed BL metadata (address bits 13:9 plus the
  *    BL flag, moved up to BL_ADDR_METADATA_SHIFT) + swizzle bit (build dependent) +
  *    GOB base + offset inside the GOB.
  *  - any other value: pitch-linear offset = y * linePitch + x.
  *
  * All CUPVA_VPU_ASSERT checks are active only in native builds; elsewhere the macro
  * expands to a no-op.
  *
  * @param surfData Surface description (pointer base/offset, linePitch, blockHeightLog2, format).
  * @param x        x coordinate.
  * @param y        y coordinate (line index).
  * @return Base address plus the format-dependent offset.
  */
 inline uint64_t cupvaSurfaceAddress2D(VPUSurfaceData const &surfData, uint32_t const x, uint32_t const y)
 {
     CUPVA_VPU_ASSERT(surfData.metadata.linePitch >= 0);
     CUPVA_VPU_ASSERT(surfData.metadata.blockHeightLog2 >= 0 &&
                      surfData.metadata.blockHeightLog2 <= BL_MAX_BLOCKHEIGHT_LOG2);
     uint32_t const widthInGobs      = surfData.metadata.linePitch >> BL_GOBW_LOG2;
     uint8_t const blockHeightLog2   = (uint8_t)(surfData.metadata.blockHeightLog2 & 0x7FU);
     uint8_t const blockSizeLog2     = BL_GOB_SZ_LOG2 + blockHeightLog2;
     uint32_t const linesPerBlock    = ((uint32_t)BL_GOBH) << blockHeightLog2;
     uint8_t const linesPerBlockLog2 = BL_GOBH_LOG2 + blockHeightLog2;
     // Native-only sanity check on the 32-bit block-row and GOB-column terms.
     CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - ((((y >> (uint32_t)linesPerBlockLog2) << (uint32_t)blockSizeLog2) * widthInGobs) +
                                        ((x >> (uint32_t)BL_GOBW_LOG2) << (uint32_t)blockSizeLog2))) >=
                      ((y & (linesPerBlock - 1U)) >> BL_GOBH_LOG2));
     // GOB base = block-row term * width in GOBs + GOB-column term + GOB row inside the block.
     uint64_t const gobBase =
         ((((uint64_t)y >> (uint64_t)linesPerBlockLog2) << (uint64_t)blockSizeLog2) * (uint64_t)widthInGobs) +
         (((uint64_t)x >> (uint64_t)BL_GOBW_LOG2) << (uint64_t)blockSizeLog2) +
         ((((uint64_t)y & ((uint64_t)linesPerBlock - 1UL)) >> (uint64_t)BL_GOBH_LOG2) << (uint64_t)BL_GOB_SZ_LOG2);

     CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFF) - surfData.pointer.base) >= surfData.pointer.offset);
     ExtMemIova va;
     va.val = surfData.pointer.base + surfData.pointer.offset;
     // blBaddr needs bits 13:9, plus set bit 8 to indicate BL mode, to be shifted to bit 40
     uint32_t const blMetadata       = (va.iova.addrLo & BL_SBADR_IOVA_MASK) | BL_METADATA_FLAG;
     uint64_t const blMetadataPacked = (uint64_t)blMetadata << (BL_ADDR_METADATA_SHIFT - BL_METADATA_FLAG_BIT);

     // Offset inside the GOB: masked x/y bits multiplied by their strides.
     uint32_t const packOffset    = (x & BL_GOB_PACK_MASK) * BL_GOB_PACK_STRIDE;
     uint32_t const subpackOffset = ((y & BL_GOB_SUBPACK_VER_MASK) * BL_GOB_SUBPACK_VER_STRIDE) +
                                    ((x & BL_GOB_SUBPACK_HOR_MASK) * BL_GOB_SUBPACK_HOR_STRIDE);
     uint32_t const secOffset = ((y & BL_GOB_SEC_VER_MASK) * BL_GOB_SEC_VER_STRIDE) + (x & BL_GOB_SEC_HOR_MASK);
     CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - packOffset) >= subpackOffset);
     CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - packOffset - subpackOffset) >= secOffset);
     uint64_t const gobOffset = (uint64_t)packOffset + (uint64_t)subpackOffset + (uint64_t)secOffset;
     // The swizzle bit is only set for QNX/L4T builds targeting PVA generation <= 2.
     uint64_t const swizzleBit =
         ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
             ? (1ULL << BL_SWIZZLE_BIT)
             : 0ULL;
     uint64_t const BLOffset = blMetadataPacked + swizzleBit + gobBase + gobOffset;
     // Pitch-linear range checks. A line pitch of 0 passes the precondition above, so it
     // must not be used as a divisor; the product is widened to 64 bits so the check sees
     // the same value as the PLOffset computation below instead of a narrower, wrapped one.
     CUPVA_VPU_ASSERT((surfData.metadata.linePitch == 0) ||
                      (((0xFFFFFFFFFFFFFFFFUL) / (uint64_t)surfData.metadata.linePitch) >= (uint64_t)y));
     CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFFUL) - ((uint64_t)surfData.metadata.linePitch * (uint64_t)y)) >= (uint64_t)x);
     uint64_t const PLOffset = ((uint64_t)y * (uint64_t)surfData.metadata.linePitch) + (uint64_t)x;
     uint64_t const offset   = (surfData.metadata.format == 1U) ? BLOffset : PLOffset;

     return (va.val + offset);
 }

 /**
  * Vector variant of cupvaSurfaceAddress2D: computes surface addresses for a vector of
  * coordinates.
  *
  * The surface base is surfData.pointer.base + surfData.pointer.offset. Two candidate
  * offsets are computed - block-linear (via _cupvaBlocklinearOffset2D plus the packed BL
  * metadata) and pitch-linear (y * linePitch + x) - and vmux picks between them based on
  * surfData.metadata.format.
  *
  * NOTE(review): vmux is assumed to select vBLOffset when the replicated format value is
  * non-zero and vPLOffset otherwise; the scalar overload tests format == 1U explicitly -
  * confirm the two agree for every format value in use.
  *
  * CUPVA_VPU_ASSERT checks are active only in native builds.
  *
  * @param surfData Surface description (pointer base/offset, linePitch, blockHeightLog2, format).
  * @param coords   Coordinate pair: coords.hi carries x, coords.lo carries y.
  * @return Base address plus the selected offset.
  */
 inline vintx cupvaSurfaceAddress2D(VPUSurfaceData const &surfData, dvintx coords)
 {
     CUPVA_VPU_ASSERT(surfData.metadata.linePitch >= 0);
     // Same upper bound as the scalar overload, spelled with the named constant.
     CUPVA_VPU_ASSERT(surfData.metadata.blockHeightLog2 >= 0 &&
                      surfData.metadata.blockHeightLog2 <= BL_MAX_BLOCKHEIGHT_LOG2);
     vintx x = coords.hi;
     vintx y = coords.lo;

     CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFF) - surfData.pointer.base) >= surfData.pointer.offset);
     ExtMemIova va;
     va.val = surfData.pointer.base + surfData.pointer.offset;
     // blBaddr needs bits 13:9, plus set bit 8 to indicate BL mode, to be shifted to bit 40
     uint32_t const blMetadata = (va.iova.addrLo & BL_SBADR_IOVA_MASK) | BL_METADATA_FLAG;
     vintx blMetadataPacked    = replicatew(blMetadata) << (int32_t)(BL_ADDR_METADATA_SHIFT - BL_METADATA_FLAG_BIT);

     // Block-linear candidate: swizzle bit + GOB base + in-GOB offset, then the BL metadata.
     vintx vBLOffset = _cupvaBlocklinearOffset2D(x, y, surfData.metadata.linePitch, surfData.metadata.blockHeightLog2);

     vBLOffset       = vBLOffset + blMetadataPacked;
     // Pitch-linear candidate: y * linePitch + x.
     vintx vPLOffset = vmaddwhw(y, replicatew(surfData.metadata.linePitch), x, 0, 1);
     vintx voffset   = vmux(replicatew((int32_t)surfData.metadata.format), vBLOffset, vPLOffset);
     // Rebuild the base address from its low and high words.
     vintx vBaseAddr = replicatew(va.iova.addrLo) | (replicatew((int32_t)va.iova.addrHi) << 32);

     return (vBaseAddr + voffset);
 }

 // Declaration only; no definition is visible in this part of the header.
 // NOTE(review): presumably turns error reporting for floating-point NaN on or off
 // according to `enable` - confirm against the implementation.
 void cupvaFloatingPointNANErrorEnabled(bool enable);

 // Declaration only; no definition is visible in this part of the header.
 // NOTE(review): going by the parameter name, the address is expected in words rather
 // than bytes; the unit of `size` is not visible here - confirm against the implementation.
 void cupvaICachePrefetch(uintptr_t addr_in_words, uint32_t size);

 // Declaration only; no definition is visible in this part of the header.
 // NOTE(review): presumably permits data flows to remain active after the VPU program
 // exits - confirm against the implementation.
 void cupvaAllowDataFlowActiveAfterVpuExit();

 #endif // CUPVA_DEVICE_CORE_H