device_core.h
Fully qualified name: src/device/vpu_runtime/include/cupva_device/device_core.h
File members: src/device/vpu_runtime/include/cupva_device/device_core.h
/*
* Copyright (c) 2020-2023 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA Corporation is strictly prohibited.
*/
#ifndef CUPVA_DEVICE_CORE_H
#define CUPVA_DEVICE_CORE_H
#include "impl/dma_common.h"
#include <cupva_types.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
// Build-mode dependent definitions.
// Non-native builds: AgenCFG is an alias of dvuint and CUPVA_VPU_ASSERT expands to a no-op, so the asserted
// expression is never evaluated.
// Native builds: CUPVA_VPU_ASSERT forwards to the standard assert(). AgenCFG is not defined by this block in that
// mode.
#if CUPVA_BUILD_MODE != CUPVA_NATIVE
typedef dvuint AgenCFG;
# define CUPVA_VPU_ASSERT(x) ((void)0)
#else
# include <assert.h>
# define CUPVA_VPU_ASSERT(x) assert(x)
#endif
/** Explicitly discards the value of an expression. */
#define IGNORE_RETURN(x) ((void)(x))
/**
 * Casts the pointer expression P to `T *` through an intermediate `void *`.
 * P is parenthesized so that a compound argument such as `p + 1` is cast as a whole rather than
 * having the cast bind to `p` only.
 */
#define POINTER_CAST(T, P) ((T *)((void *)(P)))
/** Const-preserving counterpart of POINTER_CAST: casts P to `T const *` through `void const *`. */
#define CONST_POINTER_CAST(T, P) ((T const *)((void const *)(P)))
/** Expands to `int32_t CUPVA_VPU_MAIN_SYMBOL()`; CUPVA_VPU_MAIN_SYMBOL is defined elsewhere. */
#define CUPVA_VPU_MAIN() int32_t CUPVA_VPU_MAIN_SYMBOL()
// Element count of one tile buffer with a halo on every side: (tw + 2*haloX) * (th + 2*haloY).
#define CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, haloY) (((tw) + 2 * (haloX)) * ((th) + 2 * (haloY)))
// Dispatch target for "no halo". Trailing arguments are accepted and ignored.
#define CUPVA_BUFFER_LENGTH_IMPL_VAR(tw, th, ...) CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, 0, 0)
// Dispatch target for a horizontal halo only. Trailing arguments are accepted and ignored.
#define CUPVA_BUFFER_LENGTH_IMPL_HALO_1D(tw, th, haloX, ...) CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, 0)
// Dispatch target for horizontal and vertical halos. Trailing arguments are accepted and ignored.
#define CUPVA_BUFFER_LENGTH_IMPL_HALO_2D(tw, th, haloX, haloY, ...) \
CUPVA_SINGLE_BUFFER_LENGTH_IMPL(tw, th, haloX, haloY)
// Dispatch target for transposed tiles: the padded line size in bytes is passed through
// TRANSPOSE_LINE_PITCH_ALIGNED (defined elsewhere), multiplied by the padded height and converted back to a
// count of `type` elements.
#define CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D(tw, th, haloX, haloY, type, mode) \
(TRANSPOSE_LINE_PITCH_ALIGNED(((tw) + 2 * (haloX)) * sizeof(type), mode, sizeof(type)) * ((th) + 2 * (haloY)) / \
sizeof(type))
// Same computation with the last two parameters in (mode, type) order, which is the order RDF_SINGLE passes.
#define CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D_V2(tw, th, haloX, haloY, mode, type) \
CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D(tw, th, haloX, haloY, type, mode)
// Argument-count selector: expands to whatever ends up in the seventh argument position. Callers append a list of
// candidate macro names after their own arguments so that the number of user arguments picks the candidate.
#define CUPVA_BUFFER_LENGTH_MACRO(tw, th, haloX, haloY, type, mode, macro, ...) macro
// Circular buffer length (in `type` elements) for the row-major case.
// Line size in bytes is (tw * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 + haloX * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1) *
// sizeof(type), passed through CIRCULAR_BUFFER_LINE_PITCH_ALIGNED and then TRANSPOSE_LINE_PITCH_ALIGNED.
// That pitch is multiplied by (th + 2*haloY), passed through CIRCULAR_BUFFER_ALIGNED_SIZE, increased by
// CIRCULAR_BUFFER_EXTRA_BYTES and finally divided by sizeof(type).
// The helper macros and factors are defined elsewhere. Trailing arguments after `mode` are ignored.
#define CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_RM(type, tw, th, haloX, haloY, mode, ...) \
((CIRCULAR_BUFFER_ALIGNED_SIZE(TRANSPOSE_LINE_PITCH_ALIGNED(CIRCULAR_BUFFER_LINE_PITCH_ALIGNED( \
((tw) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 + \
(haloX) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1) * \
sizeof(type)), \
mode, sizeof(type)) * \
((th) + 2 * (haloY))) + \
CIRCULAR_BUFFER_EXTRA_BYTES) / \
sizeof(type))
// Circular buffer length (in `type` elements) for the column-major case.
// Mirrors the row-major variant with the roles swapped: the line size uses (tw + 2*haloX) and the line count uses
// (th * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 + haloY * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1).
#define CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_CM(type, tw, th, haloX, haloY, mode, ...) \
((CIRCULAR_BUFFER_ALIGNED_SIZE( \
TRANSPOSE_LINE_PITCH_ALIGNED(CIRCULAR_BUFFER_LINE_PITCH_ALIGNED((((tw) + 2 * (haloX)) * sizeof(type))), \
mode, sizeof(type)) * \
((th) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR0 + (haloY) * CIRCULAR_BUFFER_LINE_PITCH_FACTOR1)) + \
CIRCULAR_BUFFER_EXTRA_BYTES) / \
sizeof(type))
// Larger of two values. Both arguments may be evaluated twice, so pass side-effect-free expressions.
#define CUPVA_MAX(a, b) ((a) > (b) ? (a) : (b))
// Single-buffer length selected by the number of optional arguments after (tw, th):
//   none                        -> no halo
//   haloX                       -> horizontal halo
//   haloX, haloY                -> 2D halo
//   haloX, haloY, type          -> 2D halo (type is ignored)
//   haloX, haloY, type, mode    -> transposed 2D variant
#define CUPVA_SINGLE_BUFFER_LENGTH(tw, th, ...) \
CUPVA_BUFFER_LENGTH_MACRO(tw, th, ##__VA_ARGS__, CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D, \
CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, \
CUPVA_BUFFER_LENGTH_IMPL_HALO_1D, CUPVA_BUFFER_LENGTH_IMPL_VAR) \
(tw, th, ##__VA_ARGS__)
// Twice the single-buffer length, passed through CIRCULAR_BUFFER_ALIGNED_SIZE (defined elsewhere).
// NOTE(review): unlike RDF_DOUBLE, the value handed to CIRCULAR_BUFFER_ALIGNED_SIZE here is not scaled by
// sizeof(type) - confirm this is intended.
#define CUPVA_DOUBLE_BUFFER_LENGTH(tw, th, ...) \
(CIRCULAR_BUFFER_ALIGNED_SIZE(CUPVA_SINGLE_BUFFER_LENGTH(tw, th, ##__VA_ARGS__) * 2))
// Alias of RDF_CIRCULAR_ROWMAJOR.
#define CUPVA_CIRCULAR_BUFFER_LENGTH(type, tw, th, haloX, haloY, ...) \
RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__)
// Single-buffer length with the element type first. Optional arguments after (type, tw, th):
//   none                -> no halo
//   haloX               -> horizontal halo
//   haloX, haloY        -> 2D halo
//   haloX, haloY, mode  -> transposed 2D variant
#define RDF_SINGLE(type, tw, th, ...) \
CUPVA_BUFFER_LENGTH_MACRO(tw, th, ##__VA_ARGS__, type, CUPVA_BUFFER_LENGTH_IMPL_HALO_TRANSPOSE_2D_V2, \
CUPVA_BUFFER_LENGTH_IMPL_HALO_2D, CUPVA_BUFFER_LENGTH_IMPL_HALO_1D, \
CUPVA_BUFFER_LENGTH_IMPL_VAR) \
(tw, th, ##__VA_ARGS__, type)
// Double-buffer length in elements: two single buffers converted to bytes, passed through
// CIRCULAR_BUFFER_ALIGNED_SIZE, then converted back to elements.
#define RDF_DOUBLE(type, tw, th, ...) \
(CIRCULAR_BUFFER_ALIGNED_SIZE((RDF_SINGLE(type, tw, th, ##__VA_ARGS__)) * 2 * sizeof(type)) / sizeof(type))
// Row-major circular buffer length; the optional argument is the transposition mode, TRANS_MODE_NONE if omitted.
#define RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ...) \
CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_RM(type, tw, th, haloX, haloY, ##__VA_ARGS__, TRANS_MODE_NONE)
// Column-major circular buffer length; the optional argument is the transposition mode, TRANS_MODE_NONE if omitted.
#define RDF_CIRCULAR_COLUMNMAJOR(type, tw, th, haloX, haloY, ...) \
CUPVA_CIRCULAR_BUFFER_LENGTH_IMPL_CM(type, tw, th, haloX, haloY, ##__VA_ARGS__, TRANS_MODE_NONE)
// Circular buffer length large enough for either traversal order (maximum of the two variants above).
#define RDF_CIRCULAR(type, tw, th, haloX, haloY, ...) \
CUPVA_MAX(RDF_CIRCULAR_ROWMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__), \
RDF_CIRCULAR_COLUMNMAJOR(type, tw, th, haloX, haloY, ##__VA_ARGS__))
// Declares a VMEM variable without exporting it; forwards to VMEM_NOEXPORT_IMPL (defined elsewhere).
#define VMEM_NOEXPORT(bank, type, name, ...) VMEM_NOEXPORT_IMPL(bank, type, name, ##__VA_ARGS__)
// Declares a VMEM variable and exports it under its own name. GET_VMEM_MACRO (defined elsewhere) is handed the
// candidates VMEM_2D, VMEM_1D and VMEM_VAR after the optional arguments and its result is invoked with the same
// argument list.
#define VMEM_DATA_IMPL(bank, type, name, ...) \
GET_VMEM_MACRO(bank, type, name, ##__VA_ARGS__, VMEM_2D, VMEM_1D, VMEM_VAR, ...) \
(bank, type, name, ##__VA_ARGS__); \
CUPVA_EXPORT(name, name)
// Adapter used by VMEM for the six-argument form: drops `type` and `isDDF` and forwards to VMEM_DDF_HANDLER.
#define VMEM_DDF_HANDLER_WRAPPER(bank, type, name, nodesPerLane, lanes, isDDF) \
VMEM_DDF_HANDLER(bank, name, nodesPerLane, lanes)
// Argument-count selector: expands to whatever ends up in the seventh argument position.
#define VMEM_DDF_SELECTOR(bank, type, name, h, w, isDDF, macro, ...) macro
// Applies macro NAME to the parenthesized argument list ARGS after both have been expanded.
#define VMEM_INVOKE(NAME, ARGS) NAME ARGS
// Declares and exports a VMEM variable. With three optional arguments after `name` the selector picks
// VMEM_DDF_HANDLER_WRAPPER; with zero, one or two optional arguments it picks VMEM_DATA_IMPL.
#define VMEM(bank, type, name, ...) \
VMEM_INVOKE(VMEM_INVOKE(VMEM_DDF_SELECTOR, (bank, type, name, ##__VA_ARGS__, VMEM_DDF_HANDLER_WRAPPER, \
VMEM_DATA_IMPL, VMEM_DATA_IMPL, VMEM_DATA_IMPL)), \
(bank, type, name, ##__VA_ARGS__))
// Extern declaration matching VMEM; forwards to EXTERN_VMEM_IMPL (defined elsewhere).
#define EXTERN_VMEM(bank, type, name, ...) EXTERN_VMEM_IMPL(bank, type, name, ##__VA_ARGS__)
// Declares an ExtMemPointer variable and exports it with type VMEM_TYPE_POINTER_EX.
#define VMEM_POINTER(bank, name) \
VMEM_VAR(bank, ExtMemPointer, name); \
CUPVA_EXPORT(name, name, VMEM_TYPE_POINTER_EX)
// Extern declaration matching VMEM_POINTER.
#define EXTERN_VMEM_POINTER(bank, name) EXTERN_VMEM_IMPL(bank, ExtMemPointer, name)
// Declares a VPUSurfaceData variable and exports its two members separately: `pointer` under the name _S<name>
// (type VMEM_TYPE_POINTER_EX) and `metadata` under the name _M<name>.
#define VMEM_SURFACE(bank, name) \
VMEM_VAR(bank, VPUSurfaceData, name); \
CUPVA_EXPORT(name.pointer, _S##name, VMEM_TYPE_POINTER_EX); \
CUPVA_EXPORT(name.metadata, _M##name)
// Extern declaration matching VMEM_SURFACE.
#define EXTERN_VMEM_SURFACE(bank, name) EXTERN_VMEM_IMPL(bank, VPUSurfaceData, name)
// Declares a UnifiedRDFHandler variable and exports only its `hdl` member under the variable's name.
#define VMEM_RDF_UNIFIED(bank, name) \
VMEM_VAR(bank, UnifiedRDFHandler, name); \
CUPVA_EXPORT(name.hdl, name)
// Same declaration logic as VMEM_DATA_IMPL, but the export uses type VMEM_TYPE_VPUC_TABLE.
#define VMEM_DMA_CONFIG(bank, type, name, ...) \
GET_VMEM_MACRO(bank, type, name, ##__VA_ARGS__, VMEM_2D, VMEM_1D, VMEM_VAR, ...) \
(bank, type, name, ##__VA_ARGS__); \
CUPVA_EXPORT(name, name, VMEM_TYPE_VPUC_TABLE)
// Extern declaration matching VMEM_DMA_CONFIG.
#define EXTERN_VMEM_DMA_CONFIG(bank, type, name, ...) EXTERN_VMEM_IMPL(bank, type, name, ##__VA_ARGS__)
// Declares `name` with the requested alignment; forwards to CUPVA_ALIGNED_IMPL (defined elsewhere).
#define CUPVA_ALIGNED(type, name, alignment) CUPVA_ALIGNED_IMPL(type, name, alignment)
// Exports `variable` under the symbol name<_export>. Forwards the variable, the export symbol, sizeof(variable),
// any optional arguments and a trailing VMEM_TYPE_DATA to CUPVA_EXPORT_IMPL (defined elsewhere).
// NOTE(review): the trailing VMEM_TYPE_DATA presumably acts as the default type when none is given - confirm.
#define CUPVA_EXPORT(variable, name, ...) \
CUPVA_EXPORT_IMPL(variable, name##_export, sizeof(variable), ##__VA_ARGS__, VMEM_TYPE_DATA)
// Forward declarations of runtime primitives; their definitions are not part of this section.
// cupvaDataFlowTrig / cupvaDataFlowSync are called in this file with a mask built as (1 << gpio), or 0 when no
// trigger is wanted.
inline void cupvaDataFlowTrig(uint32_t handler);
inline void cupvaDataFlowSync(uint32_t handler);
inline uint32_t cupvaGetVpuId(void);
inline uint32_t cupvaGetVmemBase(void);
// Used by CUPVA_GET_DESC_ATTR_ADDR as the address of the first PvaDmaDescriptor.
inline uint32_t cupvaGetDmaDescBase(void);
inline uint32_t cupvaGetHwseqRamBase(void);
inline uint32_t cupvaGetChannelRegAddr(int8_t channel_idx, uint16_t reg_ofst);
// Used by cupvaGetL2ExtMemPointer to fill the `base` and `size` members of an ExtMemPointer.
inline uint32_t cupvaGetL2Base(void);
inline uint32_t cupvaGetL2Size(void);
/**
 * Fills an ExtMemPointer with the L2 base, the L2 size and the requested offset, then tags it as L2.
 *
 * @param ptr    Object to populate; it is dereferenced unconditionally and must not be null.
 * @param offset Value stored in ptr->offset.
 */
inline void cupvaGetL2ExtMemPointer(ExtMemPointer *ptr, uint32_t offset)
{
    ptr->base = cupvaGetL2Base();
    ptr->size = cupvaGetL2Size();
    ptr->offset = offset;

    // The memory-type tag lives in byte 7 of the `base` member, so it must be written after `base`.
    uint8_t *const baseBytes = POINTER_CAST(uint8_t, &ptr->base);
    baseBytes[7] = CUPVA_EXT_MEM_TYPE_L2;
}
// Declaration only; the definition is not part of this section.
// NOTE(review): presumably returns the 32-bit VMEM address of `ptr` - confirm against the definition.
inline uint32_t cupvaGetVmemAddress(void *ptr);
/**
 * Description of a nested access pattern with up to six levels, consumed by the INIT_AGEN1..INIT_AGEN6 macros.
 *
 * - size:     factor every stride-derived modifier is multiplied by (presumably the element size in bytes)
 * - n1 .. n6: per-level counts, copied as-is into the target's n1 .. n6
 * - s1 .. s6: per-level strides, expressed in units of `size`
 */
typedef struct
{
    int32_t size;
    uint16_t n1, n2, n3, n4, n5, n6;
    int32_t s1, s2, s3, s4, s5, s6;
} AgenWrapper;
/**
 * Programs a one-level pattern: copies n1 and sets mod1 = s1 * size.
 *
 * Both arguments are parenthesized on every use, so expressions such as `*ptr` are accepted.
 * Each argument is expanded several times; pass side-effect-free lvalues.
 */
#define INIT_AGEN1(AGEN_INIT, AGEN_WRAPPER) \
    do \
    { \
        (AGEN_INIT).n1 = (AGEN_WRAPPER).n1; \
        (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
    } while (false)
/**
 * Programs a two-level pattern: copies n1..n2 and computes
 *   mod1 = s1 * size
 *   mod2 = (s2 - (n1 - 1) * s1) * size
 *
 * Both arguments are parenthesized on every use, so expressions such as `*ptr` are accepted.
 * Each argument is expanded several times; pass side-effect-free lvalues.
 */
#define INIT_AGEN2(AGEN_INIT, AGEN_WRAPPER) \
    do \
    { \
        (AGEN_INIT).n1 = (AGEN_WRAPPER).n1; \
        (AGEN_INIT).n2 = (AGEN_WRAPPER).n2; \
        (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod2 = \
            ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * (AGEN_WRAPPER).size; \
    } while (false)
/**
 * Programs a three-level pattern: copies n1..n3 and computes, for each level k,
 *   modk = (sk - sum over j < k of (nj - 1) * sj) * size
 *
 * Both arguments are parenthesized on every use, so expressions such as `*ptr` are accepted.
 * Each argument is expanded several times; pass side-effect-free lvalues.
 */
#define INIT_AGEN3(AGEN_INIT, AGEN_WRAPPER) \
    do \
    { \
        (AGEN_INIT).n1 = (AGEN_WRAPPER).n1; \
        (AGEN_INIT).n2 = (AGEN_WRAPPER).n2; \
        (AGEN_INIT).n3 = (AGEN_WRAPPER).n3; \
        (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod2 = \
            ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod3 = \
            ((AGEN_WRAPPER).s3 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2)) * \
            (AGEN_WRAPPER).size; \
    } while (false)
/**
 * Programs a four-level pattern: copies n1..n4 and computes, for each level k,
 *   modk = (sk - sum over j < k of (nj - 1) * sj) * size
 *
 * Both arguments are parenthesized on every use, so expressions such as `*ptr` are accepted.
 * Each argument is expanded several times; pass side-effect-free lvalues.
 */
#define INIT_AGEN4(AGEN_INIT, AGEN_WRAPPER) \
    do \
    { \
        (AGEN_INIT).n1 = (AGEN_WRAPPER).n1; \
        (AGEN_INIT).n2 = (AGEN_WRAPPER).n2; \
        (AGEN_INIT).n3 = (AGEN_WRAPPER).n3; \
        (AGEN_INIT).n4 = (AGEN_WRAPPER).n4; \
        (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod2 = \
            ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod3 = \
            ((AGEN_WRAPPER).s3 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2)) * \
            (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod4 = \
            ((AGEN_WRAPPER).s4 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3)) * \
            (AGEN_WRAPPER).size; \
    } while (false)
/**
 * Programs a five-level pattern: copies n1..n5 and computes, for each level k,
 *   modk = (sk - sum over j < k of (nj - 1) * sj) * size
 *
 * Both arguments are parenthesized on every use, so expressions such as `*ptr` are accepted.
 * Each argument is expanded several times; pass side-effect-free lvalues.
 */
#define INIT_AGEN5(AGEN_INIT, AGEN_WRAPPER) \
    do \
    { \
        (AGEN_INIT).n1 = (AGEN_WRAPPER).n1; \
        (AGEN_INIT).n2 = (AGEN_WRAPPER).n2; \
        (AGEN_INIT).n3 = (AGEN_WRAPPER).n3; \
        (AGEN_INIT).n4 = (AGEN_WRAPPER).n4; \
        (AGEN_INIT).n5 = (AGEN_WRAPPER).n5; \
        (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod2 = \
            ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod3 = \
            ((AGEN_WRAPPER).s3 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2)) * \
            (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod4 = \
            ((AGEN_WRAPPER).s4 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3)) * \
            (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod5 = \
            ((AGEN_WRAPPER).s5 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3) - \
             (((int32_t)(AGEN_WRAPPER).n4 - 1) * (AGEN_WRAPPER).s4)) * \
            (AGEN_WRAPPER).size; \
    } while (false)
/**
 * Programs a six-level pattern: copies n1..n6 and computes, for each level k,
 *   modk = (sk - sum over j < k of (nj - 1) * sj) * size
 *
 * Both arguments are parenthesized on every use, so expressions such as `*ptr` are accepted.
 * Each argument is expanded several times; pass side-effect-free lvalues.
 */
#define INIT_AGEN6(AGEN_INIT, AGEN_WRAPPER) \
    do \
    { \
        (AGEN_INIT).n1 = (AGEN_WRAPPER).n1; \
        (AGEN_INIT).n2 = (AGEN_WRAPPER).n2; \
        (AGEN_INIT).n3 = (AGEN_WRAPPER).n3; \
        (AGEN_INIT).n4 = (AGEN_WRAPPER).n4; \
        (AGEN_INIT).n5 = (AGEN_WRAPPER).n5; \
        (AGEN_INIT).n6 = (AGEN_WRAPPER).n6; \
        (AGEN_INIT).mod1 = (AGEN_WRAPPER).s1 * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod2 = \
            ((AGEN_WRAPPER).s2 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1)) * (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod3 = \
            ((AGEN_WRAPPER).s3 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2)) * \
            (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod4 = \
            ((AGEN_WRAPPER).s4 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3)) * \
            (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod5 = \
            ((AGEN_WRAPPER).s5 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3) - \
             (((int32_t)(AGEN_WRAPPER).n4 - 1) * (AGEN_WRAPPER).s4)) * \
            (AGEN_WRAPPER).size; \
        (AGEN_INIT).mod6 = \
            ((AGEN_WRAPPER).s6 - (((int32_t)(AGEN_WRAPPER).n1 - 1) * (AGEN_WRAPPER).s1) - \
             (((int32_t)(AGEN_WRAPPER).n2 - 1) * (AGEN_WRAPPER).s2) - \
             (((int32_t)(AGEN_WRAPPER).n3 - 1) * (AGEN_WRAPPER).s3) - \
             (((int32_t)(AGEN_WRAPPER).n4 - 1) * (AGEN_WRAPPER).s4) - \
             (((int32_t)(AGEN_WRAPPER).n5 - 1) * (AGEN_WRAPPER).s5)) * \
            (AGEN_WRAPPER).size; \
    } while (false)
// Forward declarations of AgenCFG and prefetch helpers; their definitions are not part of this section.
// NOTE(review): judging by the names, the first two set and read the base address stored in an AgenCFG and the
// last two block on prefetch state - confirm against the definitions.
inline void cupvaModifyAgenCfgBase(AgenCFG *cfg, void *const addr);
inline void *cupvaGetAgenCfgBase(AgenCFG *cfg);
inline void cupvaPrefetchReadyWait(void);
inline void cupvaPrefetchDoneWait(void);
/**
 * Advances the base of an AgenCFG in place.
 * The configuration is expanded with init_agen_from_cfg, stepped with adv_agen_base and written back
 * with extract_agen_cfg.
 *
 * @param cfg           Configuration to update; dereferenced unconditionally.
 * @param step_in_bytes Step handed to adv_agen_base.
 */
inline void cupvaAdvanceAgenCfg(AgenCFG *cfg, const int32_t step_in_bytes)
{
    agen expanded = init_agen_from_cfg(*cfg);
    adv_agen_base(expanded, step_in_bytes);
    *cfg = extract_agen_cfg(expanded);
}
// Address of member `field` in DMA descriptor number `desc_id`:
// descriptor base + desc_id * sizeof(PvaDmaDescriptor) + offsetof(PvaDmaDescriptor, field).
#define CUPVA_GET_DESC_ATTR_ADDR(desc_id, field) \
(cupvaGetDmaDescBase() + ((uint32_t)sizeof(PvaDmaDescriptor) * (uint32_t)(desc_id)) + \
(uint32_t)offsetof(PvaDmaDescriptor, field))
// Places `value` into field `f` of icache register `r`: the value is masked with CUPVA_FIELD_MASK and shifted by
// CUPVA_FIELD_SHIFT of the range constant CUPVA_ICACHE_<r>_0_<f>_RANGE (all defined elsewhere).
#define CUPVA_ICACHE_FIELD(r, f, value) \
(((value) & CUPVA_FIELD_MASK(CUPVA_ICACHE_##r##_0_##f##_RANGE)) \
<< CUPVA_FIELD_SHIFT(CUPVA_ICACHE_##r##_0_##f##_RANGE))
inline void cupvaCircularBufferMemcpy(void *cb, uint32_t size)
{
dvint *src = (dvint *)cb;
dvint *dst = (dvint *)((void *)&((uint8_t *)cb)[size]);
(void)(*dst = extract(sign_extend(*src)));
}
// Executes one step of the handler's trigger program and returns the trigger mask of the instruction that was
// current on entry: (1 << gpio), or 0 when the instruction's gpio is 0.
// State updates:
//  - trigCnt is advanced with mod_inc against the instruction's trigRpt; while the result is non-zero the PC does
//    not move and jmpCnt is left unchanged.
//  - When trigCnt returns to 0, jmpCnt is decremented and clamped at 0. If that changed jmpCnt the PC moves by
//    jmpOffst, otherwise it moves by 1.
//  - The new PC is clamped to RDF_INST_COUNT - 1.
//  - gpioHistory records the instruction's gpio; cupvaRasterDataFlowSync later reads it.
inline uint32_t _cupvaRasterDataFlowAdvProg(RasterDataFlowHandler &handler)
{
uint8_t const pc = handler.pc;
RasterDataFlowTrigInst &instruction = handler.trigProgram[pc];
uint8_t const newTrigCnt = (uint8_t)mod_inc((int32_t)handler.trigCnt, (int32_t)instruction.trigRpt);
uint8_t const jumpDec = (uint8_t)max((int32_t)instruction.jmpCnt, 1) - 1U; // Subtract 1, clamp to 0
// If we are not repeating this trigger, update jmpCnt
uint8_t const newJmpCnt = (newTrigCnt != 0U) ? instruction.jmpCnt : jumpDec;
// If the updated jmpCnt is different to current jmpCnt, it means we should take the jump
uint32_t const applyJmp = (newJmpCnt != instruction.jmpCnt) ? 1U : 0U;
// Decide whether to 1) apply jump 2) advance PC or 3) stay with current PC
uint32_t const jumpOrAdv = (applyJmp != 0U) ? (uint32_t)instruction.jmpOffst : 1U;
uint32_t const pcOffset = (newTrigCnt != 0U) ? 0U : jumpOrAdv;
// GPIO[0] are not used for triggers so are used for 'no-trigger'
uint32_t const setTrigger = ((instruction.gpio > 0U) ? 1U : 0U);
uint32_t const trigger = setTrigger << instruction.gpio;
instruction.jmpCnt = newJmpCnt;
uint32_t const newPc = pc + pcOffset;
// Write updates to handler
handler.pc = (uint8_t)min(newPc, RDF_INST_COUNT - 1U); // pc - clamp to valid bounds
handler.trigCnt = newTrigCnt; // trigCnt
handler.gpioHistory = instruction.gpio;
return trigger;
}
/**
 * Advances the handler's trigger program by one step and issues the resulting trigger mask.
 *
 * @param handler Raster data flow state; updated by _cupvaRasterDataFlowAdvProg.
 */
inline void cupvaRasterDataFlowTrig(RasterDataFlowHandler &handler)
{
    cupvaDataFlowTrig(_cupvaRasterDataFlowAdvProg(handler));
}
/**
 * Waits on the GPIO recorded by the most recent trigger and clears that record.
 * A recorded GPIO of 0 means "nothing to wait for" and produces an empty mask.
 *
 * @param handler Raster data flow state; gpioHistory is reset to 0 before the sync is issued.
 */
inline void cupvaRasterDataFlowSync(RasterDataFlowHandler &handler)
{
    uint32_t const hasPending = (handler.gpioHistory > 0U) ? 1U : 0U;
    uint32_t const syncMask = hasPending << handler.gpioHistory;

    handler.gpioHistory = 0;
    cupvaDataFlowSync(syncMask);
}
/** Returns handler.linePitch converted to int32_t. */
inline int32_t cupvaRasterDataFlowGetLinePitch(RasterDataFlowHandler &handler)
{
    int32_t const pitch = (int32_t)handler.linePitch;
    return pitch;
}
/** Returns the RDF_TRANSFER_PROPERTIES_LAYOUT field of handler.transferProperties. */
inline uint8_t cupvaRasterDataFlowGetLayout(RasterDataFlowHandler &handler)
{
    uint8_t const layout = CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_LAYOUT, handler.transferProperties);
    return layout;
}
/** Returns true when the RDF_TRANSFER_PROPERTIES_IS_READ field of handler.transferProperties is non-zero. */
inline bool cupvaRasterDataFlowIsRead(RasterDataFlowHandler &handler)
{
    bool const isRead = (CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_IS_READ, handler.transferProperties) > 0U);
    return isRead;
}
/** Returns the RDF_TRANSFER_PROPERTIES_BPPLOG2 field of handler.transferProperties. */
inline uint32_t cupvaRasterDataFlowGetBppLog2(RasterDataFlowHandler &handler)
{
    uint32_t const bppLog2 =
        CUPVA_EXTRACT_FIELD(RDF_TRANSFER_PROPERTIES_BPPLOG2, (uint32_t)handler.transferProperties);
    return bppLog2;
}
/** Returns handler.cbLen, the length used for wrapping offsets in cupvaRasterDataFlowGetOffset. */
inline int32_t cupvaRasterDataFlowGetCbLen(RasterDataFlowHandler &handler)
{
    int32_t const length = handler.cbLen;
    return length;
}
/**
 * Returns the advancement for the current step and moves the handler to the next step.
 * While ixAdv differs from nxm1 the result is adv[0] and ixAdv is incremented; when they are equal the
 * result is adv[1] and ixAdv restarts at 0.
 *
 * @param handler Raster data flow state; ixAdv is updated.
 * @return adv[0] or adv[1] as described above.
 */
inline int32_t cupvaRasterDataFlowGetAdvance(RasterDataFlowHandler &handler)
{
    uint16_t const index = handler.ixAdv;
    uint16_t const lastIndex = handler.nxm1;
    bool const atLast = (index == lastIndex);

    uint16_t const nextIndex = atLast ? (uint16_t)0U : (uint16_t)(index + (uint16_t)1U);
    int32_t const step = atLast ? handler.adv[1] : handler.adv[0];

    handler.ixAdv = nextIndex;
    return step;
}
/**
 * Adds the next advancement to `offset` and wraps the sum into the circular buffer.
 * The sum is reduced by cbLen once if it reached cbLen, then increased by cbLen once if it is negative.
 *
 * @param handler Raster data flow state; advanced through cupvaRasterDataFlowGetAdvance.
 * @param offset  Current offset.
 * @return The wrapped new offset.
 */
inline int32_t cupvaRasterDataFlowGetOffset(RasterDataFlowHandler &handler, int32_t offset)
{
    int32_t const raw = cupvaRasterDataFlowGetAdvance(handler) + offset;
    int32_t const wrappedHigh = (raw >= handler.cbLen) ? (raw - handler.cbLen) : raw;
    int32_t const wrappedLow = (wrappedHigh < 0) ? (wrappedHigh + handler.cbLen) : wrappedHigh;
    return wrappedLow;
}
/**
 * Computes (linePitch * bpp - TRANSPOSE_HORIZONTAL_OFFSET(mode, bpp)) / VMEM_SUPERBANK_WIDTH_IN_BYTES.
 *
 * @param handler Source of linePitch.
 * @param mode    Transposition mode; narrowed to uint8_t for TRANSPOSE_HORIZONTAL_OFFSET.
 * @param bpp     Multiplier converting linePitch to bytes; also narrowed to uint8_t for the macro.
 * @return The quotient converted to int32_t.
 */
inline int32_t cupvaTransposeLaneOfst(RasterDataFlowHandler &handler, TranspositionMode mode, int32_t bpp)
{
    uint8_t const horizontalOffset = TRANSPOSE_HORIZONTAL_OFFSET((uint8_t)mode, (uint8_t)bpp);
    uint32_t const pitchBytes = (uint32_t)(handler.linePitch * bpp);
    uint32_t const lanes = (pitchBytes - (uint32_t)horizontalOffset) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
    return (int32_t)lanes;
}
// Declaration only; the definition is not part of this section.
// NOTE(review): presumably binds `vmemBuffer` to the handler before the first cupvaRasterDataFlowAcquire - confirm.
void cupvaRasterDataFlowOpen(UnifiedRDFHandler &handler, void *vmemBuffer);
// Acquires the next tile of a unified raster data flow and returns a pointer to it.
// In order, the function:
//  1. issues cupvaDataFlowSync(handler.nextSync) unless handler.deferSync is set,
//  2. computes the return pointer as handler.base + (handler.tileOffset << bppLog2),
//  3. reads one vector from handler.base,
//  4. issues cupvaDataFlowTrig(handler.nextTrig) unless handler.deferTrig is set,
//  5. advances hdl.ixAdv and handler.tileOffset by adv[0] (adv[1] when ixAdv == nxm1), wrapping once at cbLen,
//  6. writes the vector read in step 3 to handler.cbDst.
// When a defer flag is set the corresponding call is made by cupvaRasterDataFlowRelease instead.
// NOTE(review): the loads and stores look deliberately interleaved with the sync/trigger calls; keep the
// statement order when editing.
inline void *cupvaRasterDataFlowAcquire(UnifiedRDFHandler &handler)
{
// Trigger/sync values
uint32_t const shouldDeferSync = handler.deferSync;
uint32_t const shouldDeferTrig = handler.deferTrig;
uint32_t const sync = handler.nextSync;
uint32_t const trig = handler.nextTrig;
// Pointers for copying CB and returning
void *const base = handler.base;
void *const cbDst = handler.cbDst;
dvintx cbCopyData;
uint32_t const bppLog2 = cupvaRasterDataFlowGetBppLog2(handler.hdl);
// Scalar loads for tile advancement
uint16_t const ixAdvCurrent = handler.hdl.ixAdv;
uint16_t const nxm1Current = handler.hdl.nxm1;
int32_t const adv0 = handler.hdl.adv[0];
int32_t const adv1 = handler.hdl.adv[1];
uint32_t const tileOffsetCurrent = handler.tileOffset;
if (shouldDeferSync == 0U)
{
cupvaDataFlowSync(sync);
}
// Update offset step 1 - determine adv, load registers
int32_t const cbLen = handler.hdl.cbLen;
bool const isLast = (ixAdvCurrent == nxm1Current);
int32_t const adv = isLast ? adv1 : adv0;
uint16_t const ixAdv = isLast ? (uint16_t)0U : (ixAdvCurrent + (uint16_t)1U);
void *const retVal = (int8_t *)base + (tileOffsetCurrent << bppLog2);
// Vector read CB head
cbCopyData = sign_extend(*(dvint *)base);
if (shouldDeferTrig == 0U)
{
cupvaDataFlowTrig(trig);
}
// Update offset step 2 - compute new tileOffset with wrapping
handler.hdl.ixAdv = ixAdv;
int32_t const tileOffsetNew = adv + (int32_t)tileOffsetCurrent;
int32_t tileOffsetWrapped = (tileOffsetNew >= cbLen) ? (tileOffsetNew - cbLen) : tileOffsetNew;
// On host side, negative advancements with CB have been converted to positive advancements, so no need to check
// for condition tileOffsetNew < 0.
handler.tileOffset = (uint32_t)tileOffsetWrapped;
// Vector write CB tail
*(dvint *)cbDst = extract(cbCopyData);
// Return previously calculated tile offset as a pointer
return retVal;
}
// Releases the tile obtained from cupvaRasterDataFlowAcquire and advances the trigger program by one step.
//  - If handler.deferSync is set, issues cupvaDataFlowSync(handler.nextSync) here.
//  - If handler.deferTrig is set, issues cupvaDataFlowTrig(handler.nextTrig) here.
//  - Steps the trigger program with the same trigCnt / jmpCnt / PC rules as _cupvaRasterDataFlowAdvProg.
//  - Stores the trigger mask of the instruction at the entry PC in handler.nextTrig, and moves the mask that was
//    handler.nextTrig on entry into handler.nextSync.
// NOTE(review): the arithmetic looks deliberately interleaved with the sync/trigger calls; keep the statement
// order when editing.
inline void cupvaRasterDataFlowRelease(UnifiedRDFHandler &handler)
{
// Load data to registers
uint8_t const pc = handler.hdl.pc;
uint32_t const trigCnt = handler.hdl.trigCnt;
uint32_t const sync = handler.nextSync;
uint32_t const trig = handler.nextTrig;
uint32_t const shouldDeferTrig = handler.deferTrig;
uint32_t const shouldDeferSync = handler.deferSync;
// Next GPIO calculation part 1 - calculate PC jump
RasterDataFlowTrigInst &instruction = handler.hdl.trigProgram[pc];
uint32_t const trigRpt = instruction.trigRpt;
uint32_t const jmpCnt = instruction.jmpCnt;
int32_t const jmpOffset = instruction.jmpOffst;
uint32_t const newTrigCnt = (uint32_t)mod_inc((int32_t)trigCnt, (int32_t)trigRpt);
uint32_t const jumpDec = max(jmpCnt, 1U) - 1U; // Subtract 1, clamp to 0
if (shouldDeferSync != 0U)
{
cupvaDataFlowSync(sync);
}
// Next GPIO calculation part 2 - calculate next trigger
// If we are not repeating this trigger, update jmpCnt
uint32_t const newJmpCnt = (newTrigCnt != 0U) ? jmpCnt : jumpDec;
// If the updated jmpCnt is different to current jmpCnt, it means we should take the jump
uint32_t const applyJmp = (newJmpCnt != jmpCnt) ? 1U : 0U;
// Decide whether to 1) apply jump 2) advance PC or 3) stay with current PC
uint32_t const jumpOrAdv = (applyJmp != 0U) ? (uint32_t)jmpOffset : 1U;
uint32_t const pcOffset = (newTrigCnt != 0U) ? 0U : jumpOrAdv;
// GPIO[0] are not used for triggers so are used for 'no-trigger'
uint32_t const setNextTrig = ((instruction.gpio > 0U) ? 1U : 0U);
uint32_t const nextTrig = setNextTrig << instruction.gpio;
if (shouldDeferTrig != 0U)
{
cupvaDataFlowTrig(trig);
}
// Write updates to handler
instruction.jmpCnt = (uint8_t)newJmpCnt;
uint32_t const newPc = pc + pcOffset;
handler.hdl.pc = (uint8_t)min(newPc, RDF_INST_COUNT - 1U); // pc - clamp to valid bounds
handler.hdl.trigCnt = (uint8_t)newTrigCnt; // trigCnt
handler.nextTrig = nextTrig;
handler.nextSync = trig;
}
/** Issues a final cupvaDataFlowSync on the mask stored in handler.nextSync. */
inline void cupvaRasterDataFlowClose(UnifiedRDFHandler &handler)
{
    uint32_t const outstanding = handler.nextSync;
    cupvaDataFlowSync(outstanding);
}
/** UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl. */
inline int32_t cupvaRasterDataFlowGetLinePitch(UnifiedRDFHandler &handler)
{
    int32_t const pitch = cupvaRasterDataFlowGetLinePitch(handler.hdl);
    return pitch;
}
/** UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl. */
inline int32_t cupvaRasterDataFlowGetCbLen(UnifiedRDFHandler &handler)
{
    int32_t const length = cupvaRasterDataFlowGetCbLen(handler.hdl);
    return length;
}
/** UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl. */
inline int32_t cupvaTransposeLaneOfst(UnifiedRDFHandler &handler, TranspositionMode mode, int32_t bpp)
{
    int32_t const laneOffset = cupvaTransposeLaneOfst(handler.hdl, mode, bpp);
    return laneOffset;
}
/** UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl. */
inline bool cupvaRasterDataFlowIsRead(UnifiedRDFHandler &handler)
{
    bool const isRead = cupvaRasterDataFlowIsRead(handler.hdl);
    return isRead;
}
/** UnifiedRDFHandler overload: forwards to the RasterDataFlowHandler version using handler.hdl. */
inline uint32_t cupvaRasterDataFlowGetBppLog2(UnifiedRDFHandler &handler)
{
    uint32_t const bppLog2 = cupvaRasterDataFlowGetBppLog2(handler.hdl);
    return bppLog2;
}
// Constants describing the block-linear (BL) layout used by the surface address helpers below.
// The masks and strides map coordinate bits to bits of the offset inside one 64x8 GOB:
//   x[3:0] -> offset[3:0], y[0] -> offset[4], x[4] -> offset[5], y[2:1] -> offset[7:6], x[5] -> offset[8].
// GOB width
#define BL_GOBW UINT32_C(64)
// GOB width in log2
#define BL_GOBW_LOG2 UINT32_C(6)
// GOB height
#define BL_GOBH UINT32_C(8)
// GOB height in log2
#define BL_GOBH_LOG2 UINT32_C(3)
// GOB size
#define BL_GOB_SZ (BL_GOBW * BL_GOBH)
// GOB size in log2
#define BL_GOB_SZ_LOG2 ((uint32_t)BL_GOBW_LOG2 + (uint32_t)BL_GOBH_LOG2)
// GOB sector width
#define BL_GOB_SECW UINT32_C(16)
// GOB sector width in log2
#define BL_GOB_SECW_LOG2 UINT32_C(4)
// GOB sector height
#define BL_GOB_SECH UINT32_C(2)
// GOB sector size
#define BL_GOB_SEC_SZ (BL_GOB_SECW * BL_GOB_SECH)
// GOB packet mask
#define BL_GOB_PACK_MASK (BL_GOBW >> 1)
// GOB packet stride
#define BL_GOB_PACK_STRIDE UINT32_C(8)
// GOB sub-packet vertical mask
#define BL_GOB_SUBPACK_VER_MASK UINT32_C(6)
// GOB sub-packet vertical stride
#define BL_GOB_SUBPACK_VER_STRIDE UINT32_C(32)
// GOB sub-packet horizontal mask
#define BL_GOB_SUBPACK_HOR_MASK (BL_GOB_SEC_SZ >> 1)
// GOB sub-packet horizontal stride
#define BL_GOB_SUBPACK_HOR_STRIDE UINT32_C(2)
// GOB sector vertical mask
#define BL_GOB_SEC_VER_MASK (BL_GOB_SECH - UINT32_C(1))
// GOB sector vertical stride
#define BL_GOB_SEC_VER_STRIDE UINT32_C(16)
// GOB sector horizontal mask
#define BL_GOB_SEC_HOR_MASK (BL_GOB_SECW - UINT32_C(1))
// Swizzle bit which should be set on non-native builds
#define BL_SWIZZLE_BIT 39U
// Addresses from cupvaSurfaceAddress2D will fill metadata starting from this bit
#define BL_ADDR_METADATA_SHIFT 40U
// Address bits 13:9
// Written as a "high : low" pair for use with CUPVA_FIELD_MASK / CUPVA_FIELD_SHIFT (defined elsewhere).
#define SBADR_IOVA_RANGE \
UINT32_C(13) \
: UINT32_C(9)
#define BL_SBADR_IOVA_MASK (CUPVA_FIELD_MASK(SBADR_IOVA_RANGE) << CUPVA_FIELD_SHIFT(SBADR_IOVA_RANGE))
// Flag to indicate that the address is a BL address
#define BL_METADATA_FLAG_BIT 8U
#define BL_METADATA_FLAG (1UL << BL_METADATA_FLAG_BIT)
// Largest block height (log2, in GOBs) accepted by the surface address helpers
#define BL_MAX_BLOCKHEIGHT_LOG2 (5)
// Vector block-linear offset for a scalar line pitch.
// For every lane the result is swizzleBit + gobBase + gobOffset, where
//  - gobBase selects the GOB: the block row (y >> linesPerBlockLog2) scaled by the block size and by the number
//    of GOBs per line (linePitch >> BL_GOBW_LOG2), plus the GOB column (x >> BL_GOBW_LOG2) scaled by the block
//    size, plus the GOB index inside the block scaled by the GOB size;
//  - gobOffset is the position inside the GOB assembled from masked x / y bits and the BL_GOB_* strides;
//  - swizzleBit is bit BL_SWIZZLE_BIT on QNX / L4T builds with CUPVA_PVA_GEN_NUMBER <= 2 and zero otherwise.
// blockHeightLog2 is the block height in GOBs, log2.
// NOTE(review): vmaddwhw(a, b, c, 0, 1) is read here as a * b + c per lane - confirm against the intrinsic
// reference.
inline vintx _cupvaBlocklinearOffset2D(vintx const &xCoords, vintx const &yCoords, uint32_t const linePitch,
uint8_t const blockHeightLog2)
{
uint32_t const widthInGobs = linePitch >> (uint32_t)BL_GOBW_LOG2;
int32_t const blockSizeLog2 = (int32_t)BL_GOB_SZ_LOG2 + (int32_t)blockHeightLog2;
uint32_t const linesPerBlock = (uint32_t)BL_GOBH << (uint32_t)blockHeightLog2;
int32_t const linesPerBlockLog2 = (int32_t)BL_GOBH_LOG2 + (int32_t)blockHeightLog2;
vintx x = xCoords;
vintx y = yCoords;
vintx vgobBase = (((y & (linesPerBlock - 1U)) >> (int32_t)BL_GOBH_LOG2) << (int32_t)BL_GOB_SZ_LOG2) +
((x >> (int32_t)BL_GOBW_LOG2) << blockSizeLog2);
vgobBase = vmaddwhw((y >> linesPerBlockLog2) << blockSizeLog2, (int32_t)widthInGobs, vgobBase, 0, 1);
vintx vgobOffset =
vmaddwhw(y & BL_GOB_SEC_VER_MASK, (int32_t)BL_GOB_SEC_VER_STRIDE, (x & BL_GOB_SEC_HOR_MASK), 0, 1);
vgobOffset = vmaddwhw(x & BL_GOB_PACK_MASK, (int32_t)BL_GOB_PACK_STRIDE, vgobOffset, 0, 1);
vgobOffset = vmaddwhw(y & BL_GOB_SUBPACK_VER_MASK, (int32_t)BL_GOB_SUBPACK_VER_STRIDE, vgobOffset, 0, 1);
vgobOffset = vmaddwhw(x & BL_GOB_SUBPACK_HOR_MASK, (int32_t)BL_GOB_SUBPACK_HOR_STRIDE, vgobOffset, 0, 1);
#if ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
vintx swizzleBit = (replicatew(1) << (int32_t)BL_SWIZZLE_BIT);
#else
vintx swizzleBit = replicatew(0);
#endif
return (swizzleBit + vgobBase + vgobOffset);
}
// Vector block-linear offset for a per-lane line pitch.
// Identical to the scalar-pitch overload except that the number of GOBs per line (vLinePitch >> BL_GOBW_LOG2) is
// a vector, so each lane can use its own line pitch when the block row is scaled.
// The result per lane is swizzleBit + gobBase + gobOffset; swizzleBit is bit BL_SWIZZLE_BIT on QNX / L4T builds
// with CUPVA_PVA_GEN_NUMBER <= 2 and zero otherwise.
// NOTE(review): vmaddwhw(a, b, c, 0, 1) is read here as a * b + c per lane - confirm against the intrinsic
// reference.
inline vintx _cupvaBlocklinearOffset2D(vintx const &xCoords, vintx const &yCoords, vintx const &vLinePitch,
uint8_t const blockHeightLog2)
{
vintx const widthInGobs = vLinePitch >> (uint32_t)BL_GOBW_LOG2;
int32_t const blockSizeLog2 = (int32_t)BL_GOB_SZ_LOG2 + (int32_t)blockHeightLog2;
uint32_t const linesPerBlock = (uint32_t)BL_GOBH << (uint32_t)blockHeightLog2;
int32_t const linesPerBlockLog2 = (int32_t)BL_GOBH_LOG2 + (int32_t)blockHeightLog2;
vintx x = xCoords;
vintx y = yCoords;
vintx vgobBase = (((y & (linesPerBlock - 1U)) >> (int32_t)BL_GOBH_LOG2) << (int32_t)BL_GOB_SZ_LOG2) +
((x >> (int32_t)BL_GOBW_LOG2) << blockSizeLog2);
vgobBase = vmaddwhw((y >> linesPerBlockLog2) << blockSizeLog2, widthInGobs, vgobBase, 0, 1);
vintx vgobOffset =
vmaddwhw(y & BL_GOB_SEC_VER_MASK, (int32_t)BL_GOB_SEC_VER_STRIDE, (x & BL_GOB_SEC_HOR_MASK), 0, 1);
vgobOffset = vmaddwhw(x & BL_GOB_PACK_MASK, (int32_t)BL_GOB_PACK_STRIDE, vgobOffset, 0, 1);
vgobOffset = vmaddwhw(y & BL_GOB_SUBPACK_VER_MASK, (int32_t)BL_GOB_SUBPACK_VER_STRIDE, vgobOffset, 0, 1);
vgobOffset = vmaddwhw(x & BL_GOB_SUBPACK_HOR_MASK, (int32_t)BL_GOB_SUBPACK_HOR_STRIDE, vgobOffset, 0, 1);
#if ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
vintx swizzleBit = (replicatew(1) << (int32_t)BL_SWIZZLE_BIT);
#else
vintx swizzleBit = replicatew(0);
#endif
return (swizzleBit + vgobBase + vgobOffset);
}
/**
 * Computes the 64-bit address of position (x, y) in a surface.
 *
 * Both a block-linear and a pitch-linear offset are computed and the one matching
 * surfData.metadata.format is selected (format == 1 selects block-linear).
 *
 * Block-linear offset = metadata bits + optional swizzle bit + GOB base + offset inside the GOB.
 * The metadata bits are address bits 13:9 of the surface address plus the BL flag (bit 8), shifted so
 * that the flag lands on bit BL_ADDR_METADATA_SHIFT. The swizzle bit (BL_SWIZZLE_BIT) is set only on
 * QNX / L4T builds with CUPVA_PVA_GEN_NUMBER <= 2.
 *
 * Pitch-linear offset = y * linePitch + x.
 *
 * The assertions are active in native builds only. They accept linePitch == 0 without dividing by it
 * and evaluate the pitch-linear overflow check in 64 bits, like the computation they guard.
 *
 * @param surfData Surface pointer (base, offset) and metadata (linePitch, blockHeightLog2, format).
 * @param x        Horizontal coordinate, added unscaled to the line start.
 * @param y        Line index.
 * @return surfData.pointer.base + surfData.pointer.offset + the selected offset.
 */
inline uint64_t cupvaSurfaceAddress2D(VPUSurfaceData const &surfData, uint32_t const x, uint32_t const y)
{
    CUPVA_VPU_ASSERT(surfData.metadata.linePitch >= 0);
    CUPVA_VPU_ASSERT(surfData.metadata.blockHeightLog2 >= 0 &&
                     surfData.metadata.blockHeightLog2 <= BL_MAX_BLOCKHEIGHT_LOG2);
    uint32_t const widthInGobs = surfData.metadata.linePitch >> BL_GOBW_LOG2;
    uint8_t const blockHeightLog2 = (uint8_t)(surfData.metadata.blockHeightLog2 & 0x7FU);
    uint8_t const blockSizeLog2 = BL_GOB_SZ_LOG2 + blockHeightLog2;
    uint32_t const linesPerBlock = ((uint32_t)BL_GOBH) << blockHeightLog2;
    uint8_t const linesPerBlockLog2 = BL_GOBH_LOG2 + blockHeightLog2;
    CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - ((((y >> (uint32_t)linesPerBlockLog2) << (uint32_t)blockSizeLog2) * widthInGobs) +
                                       ((x >> (uint32_t)BL_GOBW_LOG2) << (uint32_t)blockSizeLog2))) >=
                     ((y & (linesPerBlock - 1U)) >> BL_GOBH_LOG2));
    // Start of the GOB containing (x, y): block row, GOB column, then GOB index inside the block.
    uint64_t const gobBase =
        ((((uint64_t)y >> (uint64_t)linesPerBlockLog2) << (uint64_t)blockSizeLog2) * (uint64_t)widthInGobs) +
        (((uint64_t)x >> (uint64_t)BL_GOBW_LOG2) << (uint64_t)blockSizeLog2) +
        ((((uint64_t)y & ((uint64_t)linesPerBlock - 1UL)) >> (uint64_t)BL_GOBH_LOG2) << (uint64_t)BL_GOB_SZ_LOG2);
    CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFF) - surfData.pointer.base) >= surfData.pointer.offset);
    ExtMemIova va;
    va.val = surfData.pointer.base + surfData.pointer.offset;
    // blBaddr needs bits 13:9, plus set bit 8 to indicate BL mode, to be shifted to bit 40
    uint32_t const blMetadata = (va.iova.addrLo & BL_SBADR_IOVA_MASK) | BL_METADATA_FLAG;
    uint64_t const blMetadataPacked = (uint64_t)blMetadata << (BL_ADDR_METADATA_SHIFT - BL_METADATA_FLAG_BIT);
    // Position inside the GOB, assembled from masked coordinate bits.
    uint32_t const packOffset = (x & BL_GOB_PACK_MASK) * BL_GOB_PACK_STRIDE;
    uint32_t const subpackOffset = ((y & BL_GOB_SUBPACK_VER_MASK) * BL_GOB_SUBPACK_VER_STRIDE) +
                                   ((x & BL_GOB_SUBPACK_HOR_MASK) * BL_GOB_SUBPACK_HOR_STRIDE);
    uint32_t const secOffset = ((y & BL_GOB_SEC_VER_MASK) * BL_GOB_SEC_VER_STRIDE) + (x & BL_GOB_SEC_HOR_MASK);
    CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - packOffset) >= subpackOffset);
    CUPVA_VPU_ASSERT(((0xFFFFFFFFU) - packOffset - subpackOffset) >= secOffset);
    uint64_t const gobOffset = (uint64_t)packOffset + (uint64_t)subpackOffset + (uint64_t)secOffset;
    uint64_t const swizzleBit =
        ((CUPVA_BUILD_MODE == CUPVA_QNX || CUPVA_BUILD_MODE == CUPVA_L4T) && CUPVA_PVA_GEN_NUMBER <= 2)
            ? (1ULL << BL_SWIZZLE_BIT)
            : 0ULL;
    uint64_t const BLOffset = blMetadataPacked + swizzleBit + gobBase + gobOffset;
    uint64_t const linePitch64 = (uint64_t)surfData.metadata.linePitch;
    // A zero line pitch cannot overflow, and must not be used as a divisor.
    CUPVA_VPU_ASSERT((linePitch64 == 0UL) || (((0xFFFFFFFFFFFFFFFFUL) / linePitch64) >= y));
    CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFFUL) - (linePitch64 * (uint64_t)y)) >= x);
    uint64_t const PLOffset = ((uint64_t)y * linePitch64) + (uint64_t)x;
    uint64_t const offset = (surfData.metadata.format == 1U) ? BLOffset : PLOffset;
    return (va.val + offset);
}
/**
 * Vector overload: computes one surface address per lane.
 *
 * coords.hi supplies the x coordinates and coords.lo the y coordinates. For every lane both a
 * block-linear offset (via _cupvaBlocklinearOffset2D, plus the packed metadata bits) and a pitch-linear
 * offset (y * linePitch + x) are computed, and vmux picks between them based on
 * surfData.metadata.format. The result is the surface address (pointer.base + pointer.offset) plus the
 * selected offset.
 *
 * The block height assertion uses BL_MAX_BLOCKHEIGHT_LOG2, matching the scalar overload.
 *
 * @param surfData Surface pointer (base, offset) and metadata (linePitch, blockHeightLog2, format).
 * @param coords   Per-lane coordinates: hi = x, lo = y.
 * @return Per-lane addresses.
 */
inline vintx cupvaSurfaceAddress2D(VPUSurfaceData const &surfData, dvintx coords)
{
    CUPVA_VPU_ASSERT(surfData.metadata.linePitch >= 0);
    CUPVA_VPU_ASSERT(surfData.metadata.blockHeightLog2 >= 0 &&
                     surfData.metadata.blockHeightLog2 <= BL_MAX_BLOCKHEIGHT_LOG2);
    vintx x = coords.hi;
    vintx y = coords.lo;
    CUPVA_VPU_ASSERT(((0xFFFFFFFFFFFFFFFF) - surfData.pointer.base) >= surfData.pointer.offset);
    ExtMemIova va;
    va.val = surfData.pointer.base + surfData.pointer.offset;
    // blBaddr needs bits 13:9, plus set bit 8 to indicate BL mode, to be shifted to bit 40
    uint32_t const blMetadata = (va.iova.addrLo & BL_SBADR_IOVA_MASK) | BL_METADATA_FLAG;
    vintx blMetadataPacked = replicatew(blMetadata) << (int32_t)(BL_ADDR_METADATA_SHIFT - BL_METADATA_FLAG_BIT);
    vintx vBLOffset =
        _cupvaBlocklinearOffset2D(coords.hi, coords.lo, surfData.metadata.linePitch, surfData.metadata.blockHeightLog2);
    vBLOffset = vBLOffset + blMetadataPacked;
    vintx vPLOffset = vmaddwhw(y, replicatew(surfData.metadata.linePitch), x, 0, 1);
    vintx voffset = vmux(replicatew((int32_t)surfData.metadata.format), vBLOffset, vPLOffset);
    // Rebuild the 64-bit surface address per lane from its low and high 32-bit halves.
    vintx vBaseAddr = replicatew(va.iova.addrLo) | (replicatew((int32_t)va.iova.addrHi) << 32);
    return (vBaseAddr + voffset);
}
// Declarations only; the definitions are not part of this section.
// NOTE(review): judging by the names, the first toggles error reporting for floating-point NaNs and the second
// prefetches `size` units of instruction cache starting at a word address - confirm against the definitions.
void cupvaFloatingPointNANErrorEnabled(bool enable);
void cupvaICachePrefetch(uintptr_t addr_in_words, uint32_t size);
#endif // CUPVA_DEVICE_CORE_H