sampler.h

Fully qualified name: src/device/vpu_runtime/include/cupva_device/sampler.h

File members: src/device/vpu_runtime/include/cupva_device/sampler.h

/*
 * Copyright (c) 2023 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */

 #ifndef CUPVA_SAMPLER_H
 #define CUPVA_SAMPLER_H

 #include "device_core.h"

 #include <cupva_types.h>
 #include <stddef.h>
 #include <stdint.h>

 #if CUPVA_BUILD_MODE == CUPVA_NATIVE
 #    include <dlut_defs.h>
 #endif

 // GPIO bit mask used for the sampler START/DONE handshake in cupvaSamplerStart() and cupvaSamplerWait().
 #define CUPVA_SAMPLER_GPIO 1024
 // Second argument of cp_store() when cupvaSamplerStart() publishes the head of the task list.
 #define CUPVA_SAMPLER_CFG_ADDR 0x800
 // Default AMOD1 value the setup functions write to index_agen_AMOD1 / output_agen_AMOD1.
 #define CUPVA_SAMPLER_AMOD1 VMEM_SUPERBANK_WIDTH_IN_BYTES
 // OR'ed into CupvaSampler::task_info to set bit 0 (dlut_duplicate_hndl).
 #define CUPVA_SAMPLER_DUPLICATE_HANDLING_EN 1U
 // Bit ranges written as "high : low". They are not valid expressions on their own and are only
 // consumed through CUPVA_FIELD_SHIFT() below (macro defined outside this header).
 #define CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_RANGE 1U : 1U
 #define CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_RANGE 3U : 2U
 #define CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_RANGE 6U : 4U
 #define CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_RANGE 9U : 8U
 #define CUPVA_SAMPLER_TASKINFO_TASK_MODE_RANGE 14U : 12U
 // NOTE(review): "RANGER" looks like a typo for "RANGE"; the name is kept because
 // CUPVA_SAMPLER_AUTOIDX_MODE_SHIFT references it and other files may as well.
 #define CUPVA_SAMPLER_AUTOIDX_MODE_RANGER 4U : 4U
 // NOTE(review): no matching *_SHIFT macro is defined in this header for the traversal direction.
 #define CUPVA_SAMPLER_AUTOIDX_TRAVERSAL_DIR_RANGE 0U : 0U
 #define CUPVA_SAMPLER_OUTPUT_TRANS_MODE_RANGE 6U : 4U

 // Shift amounts derived from the ranges above; presumably the low bit of each range — confirm
 // against the definition of CUPVA_FIELD_SHIFT.
 #define CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_RANGE)
 #define CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT \
     CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_RANGE)
 #define CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_TASK_MODE_RANGE)
 #define CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_RANGE)
 #define CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_RANGE)
 #define CUPVA_SAMPLER_AUTOIDX_MODE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_AUTOIDX_MODE_RANGER)
 #define CUPVA_SAMPLER_OUTPUT_TRANS_MODE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_OUTPUT_TRANS_MODE_RANGE)

 // Ceiling division of _a by (_b >> _c): the number of (_b >> _c)-sized steps needed to cover _a.
 // Each argument is evaluated more than once, so pass side-effect-free expressions only.
 // The divisor (_b >> _c) must be non-zero.
 #define CUPVA_SAMPLER_ALIGNED(_a, _b, _c) (((_a) + ((_b) >> (_c)) - 1U) / ((_b) >> (_c)))

 // Descriptor for a single sampler task. Tasks are chained through next_task (see cupvaSamplerLink)
 // and the head of the chain is handed to the sampler by cupvaSamplerStart().
 // The cupvaSamplerSetup() overloads fill in the fields; reserved_* members are never written by
 // this header.
 // NOTE(review): field order and padding presumably mirror a fixed hardware / dlut_param_cfg
 // layout (cupvaSamplerStart casts this struct to dlut_param_cfg in native builds) — do not
 // reorder or resize members without confirming.
 struct CupvaSampler
 {
     uint8_t fraction_bits; // 0 ~ 16.
     uint8_t reserved_0;
     // containing
     //   bits 14:12  dlut_task_mode
     //   bits  9: 8  dlut_index_type
     //   bits  6: 4  dlut_entry_type
     //   bits  3: 2  dlut_out_of_range_hndl
     //   bit      1  dlut_idx_round_mode
     //   bit      0  dlut_duplicate_hndl
     uint16_t task_info;
     uint32_t x_int_limit; // U18, closed interval [0, x_int_limit << fraction_bits]
     uint32_t y_int_limit; // U18, closed interval [0, y_int_limit << fraction_bits]
     int32_t x_offset;
     int32_t y_offset;
     uint16_t task_len_N1; // number of pixels per round
     uint16_t task_len_N2; // number of rounds
     uint32_t sentinel_value; // set from the input's outOfRangeVal by the setup functions
     const void *table_addr;      // address of table, 64 bytes aligned
     ptrdiff_t table_addr_offset; // address update between rounds, 64 bytes aligned
     uint32_t table_line_pitch;   // in pixels, for conflict-free-2D mode it's 4K+2, otherwise any number

     // Auto-index (tile) parameters; only written by the CupvaSamplerTiles setup overload.
     uint8_t auto_idx_patch_height;
     uint8_t auto_idx_patch_width;
     // containing
     //      bit 0: TRAVERSAL_DIR
     //      bit 4: MODE (step scale needed or not)
     uint8_t auto_idx_fmt;
     uint8_t reserved_1;

     uint16_t index_agen_lane_ofst; // U12, for transposed access
     uint8_t reserved_2;
     // containing
     //   bits  6: 4  dlut_agen_transp_mode
     //   bit      0  dlut_xy_idx_intrlv_fmt
     uint8_t index_agen_fmt;

     uint16_t output_agen_lane_ofst; // U12, for transposed access
     uint8_t reserved_3;
     // containing
     //   bits  6: 4  dlut_agen_transp_mode
     uint8_t output_agen_fmt;

     uint32_t reserved_4[3];

     // Index stream: base address plus the two address modifiers computed by the setup functions.
     const void *index_agen_addr;
     ptrdiff_t index_agen_AMOD1;
     ptrdiff_t index_agen_AMOD2;

     // Output stream: base address plus the two address modifiers computed by the setup functions.
     void *output_agen_addr;
     ptrdiff_t output_agen_AMOD1;
     ptrdiff_t output_agen_AMOD2;

     uint32_t reserved_5;
     const CupvaSampler *next_task; // next task in the chain, NULL after setup

 #if CUPVA_BUILD_MODE == CUPVA_NATIVE
     // Address range check in native mode
     // [begin, end)
     const void *inputAddrStart;
     const void *inputAddrEnd;
     const void *indexAddrStart;
     const void *indexAddrEnd;
     const void *outputAddrStart;
     const void *outputAddrEnd;
     const void *transformationMatrixAddrStart;
     const void *transformationMatrixAddrEnd;
 #endif

 };

 // Task mode encodings. The setup functions shift the selected value into CupvaSampler::task_info
 // with CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT (documented there as bits 14:12, dlut_task_mode).
 // Only LOOKUP_1D/2D, INTERP_1D/2D and INTERP_2D_AUTO_IDX are selected by the setup functions in
 // this header.
 enum CupvaSamplerMode
 {
     SAMPLER_LOOKUP_1D = 0,
     SAMPLER_LOOKUP_2D = 1,
     SAMPLER_INTERP_1D = 2,
     SAMPLER_INTERP_2D = 3,
     SAMPLER_CONFLICT_FREE_2D_INTERP = 4,
     SAMPLER_TABLE_REFORMAT = 5,
     SAMPLER_INTERP_2D_AUTO_IDX = 6
 };

 // Element type of the input table. The value is shifted into CupvaSampler::task_info with
 // CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT, and _cupvaSamplerGetInputBppLog2() maps it to
 // log2(bytes per element). Values 3 and 7 are not defined here.
 enum CupvaSamplerInputType
 {
     SAMPLER_INPUT_TYPE_S8 = 0,
     SAMPLER_INPUT_TYPE_S16 = 1,
     SAMPLER_INPUT_TYPE_S32 = 2,
     SAMPLER_INPUT_TYPE_U8 = 4,
     SAMPLER_INPUT_TYPE_U16 = 5,
     SAMPLER_INPUT_TYPE_U32 = 6
 };

 // Element type of the index stream. The value is shifted into CupvaSampler::task_info with
 // CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT, and _cupvaSamplerGetIndexBppLog2() maps it to
 // log2(bytes per index). Value 0 is not defined here.
 enum CupvaSamplerIndexType
 {
     SAMPLER_INDEX_TYPE_U16 = 1,
     SAMPLER_INDEX_TYPE_U32 = 2,
 };

 // How the fractional part of an index is handled.
 // TRUNCATE and ROUND are written to the task_info rounding field as-is. INTERPOLATE is never
 // written there: the 1D/2D setup functions select an INTERP task mode instead and write 0 to
 // the rounding field.
 enum CupvaSamplerIndexRoundingMode
 {
     SAMPLER_FRAC_HANDLING_TRUNCATE = 0,
     SAMPLER_FRAC_HANDLING_ROUND = 1,
     SAMPLER_FRAC_HANDLING_INTERPOLATE = 2,
 };

 // Layout of X/Y index pairs for 2D indices. The 2D setup function writes the value directly
 // to CupvaSampler::index_agen_fmt (bit 0 is documented there as dlut_xy_idx_intrlv_fmt).
 // NOTE(review): presumably ELEMENTS interleaves per index element and 32B per 32-byte group —
 // confirm against the hardware documentation.
 enum CupvaSamplerIndexInterleavingMode
 {
     SAMPLER_INTERLEAVING_ELEMENTS = 0,
     SAMPLER_INTERLEAVING_32B = 1,
 };

 // Tile descriptor type for auto-index mode. The tile setup function shifts the value into
 // CupvaSampler::auto_idx_fmt with CUPVA_SAMPLER_AUTOIDX_MODE_SHIFT (documented there as
 // "MODE (step scale needed or not)").
 enum CupvaSamplerTileType
 {
     SAMPLER_TILE_TRANS_ONLY = 0,
     SAMPLER_TILE_TRANS_SCALE = 1,
 };

 // Traversal direction for auto-index tiles. The tile setup function ORs the value unshifted
 // into CupvaSampler::auto_idx_fmt (bit 0 is documented there as TRAVERSAL_DIR).
 enum CupvaSamplerTileTraversalMode
 {
     SAMPLER_TRAVERSAL_STANDARD = 0,
     SAMPLER_TRAVERSAL_TRANSPOSED = 1,
 };

 // Out-of-range handling selection. The value is shifted into CupvaSampler::task_info with
 // CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT. Value 0 is not defined here.
 // NOTE(review): presumably CONSTANT substitutes the input's outOfRangeVal (stored as
 // sentinel_value) and PREDICATE_OFF suppresses the output write — confirm.
 enum CupvaSamplerTaskOutOfRangeHandlingMode
 {
     SAMPLER_OUT_OF_RANGE_CONSTANT = 1,
     SAMPLER_OUT_OF_RANGE_PREDICATE_OFF = 2,
 };

 // Bit flags for CupvaSamplerInput2D::flags.
 // NO_DEFAULT_LINE_PITCH: when set, the 2D-indices setup overload uses linePitch exactly as given,
 // even when it is 0, instead of substituting the input width. The tiles overload does not read
 // this flag.
 enum CupvaSamplerInput2DFlags
 {
     NO_DEFAULT_LINE_PITCH = 1 << 0,
 };

 // Describes a 1D input table for cupvaSamplerSetup().
 struct CupvaSamplerInput1D
 {
     void const *data;                                      // table base; copied to table_addr
     CupvaSamplerInputType type;                            // element type of the table (and of the output)
     uint32_t length;                                       // copied to x_int_limit
     ptrdiff_t inputAdv;                                    // copied to table_addr_offset
     CupvaSamplerTaskOutOfRangeHandlingMode outOfRangeMode; // written to the task_info out-of-range field
     uint32_t outOfRangeVal;                                // copied to sentinel_value
 };

 // Describes a 2D input table for the 2D-indices and tiles overloads of cupvaSamplerSetup().
 struct CupvaSamplerInput2D
 {
     void const *data;                                      // table base; copied to table_addr
     CupvaSamplerInputType type;                            // element type of the table (and of the output)
     uint32_t width;                                        // copied to x_int_limit
     uint32_t height;                                       // copied to y_int_limit
     uint32_t linePitch;                                    // copied to table_line_pitch; 0 selects width by default
     ptrdiff_t inputAdv;                                    // copied to table_addr_offset
     CupvaSamplerTaskOutOfRangeHandlingMode outOfRangeMode; // written to the task_info out-of-range field
     uint32_t outOfRangeVal;                                // copied to sentinel_value
     uint32_t flags;                                        // CupvaSamplerInput2DFlags bits
 };

 // Describes the index stream for a 1D lookup/interpolation task.
 struct CupvaSamplerIndices1D
 {
     void const *data;                                 // index base; copied to index_agen_addr
     CupvaSamplerIndexType type;                       // index element type
     uint16_t width;                                   // copied to task_len_N1
     uint16_t height;                                  // copied to task_len_N2
     uint16_t linePitch;                               // in indices; 0 means "use width"
     uint8_t fractionalBits;                           // copied to fraction_bits
     CupvaSamplerIndexRoundingMode fractionalHandling; // INTERPOLATE selects SAMPLER_INTERP_1D
     int32_t offset;                                   // copied to x_offset
 };

 // Describes the index stream for a 2D lookup/interpolation task.
 struct CupvaSamplerIndices2D
 {
     void const *data;                                 // index base; copied to index_agen_addr
     CupvaSamplerIndexType type;                       // index element type
     uint16_t width;                                   // copied to task_len_N1
     uint16_t height;                                  // copied to task_len_N2
     uint16_t linePitch;                               // in index pairs; 0 means "use width"
     uint8_t fractionalBits;                           // copied to fraction_bits
     CupvaSamplerIndexRoundingMode fractionalHandling; // INTERPOLATE selects SAMPLER_INTERP_2D
     int32_t offsetX;                                  // copied to x_offset
     int32_t offsetY;                                  // copied to y_offset
     CupvaSamplerIndexInterleavingMode interleaving;   // copied to index_agen_fmt
 };

 // Describes the tile list for an auto-index 2D interpolation task (SAMPLER_INTERP_2D_AUTO_IDX).
 struct CupvaSamplerTiles
 {
     void const *data;                                // tile descriptor base; copied to index_agen_addr
     CupvaSamplerTileType type;                       // written to the MODE bit of auto_idx_fmt
     uint8_t width;                                   // copied to auto_idx_patch_width
     uint8_t height;                                  // copied to auto_idx_patch_height
     uint16_t count;                                  // copied to task_len_N2 (task_len_N1 is width * height)
     uint8_t fractionalBits;                          // copied to fraction_bits
     int32_t offsetX;                                 // copied to x_offset
     int32_t offsetY;                                 // copied to y_offset
     CupvaSamplerTileTraversalMode tileTraversalMode; // OR'ed into the low bits of auto_idx_fmt
 };

 // Describes the output buffer of a sampler task.
 struct CupvaSamplerOutput
 {
     void *data;     // output base; copied to output_agen_addr
     uint16_t pitch; // in elements; the setup functions convert it to bytes using the input element size
     // TranspositionMode is declared outside this header. The setup functions only distinguish
     // TRANS_MODE_NONE from "anything else", which is programmed as transposition mode 1.
     TranspositionMode transMode;
 };

 // Returns log2 of the element size in bytes for an input table type:
 // 8-bit types -> 0, 16-bit types -> 1, 32-bit types -> 2.
 // Any value outside the enumerators is treated like an 8-bit type (0).
 inline uint16_t _cupvaSamplerGetInputBppLog2(CupvaSamplerInputType const type)
 {
     if ((type == SAMPLER_INPUT_TYPE_S32) || (type == SAMPLER_INPUT_TYPE_U32))
     {
         return 2U;
     }

     if ((type == SAMPLER_INPUT_TYPE_S16) || (type == SAMPLER_INPUT_TYPE_U16))
     {
         return 1U;
     }

     // S8, U8 and unknown values: one byte per element.
     return 0U;
 }

 // Returns log2 of the index size in bytes: U16 -> 1, U32 -> 2.
 // Any other value yields 0.
 inline uint16_t _cupvaSamplerGetIndexBppLog2(CupvaSamplerIndexType const type)
 {
     if (type == SAMPLER_INDEX_TYPE_U32)
     {
         return 2U;
     }

     if (type == SAMPLER_INDEX_TYPE_U16)
     {
         return 1U;
     }

     // Unknown index type: fall back to log2(bpp) = 0.
     return 0U;
 }

 // Returns the extra log2 factor applied to the index size for a task mode:
 // 1 for SAMPLER_LOOKUP_2D, SAMPLER_INTERP_2D and SAMPLER_CONFLICT_FREE_2D_INTERP,
 // 0 for every other mode (including values not listed in the enum).
 inline uint16_t _cupvaSamplerGetIndexPpeLog2(CupvaSamplerMode const mode)
 {
     if ((mode == SAMPLER_LOOKUP_2D) || (mode == SAMPLER_INTERP_2D) || (mode == SAMPLER_CONFLICT_FREE_2D_INTERP))
     {
         return 1U;
     }

     // 1D modes, table reformat and anything else.
     return 0U;
 }

 // Computes the second-level address modifier (AMOD2).
 //
 // The number of first-level steps is ceil(n1Iters / (CUPVA_SAMPLER_AMOD1 >> bpe)). The result is
 // byteOffset minus (steps - 1) * amod1.
 //
 //   byteOffset  byte distance to apply per outer iteration
 //   n1Iters     number of inner-loop elements
 //   amod1       first-level address modifier, in bytes
 //   bpe         log2 of the bytes per element used to size one first-level step
 inline int32_t _cupvaSamplerAmod2(uint32_t byteOffset, uint32_t n1Iters, int32_t amod1, uint32_t bpe)
 {
     uint32_t const stepCount = CUPVA_SAMPLER_ALIGNED(n1Iters, (uint32_t)CUPVA_SAMPLER_AMOD1, bpe);
     // The subtraction is done in unsigned arithmetic before the conversion to signed.
     int32_t const stepsToUndo = (int32_t)(stepCount - 1U);
     return (int32_t)byteOffset - (stepsToUndo * amod1);
 }

 // Native builds declare cupvaSamplerSetupAddrRangeChecks() here; its definition is not in this header.
 // NOTE(review): presumably it refreshes the *AddrStart / *AddrEnd fields of the task — confirm.
 // In every other build mode the call expands to nothing, so its argument is not evaluated.
 #if CUPVA_BUILD_MODE == CUPVA_NATIVE
 void cupvaSamplerSetupAddrRangeChecks(CupvaSampler *task);
 #else
 #    define cupvaSamplerSetupAddrRangeChecks(x)
 #endif

 // Fills in `task` for a 1D lookup or 1D interpolation.
 //
 //   task     descriptor to program; fields not assigned below (y_offset, table_line_pitch,
 //            auto_idx_*, reserved_*) are left untouched
 //   input    1D table description
 //   indices  index stream description; fractionalHandling == INTERPOLATE selects
 //            SAMPLER_INTERP_1D, otherwise SAMPLER_LOOKUP_1D with the given rounding mode
 //   output   output buffer description
 //
 // The task is left unlinked (next_task == NULL). In native builds the address range checks
 // are refreshed at the end.
 inline void cupvaSamplerSetup(CupvaSampler *task, CupvaSamplerInput1D const *input,
                               CupvaSamplerIndices1D const *indices, CupvaSamplerOutput const *output)
 {
     CupvaSamplerMode const taskMode =
         (indices->fractionalHandling == SAMPLER_FRAC_HANDLING_INTERPOLATE) ? SAMPLER_INTERP_1D : SAMPLER_LOOKUP_1D;
     // Interpolation is expressed through the task mode, so the rounding field is 0 in that case.
     uint16_t const rndMode   = (indices->fractionalHandling == SAMPLER_FRAC_HANDLING_INTERPOLATE)
                                    ? (uint16_t)0U
                                    : (uint16_t)indices->fractionalHandling;
     uint16_t const taskN1    = indices->width;
     uint16_t const taskN2    = indices->height;
     // A line pitch of 0 means the index rows are densely packed.
     uint16_t const linePitch = (indices->linePitch == 0U) ? indices->width : indices->linePitch;

     // Configure task info
     // Enable duplicate detection and consolidation by default.
     // The enable bit must be the initial value: starting from the task mode alone (as this
     // function previously did) left duplicate handling disabled, unlike the 2D and tile overloads.
     task->task_info = CUPVA_SAMPLER_DUPLICATE_HANDLING_EN;
     task->task_info |= ((uint16_t)taskMode << CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT);
     task->task_info |= ((uint16_t)input->type << CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT);
     task->task_info |= ((uint16_t)indices->type << CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT);
     task->task_info |= ((uint16_t)input->outOfRangeMode << CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT);
     task->task_info |= (rndMode << CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_SHIFT);
     task->task_len_N1    = taskN1;
     task->task_len_N2    = taskN2;
     task->sentinel_value = input->outOfRangeVal;

     // Configure table data.
     task->table_addr        = input->data;
     task->table_addr_offset = input->inputAdv;
     task->x_int_limit       = input->length;
     task->y_int_limit       = 0U;

     // Configure index.
     uint16_t const idxBpp       = _cupvaSamplerGetIndexBppLog2(indices->type);
     uint32_t const indexN1Bytes = (uint32_t)linePitch << idxBpp; // index row pitch in bytes
     task->x_offset              = indices->offset;
     task->fraction_bits         = indices->fractionalBits;
     task->index_agen_fmt        = 0U;
     task->index_agen_addr       = indices->data;
     task->index_agen_lane_ofst  = 0U;
     task->index_agen_AMOD1      = (int32_t)CUPVA_SAMPLER_AMOD1;
     task->index_agen_AMOD2      = _cupvaSamplerAmod2(indexN1Bytes, taskN1, (int32_t)CUPVA_SAMPLER_AMOD1, idxBpp);

     // Configure output.
     // Output elements have the same size as the input table elements.
     uint16_t const outputBpp     = _cupvaSamplerGetInputBppLog2(input->type);
     uint32_t const outputLPBytes = (uint32_t)output->pitch << outputBpp; // output row pitch in bytes
     // Only TRANS_MODE_1 is supported
     uint8_t const transMode    = (output->transMode == TRANS_MODE_NONE) ? 0U : 1U;
     uint8_t const transHOffset = (output->transMode == TRANS_MODE_NONE) ? 0U : (1U << outputBpp);
     // Guard the unsigned subtraction below against wrap-around.
     CUPVA_VPU_ASSERT(outputLPBytes >= transHOffset);
     uint32_t const laneOfst32 = (outputLPBytes - (uint32_t)transHOffset) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
     CUPVA_VPU_ASSERT(laneOfst32 <= UINT16_MAX);
     uint16_t const laneOfst       = (uint16_t)laneOfst32;
     task->output_agen_fmt         = (transMode << CUPVA_SAMPLER_OUTPUT_TRANS_MODE_SHIFT);
     task->output_agen_lane_ofst   = (output->transMode == TRANS_MODE_NONE) ? 0U : laneOfst;
     task->output_agen_addr        = output->data;
     uint32_t const transposeAmod1 = ((uint32_t)CUPVA_SAMPLER_AMOD1 >> outputBpp) * outputLPBytes;
     task->output_agen_AMOD1 =
         (output->transMode == TRANS_MODE_NONE) ? (int32_t)CUPVA_SAMPLER_AMOD1 : (int32_t)transposeAmod1;
     uint32_t const outputN2Bytes = (output->transMode == TRANS_MODE_NONE) ? outputLPBytes : transHOffset;
     int32_t const outAmod2       = _cupvaSamplerAmod2(outputN2Bytes, taskN1, task->output_agen_AMOD1, outputBpp);
     // A zero output pitch forces AMOD2 to 0.
     task->output_agen_AMOD2      = (outputLPBytes == 0U) ? (int32_t)0 : outAmod2;

     task->next_task = NULL;

     cupvaSamplerSetupAddrRangeChecks(task);
 }

 // Fills in `task` for a 2D lookup or 2D interpolation driven by an explicit index stream.
 //
 //   task     descriptor to program; auto_idx_* and reserved_* fields are left untouched
 //   input    2D table description; a linePitch of 0 defaults to width unless
 //            NO_DEFAULT_LINE_PITCH is set in input->flags
 //   indices  index stream description; fractionalHandling == INTERPOLATE selects
 //            SAMPLER_INTERP_2D, otherwise SAMPLER_LOOKUP_2D with the given rounding mode
 //   output   output buffer description
 //
 // The task is left unlinked (next_task == NULL). In native builds the address range checks
 // are refreshed at the end.
 inline void cupvaSamplerSetup(CupvaSampler *task, CupvaSamplerInput2D const *input,
                               CupvaSamplerIndices2D const *indices, CupvaSamplerOutput const *output)
 {
     CupvaSamplerMode const taskMode =
         (indices->fractionalHandling == SAMPLER_FRAC_HANDLING_INTERPOLATE) ? SAMPLER_INTERP_2D : SAMPLER_LOOKUP_2D;
     // Interpolation is expressed through the task mode, so the rounding field is 0 in that case.
     uint16_t const rndMode          = (indices->fractionalHandling == SAMPLER_FRAC_HANDLING_INTERPOLATE)
                                           ? (uint16_t)0U
                                           : (uint16_t)indices->fractionalHandling;
     uint16_t const taskN1           = indices->width;
     uint16_t const taskN2           = indices->height;
     // A line pitch of 0 means the index rows are densely packed.
     uint16_t const indicesLinePitch = (indices->linePitch == 0U) ? indices->width : indices->linePitch;
     CupvaSamplerInput2DFlags const noDefaultLinePitchFlag = NO_DEFAULT_LINE_PITCH;
     // Default the table pitch to the width only when the caller has not opted out via the flag.
     uint32_t const inputLinePitch =
         ((input->linePitch == 0U) && ((input->flags & ((uint32_t)noDefaultLinePitchFlag)) == 0U)) ? input->width
                                                                                                   : input->linePitch;

     // Configure task info
     // Enable duplicate detection and consolidation by default.
     task->task_info = CUPVA_SAMPLER_DUPLICATE_HANDLING_EN;
     task->task_info |= (uint16_t)taskMode << CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT;
     task->task_info |= ((uint16_t)input->type << CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT);
     task->task_info |= ((uint16_t)indices->type << CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT);
     task->task_info |= ((uint16_t)input->outOfRangeMode << CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT);
     task->task_info |= ((uint16_t)rndMode << CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_SHIFT);
     task->task_len_N1    = taskN1;
     task->task_len_N2    = taskN2;
     task->sentinel_value = input->outOfRangeVal;

     // Configure table data.
     task->table_addr        = input->data;
     task->table_addr_offset = input->inputAdv;
     task->table_line_pitch  = inputLinePitch;
     task->x_int_limit       = input->width;
     task->y_int_limit       = input->height;

     // Configure index.
     uint16_t const idxBpp       = _cupvaSamplerGetIndexBppLog2(indices->type);
     // Both 2D modes selected above add 1 here, doubling the per-entry index size.
     uint16_t const idxBpe       = idxBpp + _cupvaSamplerGetIndexPpeLog2(taskMode);
     uint32_t const indexN1Bytes = (uint32_t)indicesLinePitch << idxBpe; // index row pitch in bytes
     task->x_offset              = indices->offsetX;
     task->y_offset              = indices->offsetY;
     task->fraction_bits         = indices->fractionalBits;
     task->index_agen_fmt        = (uint8_t)indices->interleaving;
     task->index_agen_addr       = indices->data;
     task->index_agen_lane_ofst  = 0U;
     task->index_agen_AMOD1      = (int32_t)CUPVA_SAMPLER_AMOD1;
     task->index_agen_AMOD2      = _cupvaSamplerAmod2(indexN1Bytes, taskN1, (int32_t)CUPVA_SAMPLER_AMOD1, idxBpe);

     // Configure output.
     // Output elements have the same size as the input table elements.
     uint16_t const outputBpp     = _cupvaSamplerGetInputBppLog2(input->type);
     uint32_t const outputLPBytes = (uint32_t)output->pitch << outputBpp; // output row pitch in bytes
     // Only TRANS_MODE_1 is supported
     uint8_t const transMode    = (output->transMode == TRANS_MODE_NONE) ? 0U : 1U;
     uint8_t const transHOffset = (output->transMode == TRANS_MODE_NONE) ? 0U : (1U << outputBpp);
     // Guard the unsigned subtraction below against wrap-around.
     CUPVA_VPU_ASSERT(outputLPBytes >= transHOffset);
     uint32_t const laneOfst32 = (outputLPBytes - (uint32_t)transHOffset) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
     CUPVA_VPU_ASSERT(laneOfst32 <= UINT16_MAX);
     uint16_t const laneOfst       = (uint16_t)laneOfst32;
     task->output_agen_fmt         = (transMode << CUPVA_SAMPLER_OUTPUT_TRANS_MODE_SHIFT);
     task->output_agen_lane_ofst   = (output->transMode == TRANS_MODE_NONE) ? 0U : laneOfst;
     task->output_agen_addr        = output->data;
     uint32_t const transposeAmod1 = ((uint32_t)CUPVA_SAMPLER_AMOD1 >> outputBpp) * outputLPBytes;
     task->output_agen_AMOD1 =
         (output->transMode == TRANS_MODE_NONE) ? (int32_t)CUPVA_SAMPLER_AMOD1 : (int32_t)transposeAmod1;
     uint32_t const outputN2Bytes = (output->transMode == TRANS_MODE_NONE) ? outputLPBytes : transHOffset;
     int32_t const outAmod2       = _cupvaSamplerAmod2(outputN2Bytes, taskN1, task->output_agen_AMOD1, outputBpp);
     // A zero output pitch forces AMOD2 to 0.
     task->output_agen_AMOD2      = (outputLPBytes == 0U) ? (int32_t)0 : outAmod2;

     task->next_task = NULL;

     cupvaSamplerSetupAddrRangeChecks(task);
 }

 // Fills in `task` for auto-index 2D interpolation (SAMPLER_INTERP_2D_AUTO_IDX) over a list of tiles.
 //
 //   task    descriptor to program; index_agen_fmt, index_agen_lane_ofst, index_agen_AMOD2 and
 //           reserved_* fields are left untouched
 //   input   2D table description; a linePitch of 0 defaults to width
 //   tiles   tile list description; each round covers width * height pixels, with count rounds
 //   output  output buffer description
 //
 // The index type is fixed to SAMPLER_INDEX_TYPE_U32 and no rounding mode is written.
 // The task is left unlinked (next_task == NULL). In native builds the address range checks
 // are refreshed at the end.
 inline void cupvaSamplerSetup(CupvaSampler *task, CupvaSamplerInput2D const *input, CupvaSamplerTiles const *tiles,
                               CupvaSamplerOutput const *output)
 {
     CupvaSamplerMode const taskMode = SAMPLER_INTERP_2D_AUTO_IDX;
     // uint8_t * uint8_t is at most 255 * 255 = 65025, which fits in uint16_t.
     uint16_t const taskN1           = (uint16_t)(tiles->width) * tiles->height;
     uint16_t const taskN2           = tiles->count;
     // NOTE(review): unlike the 2D-indices overload, this ignores NO_DEFAULT_LINE_PITCH in
     // input->flags, so a line pitch of 0 is always replaced by the width — confirm this is intended.
     uint32_t const inputLinePitch   = (input->linePitch == 0U) ? input->width : input->linePitch;

     // Configure task info
     // Enable duplicate detection and consolidation by default.
     task->task_info = CUPVA_SAMPLER_DUPLICATE_HANDLING_EN;
     task->task_info |= (uint16_t)taskMode << CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT;
     task->task_info |= (uint16_t)input->type << CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT;
     task->task_info |= ((uint16_t)SAMPLER_INDEX_TYPE_U32 << CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT);
     task->task_info |= (uint16_t)input->outOfRangeMode << CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT;
     task->task_len_N1    = taskN1;
     task->task_len_N2    = taskN2;
     task->sentinel_value = input->outOfRangeVal;

     // Configure table data.
     task->table_addr        = input->data;
     task->table_addr_offset = input->inputAdv;
     task->table_line_pitch  = inputLinePitch;
     task->x_int_limit       = input->width;
     task->y_int_limit       = input->height;

     // Configure index.
     task->x_offset              = tiles->offsetX;
     task->y_offset              = tiles->offsetY;
     task->fraction_bits         = tiles->fractionalBits;
     task->auto_idx_patch_width  = tiles->width;
     task->auto_idx_patch_height = tiles->height;
     task->index_agen_addr       = tiles->data;
     task->index_agen_AMOD1      = (int32_t)CUPVA_SAMPLER_AMOD1;
     // Traversal mode goes in unshifted; the tile type is shifted to the MODE bit.
     task->auto_idx_fmt = (uint8_t)tiles->tileTraversalMode | ((uint8_t)tiles->type << CUPVA_SAMPLER_AUTOIDX_MODE_SHIFT);

     // Configure output.
     // Output elements have the same size as the input table elements.
     uint16_t const outputBpp     = _cupvaSamplerGetInputBppLog2(input->type);
     uint32_t const outputLPBytes = (uint32_t)output->pitch << outputBpp; // output row pitch in bytes
     // Only TRANS_MODE_1 is supported
     uint8_t const transMode    = (output->transMode == TRANS_MODE_NONE) ? 0U : 1U;
     uint8_t const transHOffset = (output->transMode == TRANS_MODE_NONE) ? 0U : (1U << outputBpp);
     // Guard the unsigned subtraction below against wrap-around.
     CUPVA_VPU_ASSERT(outputLPBytes >= transHOffset);
     uint32_t const laneOfst32 = (outputLPBytes - (uint32_t)transHOffset) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
     CUPVA_VPU_ASSERT(laneOfst32 <= UINT16_MAX);
     uint16_t const laneOfst       = (uint16_t)laneOfst32;
     task->output_agen_fmt         = (transMode << CUPVA_SAMPLER_OUTPUT_TRANS_MODE_SHIFT);
     task->output_agen_lane_ofst   = (output->transMode == TRANS_MODE_NONE) ? 0U : laneOfst;
     task->output_agen_addr        = output->data;
     uint32_t const transposeAmod1 = ((uint32_t)CUPVA_SAMPLER_AMOD1 >> outputBpp) * outputLPBytes;
     task->output_agen_AMOD1 =
         (output->transMode == TRANS_MODE_NONE) ? (int32_t)CUPVA_SAMPLER_AMOD1 : (int32_t)transposeAmod1;
     uint32_t const outputN2Bytes = (output->transMode == TRANS_MODE_NONE) ? outputLPBytes : transHOffset;
     int32_t const outAmod2       = _cupvaSamplerAmod2(outputN2Bytes, taskN1, task->output_agen_AMOD1, outputBpp);
     // A zero output pitch forces AMOD2 to 0.
     task->output_agen_AMOD2      = (outputLPBytes == 0U) ? (int32_t)0 : outAmod2;

     task->next_task = NULL;

     cupvaSamplerSetupAddrRangeChecks(task);
 }

 // Starts the sampler on the task chain beginning at `head`.
 //
 // Non-native builds: fence memory, store the head pointer to CUPVA_SAMPLER_CFG_ADDR, wait until
 // the DONE signal is low, then raise START. With CUPVA_PROFILING_FLAG == 1 the wait is a busy
 // poll of gpi_rd() instead of wfe_gpi().
 // Native builds: forward the chain to start_dlut().
 //
 // This function does not wait for completion; pair it with cupvaSamplerWait().
 inline void cupvaSamplerStart(CupvaSampler const *head)
 {
 #if CUPVA_BUILD_MODE != CUPVA_NATIVE
     // NOTE(review): presumably ensures the descriptor writes are visible before the pointer is published.
     chess_memory_fence();
     cp_store((intptr_t)head, CUPVA_SAMPLER_CFG_ADDR);
 #    if defined(CUPVA_PROFILING_FLAG) && CUPVA_PROFILING_FLAG == 1
     while ((gpi_rd() & CUPVA_SAMPLER_GPIO) != 0); // wait for DONE to be deasserted
 #    else
     wfe_gpi(CUPVA_SAMPLER_GPIO, 0); // wait for DONE to be deasserted
 #    endif
     gpo_setl(CUPVA_SAMPLER_GPIO); // assert START
 #else
     start_dlut((const dlut_param_cfg *)head);
 #endif
 }

 // Blocks until the sampler signals DONE, then lowers START.
 //
 // Non-native builds: wait for the DONE signal to go high (busy poll of gpi_rd() when
 // CUPVA_PROFILING_FLAG == 1, wfe_gpi() otherwise), then clear START.
 // Native builds: delegate to check_dlut_done_and_clear_start().
 inline void cupvaSamplerWait(void)
 {
 #if CUPVA_BUILD_MODE != CUPVA_NATIVE
 #    if defined(CUPVA_PROFILING_FLAG) && CUPVA_PROFILING_FLAG == 1
     while ((gpi_rd() & CUPVA_SAMPLER_GPIO) == 0); // wait for DONE to be asserted
 #    else
     wfe_gpi(CUPVA_SAMPLER_GPIO, CUPVA_SAMPLER_GPIO); // wait for DONE to be asserted
 #    endif
     gpo_clrl(CUPVA_SAMPLER_GPIO); // deassert START
 #else
     check_dlut_done_and_clear_start();
 #endif
 }

 // Re-points an already configured task at new buffers without touching any other field.
 //
 //   task     descriptor to update
 //   input    new table base address
 //   indices  new index (or tile) base address
 //   output   new output base address
 //
 // In native builds the address range checks are refreshed afterwards.
 inline void cupvaSamplerUpdateAddrs(CupvaSampler *task, void const *input, void const *indices, void *output)
 {
     // The three stores are independent of each other.
     task->output_agen_addr = output;
     task->index_agen_addr  = indices;
     task->table_addr       = input;

     cupvaSamplerSetupAddrRangeChecks(task);
 }

 // Changes the input table geometry of an already configured task.
 //
 //   task       descriptor to update
 //   width      new x_int_limit
 //   height     new y_int_limit
 //   linePitch  new table_line_pitch, stored as given (no default is substituted for 0)
 //
 // In native builds the address range checks are refreshed afterwards.
 inline void cupvaSamplerUpdateInputAttributes(CupvaSampler *task, uint32_t const width, uint32_t const height,
                                               uint32_t const linePitch)
 {
     // The three stores are independent of each other.
     task->table_line_pitch = linePitch;
     task->y_int_limit      = height;
     task->x_int_limit      = width;

     cupvaSamplerSetupAddrRangeChecks(task);
 }

 // Chains `next` after `task` by storing it in task->next_task.
 // Passing NULL unlinks the task; the setup functions leave next_task as NULL.
 inline void cupvaSamplerLink(CupvaSampler *task, CupvaSampler const *next)
 {
     task->next_task = next;
 }

 #endif