sampler.h

Fully qualified name: src/device/vpu_runtime/include/cupva_device/sampler.h

File members: src/device/vpu_runtime/include/cupva_device/sampler.h

/*
 * Copyright (c) 2023 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA Corporation is strictly prohibited.
 */

#ifndef CUPVA_SAMPLER_H
#define CUPVA_SAMPLER_H

#include "device_core.h"

#include <cupva_types.h>
#include <stddef.h>
#include <stdint.h>

#if CUPVA_BUILD_MODE == CUPVA_NATIVE
#    include <dlut_defs.h>
#endif

// GPIO bit mask used for the sampler START/DONE handshake in cupvaSamplerStart() and cupvaSamplerWait().
#define CUPVA_SAMPLER_GPIO 1024
// Destination passed to cp_store() when cupvaSamplerStart() publishes the head task pointer.
#define CUPVA_SAMPLER_CFG_ADDR 0x800
// First address-modifier step programmed into the index/output address generators:
// one VMEM superbank width, in bytes.
#define CUPVA_SAMPLER_AMOD1 VMEM_SUPERBANK_WIDTH_IN_BYTES
// Value OR-ed into CupvaSampler::task_info to enable duplicate handling (bit 0, dlut_duplicate_hndl).
#define CUPVA_SAMPLER_DUPLICATE_HANDLING_EN 1U
// Bit-field ranges written as "msb : lsb". In this header they are only consumed through
// CUPVA_FIELD_SHIFT (defined elsewhere), which presumably yields the lsb position -- TODO confirm
// against its definition.
#define CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_RANGE 1U : 1U
#define CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_RANGE 3U : 2U
#define CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_RANGE 6U : 4U
#define CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_RANGE 9U : 8U
#define CUPVA_SAMPLER_TASKINFO_TASK_MODE_RANGE 14U : 12U
// NOTE(review): "RANGER" looks like a typo for "RANGE". The name is kept because it is public and
// is referenced by CUPVA_SAMPLER_AUTOIDX_MODE_SHIFT below.
#define CUPVA_SAMPLER_AUTOIDX_MODE_RANGER 4U : 4U
// No *_SHIFT companion exists for this range; the traversal direction is OR-ed in unshifted (bit 0).
#define CUPVA_SAMPLER_AUTOIDX_TRAVERSAL_DIR_RANGE 0U : 0U
#define CUPVA_SAMPLER_OUTPUT_TRANS_MODE_RANGE 6U : 4U

// Shift amounts derived from the ranges above; used when packing task_info, auto_idx_fmt and
// output_agen_fmt in cupvaSamplerSetup().
#define CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_RANGE)
#define CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT \
    CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_RANGE)
#define CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_TASK_MODE_RANGE)
#define CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_RANGE)
#define CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_RANGE)
#define CUPVA_SAMPLER_AUTOIDX_MODE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_AUTOIDX_MODE_RANGER)
#define CUPVA_SAMPLER_OUTPUT_TRANS_MODE_SHIFT CUPVA_FIELD_SHIFT(CUPVA_SAMPLER_OUTPUT_TRANS_MODE_RANGE)

// Rounded-up quotient of _a by (_b >> _c). _b and _c are each expanded twice, so pass
// side-effect-free expressions. (_b >> _c) must be non-zero.
#define CUPVA_SAMPLER_ALIGNED(_a, _b, _c) (((_a) + ((_b) >> (_c)) - 1U) / ((_b) >> (_c)))

// Sampler task descriptor.
//
// cupvaSamplerSetup() fills it, cupvaSamplerLink() chains descriptors through next_task, and
// cupvaSamplerStart() hands the address of the head descriptor to cp_store() (or to start_dlut()
// in native builds, where it is reinterpreted as a dlut_param_cfg).
//
// NOTE(review): field order, widths and the reserved_* members presumably mirror a fixed
// hardware/simulator layout -- do not reorder or resize without confirming.
struct CupvaSampler
{
    uint8_t fraction_bits; // 0 ~ 16.
    uint8_t reserved_0;
    // containing
    //   bits 14:12  dlut_task_mode
    //   bits  9: 8  dlut_index_type
    //   bits  6: 4  dlut_entry_type
    //   bits  3: 2  dlut_out_of_range_hndl
    //   bit      1  dlut_idx_round_mode
    //   bit      0  dlut_duplicate_hndl
    uint16_t task_info;
    uint32_t x_int_limit; // U18, closed interval [0, x_int_limit << fraction_bits]
    uint32_t y_int_limit; // U18, closed interval [0, y_int_limit << fraction_bits]
    // Offsets copied from the index/tile description by cupvaSamplerSetup().
    int32_t x_offset;
    int32_t y_offset;
    uint16_t task_len_N1; // number of pixels per round
    uint16_t task_len_N2; // number of rounds
    // Copied from the input's outOfRangeVal by cupvaSamplerSetup().
    uint32_t sentinel_value;
    const void *table_addr;      // address of table, 64 bytes aligned
    ptrdiff_t table_addr_offset; // address update between rounds, 64 bytes aligned
    uint32_t table_line_pitch;   // in pixels, for conflict-free-2D mode it's 4K+2, otherwise any number

    // Tile (patch) dimensions; only written by the CupvaSamplerTiles overload of cupvaSamplerSetup().
    uint8_t auto_idx_patch_height;
    uint8_t auto_idx_patch_width;
    // containing
    //      bit 0: TRAVERSAL_DIR
    //      bit 4: MODE (step scale needed or not)
    uint8_t auto_idx_fmt;
    uint8_t reserved_1;

    uint16_t index_agen_lane_ofst; // U12, for transposed access
    uint8_t reserved_2;
    // containing
    //   bits  6: 4  dlut_agen_transp_mode
    //   bit      0  dlut_xy_idx_intrlv_fmt
    uint8_t index_agen_fmt;

    uint16_t output_agen_lane_ofst; // U12, for transposed access
    uint8_t reserved_3;
    // containing
    //   bits  6: 4  dlut_agen_transp_mode
    uint8_t output_agen_fmt;

    uint32_t reserved_4[3];

    // Index address generator: base address plus two address modifiers.
    // AMOD1 is set to CUPVA_SAMPLER_AMOD1; AMOD2 is computed by _cupvaSamplerAmod2().
    const void *index_agen_addr;
    ptrdiff_t index_agen_AMOD1;
    ptrdiff_t index_agen_AMOD2;

    // Output address generator: base address plus two address modifiers.
    void *output_agen_addr;
    ptrdiff_t output_agen_AMOD1;
    ptrdiff_t output_agen_AMOD2;

    uint32_t reserved_5;
    // Next descriptor in the chain; NULL after cupvaSamplerSetup(), set by cupvaSamplerLink().
    const CupvaSampler *next_task;

#if CUPVA_BUILD_MODE == CUPVA_NATIVE
    // Address range check in native mode
    // [begin, end)
    const void *inputAddrStart;
    const void *inputAddrEnd;
    const void *indexAddrStart;
    const void *indexAddrEnd;
    const void *outputAddrStart;
    const void *outputAddrEnd;
    const void *transformationMatrixAddrStart;
    const void *transformationMatrixAddrEnd;
#endif

};

// Task modes. cupvaSamplerSetup() shifts the value into the task-mode field of
// CupvaSampler::task_info (bits 14:12 per the layout comment on that member).
// The overloads in this header only select LOOKUP_1D / INTERP_1D, LOOKUP_2D / INTERP_2D and
// INTERP_2D_AUTO_IDX; CONFLICT_FREE_2D_INTERP and TABLE_REFORMAT are never produced by them.
enum CupvaSamplerMode
{
    SAMPLER_LOOKUP_1D = 0,
    SAMPLER_LOOKUP_2D = 1,
    SAMPLER_INTERP_1D = 2,
    SAMPLER_INTERP_2D = 3,
    SAMPLER_CONFLICT_FREE_2D_INTERP = 4,
    SAMPLER_TABLE_REFORMAT = 5,
    SAMPLER_INTERP_2D_AUTO_IDX = 6 // selected by the CupvaSamplerTiles overload
};

// Element type of the lookup table. The value is shifted into the entry-type field of task_info,
// and _cupvaSamplerGetInputBppLog2() maps it to log2 of the element size (8-bit -> 0, 16-bit -> 1,
// 32-bit -> 2), which is also used to size the output. Value 3 is not assigned.
enum CupvaSamplerInputType
{
    SAMPLER_INPUT_TYPE_S8 = 0,
    SAMPLER_INPUT_TYPE_S16 = 1,
    SAMPLER_INPUT_TYPE_S32 = 2,
    SAMPLER_INPUT_TYPE_U8 = 4,
    SAMPLER_INPUT_TYPE_U16 = 5,
    SAMPLER_INPUT_TYPE_U32 = 6
};

// Element type of the index buffer. The value is shifted into the index-type field of task_info;
// _cupvaSamplerGetIndexBppLog2() maps U16 -> 1 and U32 -> 2 (log2 of bytes per index component).
enum CupvaSamplerIndexType
{
    SAMPLER_INDEX_TYPE_U16 = 1,
    SAMPLER_INDEX_TYPE_U32 = 2,
};

// How the fractional part of an index is treated.
// TRUNCATE and ROUND are written verbatim into the index rounding field of task_info.
// INTERPOLATE is never written there: cupvaSamplerSetup() writes 0 into the rounding field and
// switches the task mode to SAMPLER_INTERP_1D / SAMPLER_INTERP_2D instead.
enum CupvaSamplerIndexRoundingMode
{
    SAMPLER_FRAC_HANDLING_TRUNCATE = 0,
    SAMPLER_FRAC_HANDLING_ROUND = 1,
    SAMPLER_FRAC_HANDLING_INTERPOLATE = 2,
};

// Layout of X/Y components in a 2D index buffer. The value is stored unshifted in
// CupvaSampler::index_agen_fmt (bit 0, dlut_xy_idx_intrlv_fmt).
// NOTE(review): the names suggest per-element vs. 32-byte-group interleaving -- confirm against
// the hardware documentation.
enum CupvaSamplerIndexInterleavingMode
{
    SAMPLER_INTERLEAVING_ELEMENTS = 0,
    SAMPLER_INTERLEAVING_32B = 1,
};

// Tile description kind for auto-index tasks. The value is shifted by
// CUPVA_SAMPLER_AUTOIDX_MODE_SHIFT into CupvaSampler::auto_idx_fmt
// (bit 4, "MODE (step scale needed or not)").
enum CupvaSamplerTileType
{
    SAMPLER_TILE_TRANS_ONLY = 0,
    SAMPLER_TILE_TRANS_SCALE = 1,
};

// Traversal direction within a tile for auto-index tasks. The value is OR-ed unshifted into
// CupvaSampler::auto_idx_fmt (bit 0, TRAVERSAL_DIR).
enum CupvaSamplerTileTraversalMode
{
    SAMPLER_TRAVERSAL_STANDARD = 0,
    SAMPLER_TRAVERSAL_TRANSPOSED = 1,
};

// Policy for indices that fall outside the table. The value is shifted into the out-of-range
// field of task_info; the accompanying outOfRangeVal is copied to CupvaSampler::sentinel_value.
// NOTE(review): presumably CONSTANT substitutes the sentinel and PREDICATE_OFF suppresses the
// output write -- confirm against the hardware documentation. Value 0 is not assigned.
enum CupvaSamplerTaskOutOfRangeHandlingMode
{
    SAMPLER_OUT_OF_RANGE_CONSTANT = 1,
    SAMPLER_OUT_OF_RANGE_PREDICATE_OFF = 2,
};

// Bit flags for CupvaSamplerInput2D::flags.
enum CupvaSamplerInput2DFlags
{
    // When set, the CupvaSamplerIndices2D overload of cupvaSamplerSetup() keeps a linePitch of 0
    // as-is instead of replacing it with the input width.
    NO_DEFAULT_LINE_PITCH = 1 << 0,
};

// Description of a 1D lookup table, consumed by the 1D overload of cupvaSamplerSetup().
struct CupvaSamplerInput1D
{
    void const *data;                                      // table base address -> table_addr
    CupvaSamplerInputType type;                            // entry type; also sizes the output elements
    uint32_t length;                                       // copied to x_int_limit
    ptrdiff_t inputAdv;                                    // copied to table_addr_offset
    CupvaSamplerTaskOutOfRangeHandlingMode outOfRangeMode; // packed into task_info
    uint32_t outOfRangeVal;                                // copied to sentinel_value
};

// Description of a 2D lookup table, consumed by the CupvaSamplerIndices2D and CupvaSamplerTiles
// overloads of cupvaSamplerSetup().
struct CupvaSamplerInput2D
{
    void const *data;           // table base address -> table_addr
    CupvaSamplerInputType type; // entry type; also sizes the output elements
    uint32_t width;             // copied to x_int_limit
    uint32_t height;            // copied to y_int_limit
    // Copied to table_line_pitch. 0 selects `width` as the pitch; the CupvaSamplerIndices2D
    // overload keeps 0 when NO_DEFAULT_LINE_PITCH is set in `flags`.
    uint32_t linePitch;
    ptrdiff_t inputAdv;                                    // copied to table_addr_offset
    CupvaSamplerTaskOutOfRangeHandlingMode outOfRangeMode; // packed into task_info
    uint32_t outOfRangeVal;                                // copied to sentinel_value
    uint32_t flags;                                        // bitwise OR of CupvaSamplerInput2DFlags
};

// Description of the index buffer for a 1D task.
struct CupvaSamplerIndices1D
{
    void const *data;           // index buffer address -> index_agen_addr
    CupvaSamplerIndexType type; // index element type
    uint16_t width;             // copied to task_len_N1
    uint16_t height;            // copied to task_len_N2
    uint16_t linePitch;         // in index elements; 0 means "same as width"
    uint8_t fractionalBits;     // copied to fraction_bits
    CupvaSamplerIndexRoundingMode fractionalHandling; // INTERPOLATE selects SAMPLER_INTERP_1D
    int32_t offset;                                   // copied to x_offset
};

// Description of the index buffer for a 2D task.
struct CupvaSamplerIndices2D
{
    void const *data;           // index buffer address -> index_agen_addr
    CupvaSamplerIndexType type; // type of one index component
    uint16_t width;             // copied to task_len_N1
    uint16_t height;            // copied to task_len_N2
    // 0 means "same as width". Scaled to bytes with the component size plus one extra doubling
    // for the 2D modes (see _cupvaSamplerGetIndexPpeLog2), i.e. counted in X/Y pairs.
    uint16_t linePitch;
    uint8_t fractionalBits;                           // copied to fraction_bits
    CupvaSamplerIndexRoundingMode fractionalHandling; // INTERPOLATE selects SAMPLER_INTERP_2D
    int32_t offsetX;                                  // copied to x_offset
    int32_t offsetY;                                  // copied to y_offset
    CupvaSamplerIndexInterleavingMode interleaving;   // stored in index_agen_fmt
};

// Description of the tiles for an auto-index task (SAMPLER_INTERP_2D_AUTO_IDX).
struct CupvaSamplerTiles
{
    void const *data;          // tile parameter buffer address -> index_agen_addr
    CupvaSamplerTileType type; // packed into auto_idx_fmt
    uint8_t width;             // copied to auto_idx_patch_width
    uint8_t height;            // copied to auto_idx_patch_height; width * height becomes task_len_N1
    uint16_t count;            // copied to task_len_N2
    uint8_t fractionalBits;    // copied to fraction_bits
    int32_t offsetX;           // copied to x_offset
    int32_t offsetY;           // copied to y_offset
    CupvaSamplerTileTraversalMode tileTraversalMode; // packed into auto_idx_fmt
};

// Description of the output buffer of a sampler task.
struct CupvaSamplerOutput
{
    void *data;     // output base address -> output_agen_addr
    uint16_t pitch; // in output elements; scaled to bytes with the input element size
    // TRANS_MODE_NONE selects linear output; cupvaSamplerSetup() programs every other value as
    // transposition mode 1.
    TranspositionMode transMode;
};

// Returns log2 of the size in bytes of one table entry of the given type:
// 0 for the 8-bit types, 1 for the 16-bit types and 2 for the 32-bit types.
// Any value that is not a listed enumerator also yields 0.
inline uint16_t _cupvaSamplerGetInputBppLog2(CupvaSamplerInputType const type)
{
    uint16_t log2Bytes = 0U;

    if ((type == SAMPLER_INPUT_TYPE_S32) || (type == SAMPLER_INPUT_TYPE_U32))
    {
        log2Bytes = 2U;
    }
    else if ((type == SAMPLER_INPUT_TYPE_S16) || (type == SAMPLER_INPUT_TYPE_U16))
    {
        log2Bytes = 1U;
    }
    else
    {
        // S8, U8 and unrecognised values: keep the one-byte default.
    }

    return log2Bytes;
}

// Returns log2 of the size in bytes of one index component: 1 for U16, 2 for U32.
// Any other value yields 0.
inline uint16_t _cupvaSamplerGetIndexBppLog2(CupvaSamplerIndexType const type)
{
    uint16_t log2Bytes = 0U;

    if (type == SAMPLER_INDEX_TYPE_U32)
    {
        log2Bytes = 2U;
    }
    else if (type == SAMPLER_INDEX_TYPE_U16)
    {
        log2Bytes = 1U;
    }
    else
    {
        // Unrecognised index type: keep the default of 0.
    }

    return log2Bytes;
}

// Returns log2 of the number of index components per lookup for the given task mode:
// 1 for the 2D lookup/interpolation modes (including conflict-free 2D), 0 for every other mode.
inline uint16_t _cupvaSamplerGetIndexPpeLog2(CupvaSamplerMode const mode)
{
    uint16_t log2Components = 0U;

    if ((mode == SAMPLER_LOOKUP_2D) || (mode == SAMPLER_INTERP_2D) || (mode == SAMPLER_CONFLICT_FREE_2D_INTERP))
    {
        log2Components = 1U;
    }

    return log2Components;
}

// Computes the second address modifier of an address generator.
//
// byteOffset - desired address advance, in bytes, from the start of one round to the next.
// n1Iters    - number of elements processed per round.
// amod1      - first address modifier, applied between consecutive steps within a round.
// bpe        - log2 of the element size in bytes.
//
// A round takes ceil(n1Iters / (CUPVA_SAMPLER_AMOD1 >> bpe)) steps. The result is byteOffset minus
// the (steps - 1) * amod1 bytes already accumulated by the intra-round modifier.
inline int32_t _cupvaSamplerAmod2(uint32_t byteOffset, uint32_t n1Iters, int32_t amod1, uint32_t bpe)
{
    uint32_t const elemsPerStep = (uint32_t)CUPVA_SAMPLER_AMOD1 >> bpe;
    uint32_t const stepsPerRound = (n1Iters + elemsPerStep - 1U) / elemsPerStep;
    int32_t const stepsAfterFirst = (int32_t)(stepsPerRound - 1U);

    return (int32_t)byteOffset - (stepsAfterFirst * amod1);
}

// Hook invoked at the end of every setup/update function in this header.
// Native builds declare an out-of-line routine (defined elsewhere); judging by the native-only
// [begin, end) members of CupvaSampler it populates those address ranges -- TODO confirm.
// In every other build mode the call expands to nothing and its argument is not evaluated.
#if CUPVA_BUILD_MODE == CUPVA_NATIVE
void cupvaSamplerSetupAddrRangeChecks(CupvaSampler *task);
#else
#    define cupvaSamplerSetupAddrRangeChecks(x)
#endif

// Configures `task` as a 1D sampler task: SAMPLER_INTERP_1D when indices->fractionalHandling is
// SAMPLER_FRAC_HANDLING_INTERPOLATE, SAMPLER_LOOKUP_1D (with truncate/round) otherwise.
//
// task    - descriptor to fill. Only the members assigned below are written; y_offset,
//           table_line_pitch, the auto_idx_* members and the reserved members are left untouched.
// input   - lookup table description (address, entry type, length, out-of-range policy).
// indices - index buffer description. width/height become the N1/N2 task lengths; a linePitch
//           of 0 means "same as width".
// output  - destination buffer; output->pitch is counted in elements of input->type.
//
// Duplicate handling is enabled, as in the 2D overloads. The task is left unlinked
// (next_task == NULL); use cupvaSamplerLink() to chain it.
inline void cupvaSamplerSetup(CupvaSampler *task, CupvaSamplerInput1D const *input,
                              CupvaSamplerIndices1D const *indices, CupvaSamplerOutput const *output)
{
    CupvaSamplerMode const taskMode =
        (indices->fractionalHandling == SAMPLER_FRAC_HANDLING_INTERPOLATE) ? SAMPLER_INTERP_1D : SAMPLER_LOOKUP_1D;
    // Interpolation is expressed through the task mode, so the rounding field stays 0 in that case.
    uint16_t const rndMode   = (indices->fractionalHandling == SAMPLER_FRAC_HANDLING_INTERPOLATE)
                                   ? (uint16_t)0U
                                   : (uint16_t)indices->fractionalHandling;
    uint16_t const taskN1    = indices->width;
    uint16_t const taskN2    = indices->height;
    uint16_t const linePitch = (indices->linePitch == 0U) ? indices->width : indices->linePitch;

    // Configure task info
    // Enable duplicate detection and consolidation by default.
    task->task_info = CUPVA_SAMPLER_DUPLICATE_HANDLING_EN;
    task->task_info |= ((uint16_t)taskMode << CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT);
    task->task_info |= ((uint16_t)input->type << CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT);
    task->task_info |= ((uint16_t)indices->type << CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT);
    task->task_info |= ((uint16_t)input->outOfRangeMode << CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT);
    task->task_info |= (rndMode << CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_SHIFT);
    task->task_len_N1    = taskN1;
    task->task_len_N2    = taskN2;
    task->sentinel_value = input->outOfRangeVal;

    // Configure table data.
    task->table_addr        = input->data;
    task->table_addr_offset = input->inputAdv;
    task->x_int_limit       = input->length;
    task->y_int_limit       = 0U;

    // Configure index.
    uint16_t const idxBpp       = _cupvaSamplerGetIndexBppLog2(indices->type);
    uint32_t const indexN1Bytes = (uint32_t)linePitch << idxBpp; // bytes between consecutive index rows
    task->x_offset              = indices->offset;
    task->fraction_bits         = indices->fractionalBits;
    task->index_agen_fmt        = 0U;
    task->index_agen_addr       = indices->data;
    task->index_agen_lane_ofst  = 0U;
    task->index_agen_AMOD1      = (int32_t)CUPVA_SAMPLER_AMOD1;
    task->index_agen_AMOD2      = _cupvaSamplerAmod2(indexN1Bytes, taskN1, (int32_t)CUPVA_SAMPLER_AMOD1, idxBpp);

    // Configure output. Output elements have the same size as table entries.
    uint16_t const outputBpp     = _cupvaSamplerGetInputBppLog2(input->type);
    uint32_t const outputLPBytes = (uint32_t)output->pitch << outputBpp;
    // Only TRANS_MODE_1 is supported
    uint8_t const transMode    = (output->transMode == TRANS_MODE_NONE) ? 0U : 1U;
    // In transposed mode consecutive rounds are one element apart in the output.
    uint8_t const transHOffset = (output->transMode == TRANS_MODE_NONE) ? 0U : (1U << outputBpp);
    CUPVA_VPU_ASSERT(outputLPBytes >= transHOffset);
    uint32_t const laneOfst32 = (outputLPBytes - (uint32_t)transHOffset) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
    CUPVA_VPU_ASSERT(laneOfst32 <= UINT16_MAX);
    uint16_t const laneOfst       = (uint16_t)laneOfst32;
    task->output_agen_fmt         = (transMode << CUPVA_SAMPLER_OUTPUT_TRANS_MODE_SHIFT);
    task->output_agen_lane_ofst   = (output->transMode == TRANS_MODE_NONE) ? 0U : laneOfst;
    task->output_agen_addr        = output->data;
    uint32_t const transposeAmod1 = ((uint32_t)CUPVA_SAMPLER_AMOD1 >> outputBpp) * outputLPBytes;
    task->output_agen_AMOD1 =
        (output->transMode == TRANS_MODE_NONE) ? (int32_t)CUPVA_SAMPLER_AMOD1 : (int32_t)transposeAmod1;
    uint32_t const outputN2Bytes = (output->transMode == TRANS_MODE_NONE) ? outputLPBytes : transHOffset;
    int32_t const outAmod2       = _cupvaSamplerAmod2(outputN2Bytes, taskN1, task->output_agen_AMOD1, outputBpp);
    // A zero pitch means the output does not advance between rounds.
    task->output_agen_AMOD2      = (outputLPBytes == 0U) ? (int32_t)0 : outAmod2;

    task->next_task = NULL;

    cupvaSamplerSetupAddrRangeChecks(task);
}

// Configures `task` as a 2D sampler task driven by an explicit index buffer:
// SAMPLER_INTERP_2D when indices->fractionalHandling is SAMPLER_FRAC_HANDLING_INTERPOLATE,
// SAMPLER_LOOKUP_2D (with truncate/round) otherwise.
//
// task    - descriptor to fill. The auto_idx_* and reserved members are not written.
// input   - 2D table description. A linePitch of 0 is replaced by the width unless
//           NO_DEFAULT_LINE_PITCH is set in input->flags.
// indices - index buffer description. width/height become the N1/N2 task lengths; a linePitch
//           of 0 means "same as width".
// output  - destination buffer; output->pitch is counted in elements of input->type.
//
// Duplicate handling is enabled and the task is left unlinked (next_task == NULL).
inline void cupvaSamplerSetup(CupvaSampler *task, CupvaSamplerInput2D const *input,
                              CupvaSamplerIndices2D const *indices, CupvaSamplerOutput const *output)
{
    // Interpolation is a task mode of its own; only truncate/round reach the rounding field.
    CupvaSamplerMode mode = SAMPLER_LOOKUP_2D;
    uint16_t roundBits    = (uint16_t)indices->fractionalHandling;
    if (indices->fractionalHandling == SAMPLER_FRAC_HANDLING_INTERPOLATE)
    {
        mode      = SAMPLER_INTERP_2D;
        roundBits = (uint16_t)0U;
    }

    uint16_t const n1 = indices->width;
    uint16_t const n2 = indices->height;

    // Resolve defaulted pitches.
    uint16_t idxPitch = indices->linePitch;
    if (idxPitch == 0U)
    {
        idxPitch = indices->width;
    }

    uint32_t const keepZeroPitch = input->flags & (uint32_t)NO_DEFAULT_LINE_PITCH;
    uint32_t tablePitch          = input->linePitch;
    if ((tablePitch == 0U) && (keepZeroPitch == 0U))
    {
        tablePitch = input->width;
    }

    // Task info word: duplicate handling is on by default.
    task->task_info = (uint16_t)(CUPVA_SAMPLER_DUPLICATE_HANDLING_EN |
                                 ((uint16_t)mode << CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT) |
                                 ((uint16_t)input->type << CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT) |
                                 ((uint16_t)indices->type << CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT) |
                                 ((uint16_t)input->outOfRangeMode << CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT) |
                                 ((uint16_t)roundBits << CUPVA_SAMPLER_TASKINFO_IDX_RND_MODE_SHIFT));
    task->task_len_N1    = n1;
    task->task_len_N2    = n2;
    task->sentinel_value = input->outOfRangeVal;

    // Table.
    task->x_int_limit       = input->width;
    task->y_int_limit       = input->height;
    task->table_line_pitch  = tablePitch;
    task->table_addr        = input->data;
    task->table_addr_offset = input->inputAdv;

    // Indices: each entry is an X/Y pair, hence one extra doubling on top of the component size.
    uint16_t const idxCompLog2  = _cupvaSamplerGetIndexBppLog2(indices->type);
    uint16_t const idxEntryLog2 = idxCompLog2 + _cupvaSamplerGetIndexPpeLog2(mode);
    uint32_t const idxRowBytes  = (uint32_t)idxPitch << idxEntryLog2;
    task->fraction_bits         = indices->fractionalBits;
    task->x_offset              = indices->offsetX;
    task->y_offset              = indices->offsetY;
    task->index_agen_addr       = indices->data;
    task->index_agen_fmt        = (uint8_t)indices->interleaving;
    task->index_agen_lane_ofst  = 0U;
    task->index_agen_AMOD1      = (int32_t)CUPVA_SAMPLER_AMOD1;
    task->index_agen_AMOD2 = _cupvaSamplerAmod2(idxRowBytes, n1, (int32_t)CUPVA_SAMPLER_AMOD1, idxEntryLog2);

    // Output: elements have the same size as table entries.
    uint16_t const outElemLog2   = _cupvaSamplerGetInputBppLog2(input->type);
    uint32_t const outputLPBytes = (uint32_t)output->pitch << outElemLog2;

    // Only TRANS_MODE_1 is supported, so every non-NONE mode is programmed as 1.
    uint8_t transField   = 0U;
    uint8_t transHOffset = 0U;
    if (output->transMode != TRANS_MODE_NONE)
    {
        transField   = 1U;
        transHOffset = (uint8_t)(1U << outElemLog2);
    }

    CUPVA_VPU_ASSERT(outputLPBytes >= transHOffset);
    uint32_t const laneOfst32 = (outputLPBytes - (uint32_t)transHOffset) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
    CUPVA_VPU_ASSERT(laneOfst32 <= UINT16_MAX);

    // Linear defaults, overridden below for transposed output.
    uint16_t laneOffset    = 0U;
    int32_t outAmod1       = (int32_t)CUPVA_SAMPLER_AMOD1;
    uint32_t roundAdvBytes = outputLPBytes;
    if (output->transMode != TRANS_MODE_NONE)
    {
        laneOffset    = (uint16_t)laneOfst32;
        outAmod1      = (int32_t)(((uint32_t)CUPVA_SAMPLER_AMOD1 >> outElemLog2) * outputLPBytes);
        roundAdvBytes = transHOffset;
    }

    int32_t const roundAmod2 = _cupvaSamplerAmod2(roundAdvBytes, n1, outAmod1, outElemLog2);

    task->output_agen_addr      = output->data;
    task->output_agen_fmt       = (uint8_t)(transField << CUPVA_SAMPLER_OUTPUT_TRANS_MODE_SHIFT);
    task->output_agen_lane_ofst = laneOffset;
    task->output_agen_AMOD1     = outAmod1;
    // A zero pitch means the output does not advance between rounds.
    if (outputLPBytes == 0U)
    {
        task->output_agen_AMOD2 = (int32_t)0;
    }
    else
    {
        task->output_agen_AMOD2 = roundAmod2;
    }

    task->next_task = NULL;

    cupvaSamplerSetupAddrRangeChecks(task);
}

// Configures `task` as a 2D interpolation task with automatically generated indices
// (SAMPLER_INTERP_2D_AUTO_IDX).
//
// task   - descriptor to fill. index_agen_AMOD2, index_agen_fmt, index_agen_lane_ofst and the
//          reserved members are not written; the rounding field of task_info stays 0.
// input  - 2D table description. A linePitch of 0 is replaced by the width unless
//          NO_DEFAULT_LINE_PITCH is set in input->flags, matching the CupvaSamplerIndices2D overload.
// tiles  - tile description. width * height becomes the N1 task length and count the N2 length.
// output - destination buffer; output->pitch is counted in elements of input->type.
//
// The index type field is always programmed as SAMPLER_INDEX_TYPE_U32. Duplicate handling is
// enabled and the task is left unlinked (next_task == NULL).
inline void cupvaSamplerSetup(CupvaSampler *task, CupvaSamplerInput2D const *input, CupvaSamplerTiles const *tiles,
                              CupvaSamplerOutput const *output)
{
    CupvaSamplerMode const taskMode = SAMPLER_INTERP_2D_AUTO_IDX;
    // Both factors are 8-bit, so the product always fits in 16 bits.
    uint16_t const taskN1           = (uint16_t)(tiles->width) * tiles->height;
    uint16_t const taskN2           = tiles->count;
    // Honour NO_DEFAULT_LINE_PITCH so that an explicit pitch of 0 is not silently replaced.
    CupvaSamplerInput2DFlags const noDefaultLinePitchFlag = NO_DEFAULT_LINE_PITCH;
    uint32_t const inputLinePitch =
        ((input->linePitch == 0U) && ((input->flags & ((uint32_t)noDefaultLinePitchFlag)) == 0U)) ? input->width
                                                                                                  : input->linePitch;

    // Configure task info
    // Enable duplicate detection and consolidation by default.
    task->task_info = CUPVA_SAMPLER_DUPLICATE_HANDLING_EN;
    task->task_info |= (uint16_t)taskMode << CUPVA_SAMPLER_TASKINFO_TASK_MODE_SHIFT;
    task->task_info |= (uint16_t)input->type << CUPVA_SAMPLER_TASKINFO_ENTRY_TYPE_SHIFT;
    task->task_info |= ((uint16_t)SAMPLER_INDEX_TYPE_U32 << CUPVA_SAMPLER_TASKINFO_INDEX_TYPE_SHIFT);
    task->task_info |= (uint16_t)input->outOfRangeMode << CUPVA_SAMPLER_TASKINFO_OUT_OF_RANGE_HANDLING_SHIFT;
    task->task_len_N1    = taskN1;
    task->task_len_N2    = taskN2;
    task->sentinel_value = input->outOfRangeVal;

    // Configure table data.
    task->table_addr        = input->data;
    task->table_addr_offset = input->inputAdv;
    task->table_line_pitch  = inputLinePitch;
    task->x_int_limit       = input->width;
    task->y_int_limit       = input->height;

    // Configure index.
    task->x_offset              = tiles->offsetX;
    task->y_offset              = tiles->offsetY;
    task->fraction_bits         = tiles->fractionalBits;
    task->auto_idx_patch_width  = tiles->width;
    task->auto_idx_patch_height = tiles->height;
    task->index_agen_addr       = tiles->data;
    task->index_agen_AMOD1      = (int32_t)CUPVA_SAMPLER_AMOD1;
    // Traversal direction sits in bit 0, the tile type in the MODE bit.
    task->auto_idx_fmt = (uint8_t)tiles->tileTraversalMode | ((uint8_t)tiles->type << CUPVA_SAMPLER_AUTOIDX_MODE_SHIFT);

    // Configure output. Output elements have the same size as table entries.
    uint16_t const outputBpp     = _cupvaSamplerGetInputBppLog2(input->type);
    uint32_t const outputLPBytes = (uint32_t)output->pitch << outputBpp;
    // Only TRANS_MODE_1 is supported
    uint8_t const transMode    = (output->transMode == TRANS_MODE_NONE) ? 0U : 1U;
    // In transposed mode consecutive rounds are one element apart in the output.
    uint8_t const transHOffset = (output->transMode == TRANS_MODE_NONE) ? 0U : (1U << outputBpp);
    CUPVA_VPU_ASSERT(outputLPBytes >= transHOffset);
    uint32_t const laneOfst32 = (outputLPBytes - (uint32_t)transHOffset) / VMEM_SUPERBANK_WIDTH_IN_BYTES;
    CUPVA_VPU_ASSERT(laneOfst32 <= UINT16_MAX);
    uint16_t const laneOfst       = (uint16_t)laneOfst32;
    task->output_agen_fmt         = (transMode << CUPVA_SAMPLER_OUTPUT_TRANS_MODE_SHIFT);
    task->output_agen_lane_ofst   = (output->transMode == TRANS_MODE_NONE) ? 0U : laneOfst;
    task->output_agen_addr        = output->data;
    uint32_t const transposeAmod1 = ((uint32_t)CUPVA_SAMPLER_AMOD1 >> outputBpp) * outputLPBytes;
    task->output_agen_AMOD1 =
        (output->transMode == TRANS_MODE_NONE) ? (int32_t)CUPVA_SAMPLER_AMOD1 : (int32_t)transposeAmod1;
    uint32_t const outputN2Bytes = (output->transMode == TRANS_MODE_NONE) ? outputLPBytes : transHOffset;
    int32_t const outAmod2       = _cupvaSamplerAmod2(outputN2Bytes, taskN1, task->output_agen_AMOD1, outputBpp);
    // A zero pitch means the output does not advance between rounds.
    task->output_agen_AMOD2      = (outputLPBytes == 0U) ? (int32_t)0 : outAmod2;

    task->next_task = NULL;

    cupvaSamplerSetupAddrRangeChecks(task);
}

// Starts the sampler on the task chain beginning at `head`. Does not wait for completion;
// pair with cupvaSamplerWait().
//
// Non-native builds: issue a memory fence, store the address of `head` to CUPVA_SAMPLER_CFG_ADDR,
// wait until the CUPVA_SAMPLER_GPIO input bit reads 0, then set the matching output bit.
// Native builds: forward the descriptor to start_dlut().
inline void cupvaSamplerStart(CupvaSampler const *head)
{
#if CUPVA_BUILD_MODE != CUPVA_NATIVE
    // NOTE(review): presumably orders the descriptor writes before the pointer is published -- confirm.
    chess_memory_fence();
    cp_store((intptr_t)head, CUPVA_SAMPLER_CFG_ADDR);
    // With profiling enabled the wait is a busy poll of gpi_rd() instead of wfe_gpi().
#    if defined(CUPVA_PROFILING_FLAG) && CUPVA_PROFILING_FLAG == 1
    while ((gpi_rd() & CUPVA_SAMPLER_GPIO) != 0); // wait for DONE to be deasserted
#    else
    wfe_gpi(CUPVA_SAMPLER_GPIO, 0); // wait for DONE to be deasserted
#    endif
    gpo_setl(CUPVA_SAMPLER_GPIO); // assert START
#else
    start_dlut((const dlut_param_cfg *)head);
#endif
}

// Blocks until the sampler signals completion, then releases the start signal.
//
// Non-native builds: wait until the CUPVA_SAMPLER_GPIO input bit is set, then clear the matching
// output bit. Native builds: delegate to check_dlut_done_and_clear_start().
inline void cupvaSamplerWait(void)
{
#if CUPVA_BUILD_MODE != CUPVA_NATIVE
    // With profiling enabled the wait is a busy poll of gpi_rd() instead of wfe_gpi().
#    if defined(CUPVA_PROFILING_FLAG) && CUPVA_PROFILING_FLAG == 1
    while ((gpi_rd() & CUPVA_SAMPLER_GPIO) == 0); // wait for DONE to be asserted
#    else
    wfe_gpi(CUPVA_SAMPLER_GPIO, CUPVA_SAMPLER_GPIO); // wait for DONE to be asserted
#    endif
    gpo_clrl(CUPVA_SAMPLER_GPIO); // deassert START
#else
    check_dlut_done_and_clear_start();
#endif
}

// Replaces the table, index and output base addresses of an already configured task.
// All three addresses are overwritten unconditionally; no other member is changed.
// In native builds the address range checks are refreshed afterwards.
inline void cupvaSamplerUpdateAddrs(CupvaSampler *task, void const *input, void const *indices, void *output)
{
    task->table_addr       = input;
    task->index_agen_addr  = indices;
    task->output_agen_addr = output;

    cupvaSamplerSetupAddrRangeChecks(task);
}

// Replaces the table extents and line pitch of an already configured task.
// The values are stored verbatim: unlike cupvaSamplerSetup(), a linePitch of 0 is NOT replaced
// by the width here. In native builds the address range checks are refreshed afterwards.
inline void cupvaSamplerUpdateInputAttributes(CupvaSampler *task, uint32_t const width, uint32_t const height,
                                              uint32_t const linePitch)
{
    task->x_int_limit      = width;
    task->y_int_limit      = height;
    task->table_line_pitch = linePitch;

    cupvaSamplerSetupAddrRangeChecks(task);
}

// Chains `next` after `task` by storing it in task->next_task. Passing NULL ends the chain at
// `task`. Note that cupvaSamplerSetup() resets next_task to NULL, so link after setup.
inline void cupvaSamplerLink(CupvaSampler *task, CupvaSampler const *next)
{
    task->next_task = next;
}

#endif