pvaAplSortVpu.hpp

Fully qualified name: public/src/primitive/pvaAplSortVpu.hpp
File members: public/src/primitive/pvaAplSortVpu.hpp
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#ifndef PVA_APL_SORT_VPU_HPP
#define PVA_APL_SORT_VPU_HPP

#include <cupva_device.h>
#include <stdio.h>
#include <string.h>

#include <type_traits>

namespace pvaApl {

// Traits struct mapping the scalar element type (DataType) to the vector
// types used by SortVpu. The primary template is declared but never defined,
// so using a DataType without an explicit specialization is a compile error.
template<typename DataType>
struct SortVectorTypes;

// Vector types for unsigned 16-bit elements.
//   DVTYPE     - pointee type used when binding a buffer to an agen (init((DVTYPE *)ptr))
//   DVTYPEX    - vector type the sort kernels load into and compare
//   PMT_VTYPEX - type of the permutation vector handed to the permuted loads
// The types are the same as for int16_t; signedness is selected by the load
// intrinsic chosen in SortVpu::DVLOAD and its variants.
template<>
struct SortVectorTypes<uint16_t>
{
    using DVTYPE = dvshort;
    using DVTYPEX = dvshortx;
    using PMT_VTYPEX = vcharx;
};

// Vector types for signed 16-bit elements.
//   DVTYPE     - pointee type used when binding a buffer to an agen (init((DVTYPE *)ptr))
//   DVTYPEX    - vector type the sort kernels load into and compare
//   PMT_VTYPEX - type of the permutation vector handed to the permuted loads
template<>
struct SortVectorTypes<int16_t>
{
    using DVTYPE = dvshort;
    using DVTYPEX = dvshortx;
    using PMT_VTYPEX = vcharx;
};

// Vector types for unsigned 32-bit elements.
//   DVTYPE     - pointee type used when binding a buffer to an agen (init((DVTYPE *)ptr))
//   DVTYPEX    - vector type the sort kernels load into and compare
//   PMT_VTYPEX - type of the permutation vector handed to the permuted loads
// The types are the same as for int32_t; signedness is selected by the load
// intrinsic chosen in SortVpu::DVLOAD and its variants.
template<>
struct SortVectorTypes<uint32_t>
{
    using DVTYPE = dvint;
    using DVTYPEX = dvintx;
    using PMT_VTYPEX = vshortx;
};

// Vector types for signed 32-bit elements.
//   DVTYPE     - pointee type used when binding a buffer to an agen (init((DVTYPE *)ptr))
//   DVTYPEX    - vector type the sort kernels load into and compare
//   PMT_VTYPEX - type of the permutation vector handed to the permuted loads
template<>
struct SortVectorTypes<int32_t>
{
    using DVTYPE = dvint;
    using DVTYPEX = dvintx;
    using PMT_VTYPEX = vshortx;
};

// Working storage for a sort: a table of agen configurations plus a raw
// byte scratch buffer.
struct SortContext
{
    // Table of 128 agen configurations.
    // NOTE(review): presumably filled by the SortVpu *_init helpers and read
    // back by the sort_height_* drivers - confirm against the caller.
    AgenCFG cfgs[128];
    // Scratch size in bytes: (16 + 1) * 512 elements of sizeof(uint32_t) each.
    // NOTE(review): the meaning of the (16 + 1) and 512 factors is not visible
    // in this file section - confirm before changing them.
    static constexpr size_t SCRATCH_SIZE = ((16 + 1) * 512 * sizeof(uint32_t));
    uint8_t scratch[SCRATCH_SIZE];
};

template<typename DataType, int Size>
class SortVpu
{
private:
    using DVTYPE     = typename SortVectorTypes<DataType>::DVTYPE;
    using DVTYPEX    = typename SortVectorTypes<DataType>::DVTYPEX;
    using PMT_VTYPEX = typename SortVectorTypes<DataType>::PMT_VTYPEX;

    template<typename AgenType>
    auto DVLOAD(AgenType &ag) -> decltype(auto)
    {
        if constexpr (std::is_same_v<DataType, uint16_t>)
        {
            return static_cast<dvshortx>(dvushort_load(ag));
        }
        else if constexpr (std::is_same_v<DataType, int16_t>)
        {
            return static_cast<dvshortx>(dvshort_load(ag));
        }
        else if constexpr (std::is_same_v<DataType, uint32_t>)
        {
            return static_cast<dvintx>(dvuint_load(ag));
        }
        else if constexpr (std::is_same_v<DataType, int32_t>)
        {
            return static_cast<dvintx>(dvint_load(ag));
        }
        else
        {
            static_assert(!std::is_same_v<DataType, DataType>, "Unsupported type for dvload");
        }
    }

    // Transposing counterpart of DVLOAD: loads the next vector through `ag`
    // with the *_load_transp intrinsic that matches the width and signedness
    // of DataType, and returns it as the signed register type (dvshortx for
    // 16-bit elements, dvintx for 32-bit elements). Any other DataType is
    // rejected at compile time.
    template<typename AgenType>
    auto DVLOAD_TRANSP(AgenType &ag) -> decltype(auto)
    {
        if constexpr (std::is_same_v<DataType, uint16_t>)
        {
            return static_cast<dvshortx>(dvushort_load_transp(ag));
        }
        else if constexpr (std::is_same_v<DataType, int16_t>)
        {
            return static_cast<dvshortx>(dvshort_load_transp(ag));
        }
        else if constexpr (std::is_same_v<DataType, uint32_t>)
        {
            return static_cast<dvintx>(dvuint_load_transp(ag));
        }
        else if constexpr (std::is_same_v<DataType, int32_t>)
        {
            return static_cast<dvintx>(dvint_load_transp(ag));
        }
        else
        {
            // Dependent-false condition: only fires when this branch is
            // instantiated. The message names this helper's operation so the
            // diagnostic points at the right load variant.
            static_assert(!std::is_same_v<DataType, DataType>, "Unsupported type for dvload_transp");
        }
    }

    // Permuting + transposing load: loads the next vector through `ag` with
    // the *_load_perm_transp intrinsic that matches the width and signedness
    // of DataType, passing `vx` as the permutation vector. The result is
    // returned as the signed register type (dvshortx for 16-bit elements,
    // dvintx for 32-bit elements). Any other DataType is rejected at compile
    // time.
    template<typename AgenType, typename VecType>
    auto DVLOAD_PERM_TRANSP(AgenType &ag, VecType vx) -> decltype(auto)
    {
        if constexpr (std::is_same_v<DataType, uint16_t>)
        {
            return static_cast<dvshortx>(dvushort_load_perm_transp(ag, vx));
        }
        else if constexpr (std::is_same_v<DataType, int16_t>)
        {
            return static_cast<dvshortx>(dvshort_load_perm_transp(ag, vx));
        }
        else if constexpr (std::is_same_v<DataType, uint32_t>)
        {
            return static_cast<dvintx>(dvuint_load_perm_transp(ag, vx));
        }
        else if constexpr (std::is_same_v<DataType, int32_t>)
        {
            return static_cast<dvintx>(dvint_load_perm_transp(ag, vx));
        }
        else
        {
            // Dependent-false condition: only fires when this branch is
            // instantiated. The message names the operation actually
            // dispatched by this helper.
            static_assert(!std::is_same_v<DataType, DataType>, "Unsupported type for dvload_perm_transp");
        }
    }

    // Builds a permutation vector holding descending indices:
    // 31..0 as vcharx for 16-bit element types, 15..0 as vshortx for 32-bit
    // element types. Any other DataType is rejected at compile time.
    // NOTE(review): the result is presumably the `pmt` operand of
    // DVLOAD_PERM_TRANSP (lane reversal) - confirm against the caller.
    auto PMT_LOAD() -> decltype(auto)
    {
        if constexpr (std::is_same_v<DataType, uint16_t> || std::is_same_v<DataType, int16_t>)
        {
            // NOTE(review): this stack array is reinterpreted as a vchar; that
            // assumes the array meets the vector type's alignment and
            // addressing requirements - confirm for the target.
            int8_t pmtArr[] = {31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
                               15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0};
            return static_cast<vcharx>(sign_extend(*((vchar *)(pmtArr))));
        }
        else if constexpr (std::is_same_v<DataType, uint32_t> || std::is_same_v<DataType, int32_t>)
        {
            // Same review note as above applies to the vshort reinterpretation.
            int16_t pmtArr[] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
            return static_cast<vshortx>(sign_extend(*((vshort *)(pmtArr))));
        }
        else
        {
            // Dependent-false condition: only fires when this branch is
            // instantiated. The message names this helper rather than the
            // load helper it was copied from.
            static_assert(!std::is_same_v<DataType, DataType>, "Unsupported type for pmt_load");
        }
    }

    // Compare-exchange of two vectors: calls dvsort2 with v1 and v2 as both
    // the inputs and the outputs, so the pair is updated in place.
    // NOTE(review): presumably dvsort2 leaves the lane-wise smaller values in
    // v1 and the larger ones in v2 - confirm against the intrinsic reference.
    inline void VSORT2_INPLACE(DVTYPEX &v1, DVTYPEX &v2)
    {
        dvsort2(v1, v2, v1, v2);
    }

    // Base case of the MERGE_* family: a single compare-exchange of v1 and v2.
    inline void MERGE_2_WAY(DVTYPEX &v1, DVTYPEX &v2)
    {
        VSORT2_INPLACE(v1, v2);
    }

    // 4-input merge network, updated in place: first compare-exchanges the
    // vectors two apart (v1/v3, v2/v4), then merges each half with
    // MERGE_2_WAY. This is the comparator pattern of a 4-input bitonic merger.
    inline void MERGE_4_WAY(DVTYPEX &v1, DVTYPEX &v2, DVTYPEX &v3, DVTYPEX &v4)
    {
        // Distance-2 exchanges.
        VSORT2_INPLACE(v1, v3);
        VSORT2_INPLACE(v2, v4);
        // Finish each half independently.
        MERGE_2_WAY(v1, v2);
        MERGE_2_WAY(v3, v4);
    }

    // 8-input merge network, updated in place: first compare-exchanges the
    // vectors four apart (v1/v5 .. v4/v8), then merges each half of four with
    // MERGE_4_WAY. This is the comparator pattern of an 8-input bitonic merger.
    inline void MERGE_8_WAY(DVTYPEX &v1, DVTYPEX &v2, DVTYPEX &v3, DVTYPEX &v4, DVTYPEX &v5, DVTYPEX &v6, DVTYPEX &v7,
                            DVTYPEX &v8)
    {
        // Distance-4 exchanges.
        VSORT2_INPLACE(v1, v5);
        VSORT2_INPLACE(v2, v6);
        VSORT2_INPLACE(v3, v7);
        VSORT2_INPLACE(v4, v8);
        // Finish each half independently.
        MERGE_4_WAY(v1, v2, v3, v4);
        MERGE_4_WAY(v5, v6, v7, v8);
    }

    // 16-input merge network, updated in place: first compare-exchanges the
    // vectors eight apart (v1/v9 .. v8/v16), then merges each half of eight
    // with MERGE_8_WAY.
    inline void MERGE_16_WAY(DVTYPEX &v1, DVTYPEX &v2, DVTYPEX &v3, DVTYPEX &v4, DVTYPEX &v5, DVTYPEX &v6, DVTYPEX &v7,
                             DVTYPEX &v8, DVTYPEX &v9, DVTYPEX &v10, DVTYPEX &v11, DVTYPEX &v12, DVTYPEX &v13,
                             DVTYPEX &v14, DVTYPEX &v15, DVTYPEX &v16)
    {
        // Distance-8 exchanges.
        VSORT2_INPLACE(v1, v9);
        VSORT2_INPLACE(v2, v10);
        VSORT2_INPLACE(v3, v11);
        VSORT2_INPLACE(v4, v12);
        VSORT2_INPLACE(v5, v13);
        VSORT2_INPLACE(v6, v14);
        VSORT2_INPLACE(v7, v15);
        VSORT2_INPLACE(v8, v16);
        // Finish each half independently.
        MERGE_8_WAY(v1, v2, v3, v4, v5, v6, v7, v8);
        MERGE_8_WAY(v9, v10, v11, v12, v13, v14, v15, v16);
    }

    // Variant of MERGE_16_WAY whose first layer pairs the vectors mirrored
    // around the middle (v1/v16, v2/v15, .. v8/v9) instead of eight apart;
    // the two halves are then merged with MERGE_8_WAY exactly as in
    // MERGE_16_WAY.
    // NOTE(review): presumably used when the second half arrives in the same
    // order as the first (not reversed) - no caller is visible here, confirm.
    inline void MERGE_16_WAY_R(DVTYPEX &v1, DVTYPEX &v2, DVTYPEX &v3, DVTYPEX &v4, DVTYPEX &v5, DVTYPEX &v6,
                               DVTYPEX &v7, DVTYPEX &v8, DVTYPEX &v9, DVTYPEX &v10, DVTYPEX &v11, DVTYPEX &v12,
                               DVTYPEX &v13, DVTYPEX &v14, DVTYPEX &v15, DVTYPEX &v16)
    {
        // Mirrored first layer: vector i is exchanged with vector 17 - i.
        VSORT2_INPLACE(v1, v16);
        VSORT2_INPLACE(v2, v15);
        VSORT2_INPLACE(v3, v14);
        VSORT2_INPLACE(v4, v13);
        VSORT2_INPLACE(v5, v12);
        VSORT2_INPLACE(v6, v11);
        VSORT2_INPLACE(v7, v10);
        VSORT2_INPLACE(v8, v9);
        // Finish each half independently.
        MERGE_8_WAY(v1, v2, v3, v4, v5, v6, v7, v8);
        MERGE_8_WAY(v9, v10, v11, v12, v13, v14, v15, v16);
    }

    // First sort pass: per iteration, loads 8 vectors through the agen built
    // from cfgs_in1[0], runs them through a fixed 19-comparator odd-even merge
    // sorting network (VSORT2_INPLACE on vector pairs), and stores the 8
    // results through the agen built from cfgs_out[0].
    //
    // cfgs_in1 - agen configuration for the input stream (only element 0 is read)
    // cfgs_out - agen configuration for the output stream (only element 0 is read)
    // height   - number of vectors to process; the loop runs height >> 3
    //            times, so a remainder of height modulo 8 is not processed
    //
    // NOTE(review): chess_loop_range(2, ) presumably promises the compiler at
    // least 2 iterations (height >= 16) - confirm against the callers.
    void even_odd_merge_sort(AgenCFG *cfgs_in1, AgenCFG *cfgs_out, int height)
    {

        DVTYPEX v1, v2, v3, v4, v5, v6, v7, v8;
        agen in, out;

        in  = init_agen_from_cfg(cfgs_in1[0]);
        out = init_agen_from_cfg(cfgs_out[0]);

        int niter = height >> 3;

        for (int i = 0; i < niter; i++) chess_loop_range(2, )
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD(in);
            v2 = DVLOAD(in);
            v3 = DVLOAD(in);
            v4 = DVLOAD(in);
            v5 = DVLOAD(in);
            v6 = DVLOAD(in);
            v7 = DVLOAD(in);
            v8 = DVLOAD(in);

            // Sort adjacent pairs.
            VSORT2_INPLACE(v1, v2);
            VSORT2_INPLACE(v3, v4);
            VSORT2_INPLACE(v5, v6);
            VSORT2_INPLACE(v7, v8);

            // Odd-even merge of the pairs into two sorted groups of four
            // (v1..v4 and v5..v8).
            VSORT2_INPLACE(v1, v3);
            VSORT2_INPLACE(v5, v7);
            VSORT2_INPLACE(v2, v4);
            VSORT2_INPLACE(v6, v8);

            VSORT2_INPLACE(v2, v3);
            VSORT2_INPLACE(v6, v7);

            // Odd-even merge of the two groups of four into one group of eight.
            VSORT2_INPLACE(v1, v5);
            VSORT2_INPLACE(v2, v6);
            VSORT2_INPLACE(v3, v7);
            VSORT2_INPLACE(v4, v8);

            VSORT2_INPLACE(v3, v5);
            VSORT2_INPLACE(v4, v6);

            VSORT2_INPLACE(v2, v3);
            VSORT2_INPLACE(v4, v5);
            VSORT2_INPLACE(v6, v7);

            vstore(v1, out);
            vstore(v2, out);
            vstore(v3, out);
            vstore(v4, out);
            vstore(v5, out);
            vstore(v6, out);
            vstore(v7, out);
            vstore(v8, out);
        }
    }

    // 8-way merge pass fed by two input streams: per iteration, loads 4
    // vectors through the agen built from cfgs_in1[0] (v1..v4) and 4 through
    // the agen built from cfgs_in2[0] (v5..v8), applies MERGE_8_WAY, and
    // stores the 8 results through the agen built from cfgs_out[0].
    //
    // cfgs_in1, cfgs_in2 - agen configurations of the two input streams
    // cfgs_out           - agen configuration of the output stream
    // height             - number of vectors to process; the loop runs
    //                      height >> 3 times
    //
    // NOTE(review): the second stream is presumably the one that
    // bitonic_merge_n_way_reverse_init configures with negative strides, so
    // that it is read in reverse order - confirm against the driver.
    // NOTE(review): chess_loop_range(2, ) presumably promises at least 2
    // iterations (height >= 16) - confirm against the callers.
    void bitonic_merge_8_way_reverse(AgenCFG *cfgs_in1, AgenCFG *cfgs_in2, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2, v3, v4, v5, v6, v7, v8;
        agen_A in1, in2;
        agen_B out;

        in1 = init_agen_A_from_cfg(cfgs_in1[0]);
        in2 = init_agen_A_from_cfg(cfgs_in2[0]);
        out = init_agen_B_from_cfg(cfgs_out[0]);

        int niter = height >> 3;

        for (int i = 0; i < niter; i++) chess_loop_range(2, )
        chess_prepare_for_pipelining
        {
            // First half of the group comes from stream 1.
            v1 = DVLOAD(in1);
            v2 = DVLOAD(in1);
            v3 = DVLOAD(in1);
            v4 = DVLOAD(in1);

            // Second half of the group comes from stream 2.
            v5 = DVLOAD(in2);
            v6 = DVLOAD(in2);
            v7 = DVLOAD(in2);
            v8 = DVLOAD(in2);

            MERGE_8_WAY(v1, v2, v3, v4, v5, v6, v7, v8);

            vstore(v1, out);
            vstore(v2, out);
            vstore(v3, out);
            vstore(v4, out);
            vstore(v5, out);
            vstore(v6, out);
            vstore(v7, out);
            vstore(v8, out);
        }
    }

    // 8-way merge pass with transposing stores: per iteration, loads 8
    // vectors through the agen built from cfgs_in1[0] with the plain load,
    // applies MERGE_8_WAY, and writes the 8 results through the agen built
    // from cfgs_out[0] using vstore_transp.
    //
    // cfgs_in1 - agen configuration of the input stream
    // cfgs_out - agen configuration of the output stream
    // height   - number of vectors to process; the loop runs height >> 3 times
    //
    // NOTE(review): chess_loop_range(2, ) presumably promises at least 2
    // iterations (height >= 16) - confirm against the callers.
    void bitonic_merge_8_way_transp(AgenCFG *cfgs_in1, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2, v3, v4, v5, v6, v7, v8;
        agen in, out;

        in  = init_agen_from_cfg(cfgs_in1[0]);
        out = init_agen_from_cfg(cfgs_out[0]);

        int niter = height >> 3;

        for (int i = 0; i < niter; i++) chess_loop_range(2, )
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD(in);
            v2 = DVLOAD(in);
            v3 = DVLOAD(in);
            v4 = DVLOAD(in);
            v5 = DVLOAD(in);
            v6 = DVLOAD(in);
            v7 = DVLOAD(in);
            v8 = DVLOAD(in);

            MERGE_8_WAY(v1, v2, v3, v4, v5, v6, v7, v8);

            // Only the stores are transposing in this variant.
            vstore_transp(v1, out);
            vstore_transp(v2, out);
            vstore_transp(v3, out);
            vstore_transp(v4, out);
            vstore_transp(v5, out);
            vstore_transp(v6, out);
            vstore_transp(v7, out);
            vstore_transp(v8, out);
        }
    }

    // Plain 8-way merge pass: per iteration, loads 8 vectors through the agen
    // built from cfgs_in1[0], applies MERGE_8_WAY, and stores the 8 results
    // through the agen built from cfgs_out[0].
    //
    // cfgs_in1 - agen configuration of the input stream
    // cfgs_out - agen configuration of the output stream
    // height   - number of vectors to process; the loop runs height >> 3 times
    //
    // NOTE(review): chess_loop_range(2, ) presumably promises at least 2
    // iterations (height >= 16) - confirm against the callers.
    void bitonic_merge_8_way(AgenCFG *cfgs_in1, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2, v3, v4, v5, v6, v7, v8;
        agen in, out;

        in  = init_agen_from_cfg(cfgs_in1[0]);
        out = init_agen_from_cfg(cfgs_out[0]);

        int niter = height >> 3;

        for (int i = 0; i < niter; i++) chess_loop_range(2, )
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD(in);
            v2 = DVLOAD(in);
            v3 = DVLOAD(in);
            v4 = DVLOAD(in);
            v5 = DVLOAD(in);
            v6 = DVLOAD(in);
            v7 = DVLOAD(in);
            v8 = DVLOAD(in);

            MERGE_8_WAY(v1, v2, v3, v4, v5, v6, v7, v8);

            vstore(v1, out);
            vstore(v2, out);
            vstore(v3, out);
            vstore(v4, out);
            vstore(v5, out);
            vstore(v6, out);
            vstore(v7, out);
            vstore(v8, out);
        }
    }

    // 4-way merge pass fed by two input streams: per iteration, loads 2
    // vectors through the agen built from cfgs_in1[0] (v1, v2) and 2 through
    // the agen built from cfgs_in2[0] (v3, v4), applies MERGE_4_WAY, and
    // stores the 4 results through the agen built from cfgs_out[0].
    //
    // cfgs_in1, cfgs_in2 - agen configurations of the two input streams
    // cfgs_out           - agen configuration of the output stream
    // height             - number of vectors to process; the loop runs
    //                      height >> 2 times
    //
    // NOTE(review): the second stream is presumably the one that
    // bitonic_merge_n_way_reverse_init configures with negative strides -
    // confirm against the driver.
    // NOTE(review): chess_loop_range(8, ) with chess_unroll_loop(2) presumably
    // requires at least 8 iterations (height >= 32) - confirm.
    void bitonic_merge_4_way_reverse(AgenCFG *cfgs_in1, AgenCFG *cfgs_in2, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2, v3, v4;
        agen_A in1, in2;
        agen_B out;

        in1 = init_agen_A_from_cfg(cfgs_in1[0]);
        in2 = init_agen_A_from_cfg(cfgs_in2[0]);
        out = init_agen_B_from_cfg(cfgs_out[0]);

        int niter = height >> 2;

        for (int i = 0; i < niter; i++) chess_loop_range(8, )
        chess_unroll_loop(2)
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD(in1);
            v2 = DVLOAD(in1);
            v3 = DVLOAD(in2);
            v4 = DVLOAD(in2);

            MERGE_4_WAY(v1, v2, v3, v4);

            vstore(v1, out);
            vstore(v2, out);
            vstore(v3, out);
            vstore(v4, out);
        }
    }

    // 4-way merge pass with transposing stores: per iteration, loads 4
    // vectors through the agen built from cfgs_in1[0] with the plain load,
    // applies MERGE_4_WAY, and writes the 4 results through the agen built
    // from cfgs_out[0] using vstore_transp.
    //
    // cfgs_in1 - agen configuration of the input stream
    // cfgs_out - agen configuration of the output stream
    // height   - number of vectors to process; the loop runs height >> 2 times
    //
    // NOTE(review): chess_loop_range(8, ) with chess_unroll_loop(2) presumably
    // requires at least 8 iterations (height >= 32) - confirm.
    void bitonic_merge_4_way_transp(AgenCFG *cfgs_in1, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2, v3, v4;
        agen in, out;

        in  = init_agen_from_cfg(cfgs_in1[0]);
        out = init_agen_from_cfg(cfgs_out[0]);

        int niter = height >> 2;

        for (int i = 0; i < niter; i++) chess_loop_range(8, )
        chess_unroll_loop(2)
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD(in);
            v2 = DVLOAD(in);
            v3 = DVLOAD(in);
            v4 = DVLOAD(in);

            MERGE_4_WAY(v1, v2, v3, v4);

            // Only the stores are transposing in this variant.
            vstore_transp(v1, out);
            vstore_transp(v2, out);
            vstore_transp(v3, out);
            vstore_transp(v4, out);
        }
    }

    // Plain 4-way merge pass: per iteration, loads 4 vectors through the agen
    // built from cfgs_in1[0], applies MERGE_4_WAY, and stores the 4 results
    // through the agen built from cfgs_out[0].
    //
    // cfgs_in1 - agen configuration of the input stream
    // cfgs_out - agen configuration of the output stream
    // height   - number of vectors to process; the loop runs height >> 2 times
    //
    // NOTE(review): chess_loop_range(8, ) with chess_unroll_loop(2) presumably
    // requires at least 8 iterations (height >= 32) - confirm.
    void bitonic_merge_4_way(AgenCFG *cfgs_in1, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2, v3, v4;
        agen in, out;

        in  = init_agen_from_cfg(cfgs_in1[0]);
        out = init_agen_from_cfg(cfgs_out[0]);

        int niter = height >> 2;

        for (int i = 0; i < niter; i++) chess_loop_range(8, )
        chess_unroll_loop(2)
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD(in);
            v2 = DVLOAD(in);
            v3 = DVLOAD(in);
            v4 = DVLOAD(in);

            MERGE_4_WAY(v1, v2, v3, v4);

            vstore(v1, out);
            vstore(v2, out);
            vstore(v3, out);
            vstore(v4, out);
        }
    }

    // 8-way merge pass on transposed data, fed by two input streams: per
    // iteration, loads 4 vectors from stream 1 with the transposing load and
    // 4 vectors from stream 2 with the permuting + transposing load (using
    // the permutation read from *pmt_ptr), applies MERGE_8_WAY, and writes
    // the 8 results with vstore_transp.
    //
    // cfgs_in1, cfgs_in2 - agen configurations of the two input streams
    // cfgs_out           - agen configuration of the output stream
    // height             - number of vectors to process; the loop runs
    //                      height >> 3 times
    // pmt_ptr            - permutation vector; read once before the loop
    //
    // NOTE(review): the permutation is presumably the lane reversal built by
    // PMT_LOAD - confirm against the caller.
    // NOTE(review): chess_loop_range(4, ) presumably promises at least 4
    // iterations (height >= 32) - confirm.
    void bitonic_merge_transpose_8_way_reverse(AgenCFG *cfgs_in1, AgenCFG *cfgs_in2, AgenCFG *cfgs_out, int height,
                                               PMT_VTYPEX *pmt_ptr)
    {
        DVTYPEX v1, v2, v3, v4, v5, v6, v7, v8;
        agen_A in1, in2;
        agen_B out;

        in1 = init_agen_A_from_cfg(cfgs_in1[0]);
        in2 = init_agen_A_from_cfg(cfgs_in2[0]);
        out = init_agen_B_from_cfg(cfgs_out[0]);

        // Hoisted out of the loop: the same permutation is used for every load.
        PMT_VTYPEX pmt = *pmt_ptr;

        int niter = height >> 3;

        for (int i = 0; i < niter; i++) chess_loop_range(4, )
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD_TRANSP(in1);
            v2 = DVLOAD_TRANSP(in1);
            v3 = DVLOAD_TRANSP(in1);
            v4 = DVLOAD_TRANSP(in1);

            // Second half is additionally permuted on load.
            v5 = DVLOAD_PERM_TRANSP(in2, pmt);
            v6 = DVLOAD_PERM_TRANSP(in2, pmt);
            v7 = DVLOAD_PERM_TRANSP(in2, pmt);
            v8 = DVLOAD_PERM_TRANSP(in2, pmt);

            MERGE_8_WAY(v1, v2, v3, v4, v5, v6, v7, v8);

            vstore_transp(v1, out);
            vstore_transp(v2, out);
            vstore_transp(v3, out);
            vstore_transp(v4, out);

            vstore_transp(v5, out);
            vstore_transp(v6, out);
            vstore_transp(v7, out);
            vstore_transp(v8, out);
        }
    }

    // 8-way merge pass on transposed data from a single stream: per
    // iteration, loads 8 vectors through the agen built from cfgs_in1[0]
    // with the transposing load, applies MERGE_8_WAY, and writes the 8
    // results through the agen built from cfgs_out[0] with vstore_transp.
    //
    // cfgs_in1 - agen configuration of the input stream
    // cfgs_out - agen configuration of the output stream
    // height   - number of vectors to process; the loop runs height >> 3 times
    //
    // NOTE(review): chess_loop_range(4, ) presumably promises at least 4
    // iterations (height >= 32) - confirm.
    void bitonic_merge_transpose_8_way(AgenCFG *cfgs_in1, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2, v3, v4, v5, v6, v7, v8;
        agen in, out;

        in  = init_agen_from_cfg(cfgs_in1[0]);
        out = init_agen_from_cfg(cfgs_out[0]);

        int niter = height >> 3;

        for (int i = 0; i < niter; i++) chess_loop_range(4, )
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD_TRANSP(in);
            v2 = DVLOAD_TRANSP(in);
            v3 = DVLOAD_TRANSP(in);
            v4 = DVLOAD_TRANSP(in);

            v5 = DVLOAD_TRANSP(in);
            v6 = DVLOAD_TRANSP(in);
            v7 = DVLOAD_TRANSP(in);
            v8 = DVLOAD_TRANSP(in);

            MERGE_8_WAY(v1, v2, v3, v4, v5, v6, v7, v8);

            vstore_transp(v1, out);
            vstore_transp(v2, out);
            vstore_transp(v3, out);
            vstore_transp(v4, out);

            vstore_transp(v5, out);
            vstore_transp(v6, out);
            vstore_transp(v7, out);
            vstore_transp(v8, out);
        }
    }

    // 4-way merge pass on transposed data, fed by two input streams: per
    // iteration, loads 2 vectors from stream 1 with the transposing load and
    // 2 vectors from stream 2 with the permuting + transposing load (using
    // the permutation read from *pmt_ptr), applies MERGE_4_WAY, and writes
    // the 4 results with vstore_transp.
    //
    // cfgs_in1, cfgs_in2 - agen configurations of the two input streams
    // cfgs_out           - agen configuration of the output stream
    // height             - number of vectors to process; the loop runs
    //                      height >> 2 times
    // pmt_ptr            - permutation vector; read once before the loop
    //
    // NOTE(review): chess_loop_range(8, ) with chess_unroll_loop(2) presumably
    // requires at least 8 iterations (height >= 32) - confirm.
    void bitonic_merge_transpose_4_way_reverse(AgenCFG *cfgs_in1, AgenCFG *cfgs_in2, AgenCFG *cfgs_out, int height,
                                               PMT_VTYPEX *pmt_ptr)
    {
        DVTYPEX v1, v2, v3, v4;
        agen_A in1, in2;
        agen_B out;

        in1 = init_agen_A_from_cfg(cfgs_in1[0]);
        in2 = init_agen_A_from_cfg(cfgs_in2[0]);
        out = init_agen_B_from_cfg(cfgs_out[0]);

        // NOTE(review): this scheduler separator appears only in this kernel;
        // presumably it keeps the agen setup from being scheduled together
        // with the code below - confirm why it is needed before removing it.
        chess_separator_scheduler();

        PMT_VTYPEX pmt = *pmt_ptr;

        int niter = height >> 2;

        for (int i = 0; i < niter; i++) chess_loop_range(8, )
        chess_unroll_loop(2)
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD_TRANSP(in1);
            v2 = DVLOAD_TRANSP(in1);
            // Second half is additionally permuted on load.
            v3 = DVLOAD_PERM_TRANSP(in2, pmt);
            v4 = DVLOAD_PERM_TRANSP(in2, pmt);

            MERGE_4_WAY(v1, v2, v3, v4);

            vstore_transp(v1, out);
            vstore_transp(v2, out);
            vstore_transp(v3, out);
            vstore_transp(v4, out);
        }
    }

    // 4-way merge pass on transposed data from a single stream: per
    // iteration, loads 4 vectors through the agen built from cfgs_in1[0]
    // with the transposing load, applies MERGE_4_WAY, and writes the 4
    // results through the agen built from cfgs_out[0] with vstore_transp.
    //
    // cfgs_in1 - agen configuration of the input stream
    // cfgs_out - agen configuration of the output stream
    // height   - number of vectors to process; the loop runs height >> 2 times
    //
    // NOTE(review): chess_loop_range(8, ) with chess_unroll_loop(2) presumably
    // requires at least 8 iterations (height >= 32) - confirm.
    void bitonic_merge_transpose_4_way(AgenCFG *cfgs_in1, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2, v3, v4;
        agen in, out;

        in  = init_agen_from_cfg(cfgs_in1[0]);
        out = init_agen_from_cfg(cfgs_out[0]);

        int niter = height >> 2;

        for (int i = 0; i < niter; i++) chess_loop_range(8, )
        chess_unroll_loop(2)
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD_TRANSP(in);
            v2 = DVLOAD_TRANSP(in);
            v3 = DVLOAD_TRANSP(in);
            v4 = DVLOAD_TRANSP(in);

            MERGE_4_WAY(v1, v2, v3, v4);

            vstore_transp(v1, out);
            vstore_transp(v2, out);
            vstore_transp(v3, out);
            vstore_transp(v4, out);
        }
    }

    // 2-way merge pass on transposed data, fed by two input streams: per
    // iteration, loads one vector from stream 1 with the transposing load and
    // one from stream 2 with the permuting + transposing load (using the
    // permutation read from *pmt_ptr), compare-exchanges the pair, and writes
    // both results with vstore_transp.
    //
    // cfgs_in1, cfgs_in2 - agen configurations of the two input streams
    // cfgs_out           - agen configuration of the output stream
    // height             - number of vectors to process; the loop runs
    //                      height >> 1 times
    // pmt_ptr            - permutation vector; read once before the loop
    //
    // NOTE(review): chess_loop_range(16, ) with chess_unroll_loop(4)
    // presumably requires at least 16 iterations (height >= 32) - confirm.
    void bitonic_merge_transpose_2_way_reverse(AgenCFG *cfgs_in1, AgenCFG *cfgs_in2, AgenCFG *cfgs_out, int height,
                                               PMT_VTYPEX *pmt_ptr)
    {
        DVTYPEX v1, v2;
        agen_A in1, in2;
        agen_B out;

        in1 = init_agen_A_from_cfg(cfgs_in1[0]);
        in2 = init_agen_A_from_cfg(cfgs_in2[0]);
        out = init_agen_B_from_cfg(cfgs_out[0]);

        PMT_VTYPEX pmt = *pmt_ptr;

        int niter = height >> 1;

        for (int i = 0; i < niter; i++) chess_loop_range(16, )
        chess_unroll_loop(4)
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD_TRANSP(in1);
            v2 = DVLOAD_PERM_TRANSP(in2, pmt);

            VSORT2_INPLACE(v1, v2);

            vstore_transp(v1, out);
            vstore_transp(v2, out);
        }
    }

    // 2-way merge pass on transposed data from a single stream: per
    // iteration, loads 2 vectors through the agen built from cfgs_in1[0]
    // with the transposing load, compare-exchanges them, and writes both
    // results through the agen built from cfgs_out[0] with vstore_transp.
    //
    // cfgs_in1 - agen configuration of the input stream
    // cfgs_out - agen configuration of the output stream
    // height   - number of vectors to process; the loop runs height >> 1 times
    //
    // NOTE(review): chess_loop_range(16, ) with chess_unroll_loop(4)
    // presumably requires at least 16 iterations (height >= 32) - confirm.
    void bitonic_merge_transpose_2_way(AgenCFG *cfgs_in1, AgenCFG *cfgs_out, int height)
    {
        DVTYPEX v1, v2;
        agen in, out;

        in  = init_agen_from_cfg(cfgs_in1[0]);
        out = init_agen_from_cfg(cfgs_out[0]);

        int niter = height >> 1;

        for (int i = 0; i < niter; i++) chess_loop_range(16, )
        chess_unroll_loop(4)
        chess_prepare_for_pipelining
        {
            v1 = DVLOAD_TRANSP(in);
            v2 = DVLOAD_TRANSP(in);

            VSORT2_INPLACE(v1, v2);

            vstore_transp(v1, out);
            vstore_transp(v2, out);
        }
    }

    // Prepares the two agen configurations consumed by even_odd_merge_sort
    // and appends them to the configuration list.
    //
    // ping   - input buffer; read as `height` steps with a stride of one
    //          vector width (pva_elementsof(DVTYPEX))
    // pong   - output buffer; written as `height` steps with a stride of lofst
    // cfgs   - cursor into the configuration table; two entries are written
    //          (input first, then output) and the cursor is advanced by two
    // lofst  - stride used for the output stream
    // height - number of steps configured for both streams
    void even_odd_merge_sort_init(DataType *ping, DataType *pong, AgenCFG **cfgs, int lofst, int height)
    {
        int lanes = pva_elementsof(DVTYPEX);
        AgenWrapper wrapper;

        // Input stream: one-dimensional walk over `ping`.
        wrapper.size = sizeof(DataType);
        wrapper.n1   = height;
        wrapper.s1   = lanes;
        agen reader  = init((DVTYPE *)ping);
        INIT_AGEN1(reader, wrapper);

        // Output stream: one-dimensional walk over `pong`, stride lofst.
        wrapper.size = sizeof(DataType);
        wrapper.n1   = height;
        wrapper.s1   = lofst;
        agen writer  = init((DVTYPE *)pong);
        INIT_AGEN1(writer, wrapper);

        // Publish both configurations and move the caller's cursor past them.
        AgenCFG *slot = *cfgs;
        slot[0]       = extract_agen_cfg(reader);
        slot[1]       = extract_agen_cfg(writer);
        *cfgs         = slot + 2;
    }

    // Prepares the three agen configurations (forward input, backward input,
    // output) for the first, "reverse" stage of a transposed n-way merge and
    // appends them to the configuration list.
    //
    // ping, pong - the two working buffers; the source is `ping` when *it is
    //              odd and `pong` when it is even, the destination is the
    //              other one
    // it         - pass counter; incremented by one
    // cfgs       - cursor into the configuration table; three entries are
    //              written and the cursor is advanced by three
    // lofst      - line stride used in the agen strides and lane offset
    // height     - number of lines covered by the pass
    // h_dist     - horizontal distance between vectors that are merged
    // n          - merge radix (number of vectors merged together)
    void bitonic_merge_transpose_n_way_reverse_init(DataType *ping, DataType *pong, int *it, AgenCFG **cfgs, int lofst,
                                                    int height, int h_dist, int n)
    {
        int lanes = pva_elementsof(DVTYPEX);

        // Pick source and destination from the pass parity, then count the pass.
        const bool oddPass = ((*it) & 1) != 0;
        DataType *from     = oddPass ? ping : pong;
        DataType *to       = oddPass ? pong : ping;
        (*it)++;

        AgenWrapper wrapper;
        wrapper.size = sizeof(DataType);

        // Forward input stream: four nested loops starting at the buffer base.
        wrapper.n1 = n / 2;
        wrapper.n2 = lanes / h_dist / n;
        wrapper.n3 = height / lanes;
        wrapper.n4 = h_dist;
        wrapper.s1 = h_dist;
        wrapper.s2 = n * h_dist;
        wrapper.s3 = lanes * lofst;
        wrapper.s4 = 1;

        agen fwd      = init((DVTYPE *)from);
        fwd.lane_ofst = lofst / lanes;
        INIT_AGEN4(fwd, wrapper);
        *(*cfgs)++ = extract_agen_cfg(fwd);

        // Backward input stream: same loop counts, but the base address and
        // the signs of s1, s3 and s4 differ from the forward stream.
        wrapper.s1 = -h_dist;
        wrapper.s2 = n * h_dist;
        wrapper.s3 = -lanes * lofst;
        wrapper.s4 = -1;

        agen bwd      = init((DVTYPE *)(from + (height - lanes) * lofst + h_dist * n - 1));
        bwd.lane_ofst = lofst / lanes;
        INIT_AGEN4(bwd, wrapper);
        *(*cfgs)++ = extract_agen_cfg(bwd);

        // Output stream: three nested loops over the destination buffer.
        wrapper.n1 = lanes / h_dist;
        wrapper.n2 = height / lanes;
        wrapper.n3 = h_dist;
        wrapper.s1 = h_dist;
        wrapper.s2 = lanes * lofst;
        wrapper.s3 = 1;

        agen sink      = init((DVTYPE *)to);
        sink.lane_ofst = lofst / lanes;
        INIT_AGEN3(sink, wrapper);
        *(*cfgs)++ = extract_agen_cfg(sink);
    }

    // Prepares the two agen configurations (input, output) for a
    // non-reversed stage of a transposed n-way merge and appends them to the
    // configuration list. Both streams use the same three-level loop nest;
    // only the buffer differs.
    //
    // ping, pong - the two working buffers; the source is `ping` when *it is
    //              odd and `pong` when it is even, the destination is the
    //              other one
    // it         - pass counter; incremented by one
    // cfgs       - cursor into the configuration table; two entries are
    //              written and the cursor is advanced by two
    // lofst      - line stride used in the agen strides and lane offset
    // height     - number of lines covered by the pass
    // h_dist     - horizontal distance between vectors that are merged
    void bitonic_merge_transpose_n_way_init(DataType *ping, DataType *pong, int *it, AgenCFG **cfgs, int lofst,
                                            int height, int h_dist)
    {
        int lanes = pva_elementsof(DVTYPEX);

        // Pick source and destination from the pass parity, then count the pass.
        const bool oddPass = ((*it) & 1) != 0;
        DataType *from     = oddPass ? ping : pong;
        DataType *to       = oddPass ? pong : ping;
        (*it)++;

        const int perVector = lanes / h_dist;
        const int blocks    = height / lanes;

        AgenWrapper wrapper;
        wrapper.size = sizeof(DataType);

        // Input stream.
        wrapper.n1 = perVector;
        wrapper.n2 = blocks;
        wrapper.n3 = h_dist;
        wrapper.s1 = h_dist;
        wrapper.s2 = lanes * lofst;
        wrapper.s3 = 1;

        agen reader      = init((DVTYPE *)from);
        reader.lane_ofst = lofst / lanes;
        INIT_AGEN3(reader, wrapper);
        *(*cfgs)++ = extract_agen_cfg(reader);

        // Output stream: identical loop nest on the destination buffer.
        wrapper.n1 = perVector;
        wrapper.n2 = blocks;
        wrapper.n3 = h_dist;
        wrapper.s1 = h_dist;
        wrapper.s2 = lanes * lofst;
        wrapper.s3 = 1;

        agen writer      = init((DVTYPE *)to);
        writer.lane_ofst = lofst / lanes;
        INIT_AGEN3(writer, wrapper);
        *(*cfgs)++ = extract_agen_cfg(writer);
    }

    // Emits the agen configurations for a full transposed merge, reducing
    // h_dist stage by stage until it reaches zero. Each stage picks a radix
    // from the current h_dist:
    //   h_dist > 8 or h_dist == 4  -> 8-way (h_dist shrinks by 3 bits)
    //   h_dist == 8 or h_dist == 2 -> 4-way (h_dist shrinks by 2 bits)
    //   otherwise                  -> 2-way (h_dist shrinks by 1 bit)
    // The first stage uses the "reverse" configuration, all later stages the
    // plain one. The distance handed to the stage is h_dist divided by half
    // the radix.
    //
    // ping, pong, it, cfgs, lofst, height are forwarded unchanged to the
    // per-stage init helpers.
    void bitonic_merge_transpose_init(DataType *ping, DataType *pong, int *it, AgenCFG **cfgs, int lofst, int height,
                                      int h_dist)
    {
        bool firstStage = true;

        while (h_dist > 0)
        {
            // log2 of the merge radix for this stage.
            int radixLog2 = 1;
            if (h_dist > 8 || h_dist == 4)
            {
                radixLog2 = 3;
            }
            else if (h_dist == 8 || h_dist == 2)
            {
                radixLog2 = 2;
            }

            const int stageDist = h_dist >> (radixLog2 - 1);

            if (firstStage)
            {
                bitonic_merge_transpose_n_way_reverse_init(ping, pong, it, cfgs, lofst, height, stageDist,
                                                           1 << radixLog2);
            }
            else
            {
                bitonic_merge_transpose_n_way_init(ping, pong, it, cfgs, lofst, height, stageDist);
            }

            h_dist >>= radixLog2;
            firstStage = false;
        }
    }

    // Prepares the three agen configurations (forward input, backward input,
    // output) for the first, "reverse" stage of an n-way merge across lines
    // and appends them to the configuration list.
    //
    // ping, pong - the two working buffers; the source is `ping` when *it is
    //              odd and `pong` when it is even, the destination is the
    //              other one
    // it         - pass counter; incremented by one
    // cfgs       - cursor into the configuration table; three entries are
    //              written and the cursor is advanced by three
    // lofst      - line stride
    // height     - number of lines covered by the pass
    // v_dist     - vertical distance (in lines) between vectors that are merged
    // n          - merge radix (number of vectors merged together)
    void bitonic_merge_n_way_reverse_init(DataType *ping, DataType *pong, int *it, AgenCFG **cfgs, int lofst,
                                          int height, int v_dist, int n)
    {
        // Pick source and destination from the pass parity, then count the pass.
        const bool oddPass = ((*it) & 1) != 0;
        DataType *from     = oddPass ? ping : pong;
        DataType *to       = oddPass ? pong : ping;
        (*it)++;

        AgenWrapper wrapper;
        wrapper.size = sizeof(DataType);

        // Forward input stream, starting at the buffer base.
        wrapper.n1 = n / 2;
        wrapper.n2 = height / v_dist / n;
        wrapper.n3 = v_dist;
        wrapper.s1 = v_dist * lofst;
        wrapper.s2 = n * v_dist * lofst;
        wrapper.s3 = lofst;

        agen fwd = init((DVTYPE *)from);
        INIT_AGEN3(fwd, wrapper);
        *(*cfgs)++ = extract_agen_cfg(fwd);

        // Backward input stream: starts on the last line of the first group
        // and uses negated s1 and s3; loop counts are unchanged.
        wrapper.s1 = -v_dist * lofst;
        wrapper.s2 = n * v_dist * lofst;
        wrapper.s3 = -lofst;

        agen bwd = init((DVTYPE *)(from + (n * v_dist - 1) * lofst));
        INIT_AGEN3(bwd, wrapper);
        *(*cfgs)++ = extract_agen_cfg(bwd);

        // Output stream: two nested loops over the destination buffer.
        wrapper.n1 = height / v_dist;
        wrapper.n2 = v_dist;
        wrapper.s1 = v_dist * lofst;
        wrapper.s2 = lofst;

        agen sink = init((DVTYPE *)to);
        INIT_AGEN2(sink, wrapper);
        *(*cfgs)++ = extract_agen_cfg(sink);
    }

    // Prepares the two agen configurations for a merge stage whose output is
    // written with a different layout than its input, and appends them to the
    // configuration list. Unlike the other init helpers, the buffers are used
    // exactly as passed (ping is the source, pong the destination); the pass
    // counter is still incremented.
    //
    // ping   - source buffer
    // pong   - destination buffer
    // it     - pass counter; incremented by one
    // cfgs   - cursor into the configuration table; two entries are written
    //          and the cursor is advanced by two
    // lofst  - line stride of the source
    // height - number of lines covered by the pass
    // v_dist - vertical distance (in lines) between vectors that are merged
    void bitonic_merge_n_way_transpose_init(DataType *ping, DataType *pong, int *it, AgenCFG **cfgs, int lofst,
                                            int height, int v_dist)
    {
        (*it)++;

        int lanes        = pva_elementsof(DVTYPEX);
        const int groups = height / v_dist;

        AgenWrapper wrapper;

        // Input stream: line-strided walk over the source buffer.
        wrapper.size = sizeof(DataType);
        wrapper.n1   = groups;
        wrapper.n2   = v_dist;
        wrapper.s1   = v_dist * lofst;
        wrapper.s2   = lofst;

        agen reader = init((DVTYPE *)ping);
        INIT_AGEN2(reader, wrapper);
        *(*cfgs)++ = extract_agen_cfg(reader);

        // Output stream: element-strided walk over the destination buffer.
        wrapper.size = sizeof(DataType);
        wrapper.n1   = groups;
        wrapper.n2   = v_dist;
        wrapper.s1   = v_dist;
        wrapper.s2   = 1;

        agen writer = init((DVTYPE *)pong);
        INIT_AGEN2(writer, wrapper);
        // The lane offset is applied after INIT_AGEN2, as in the original
        // sequence.
        writer.lane_ofst = height / lanes;
        *(*cfgs)++       = extract_agen_cfg(writer);
    }

    // Prepares the two agen configurations (input, output) for a
    // non-reversed stage of an n-way merge across lines and appends them to
    // the configuration list. Both streams use the same two-level loop nest;
    // only the buffer differs.
    //
    // ping, pong - the two working buffers; the source is `ping` when *it is
    //              odd and `pong` when it is even, the destination is the
    //              other one
    // it         - pass counter; incremented by one
    // cfgs       - cursor into the configuration table; two entries are
    //              written and the cursor is advanced by two
    // lofst      - line stride
    // height     - number of lines covered by the pass
    // v_dist     - vertical distance (in lines) between vectors that are merged
    void bitonic_merge_n_way_init(DataType *ping, DataType *pong, int *it, AgenCFG **cfgs, int lofst, int height,
                                  int v_dist)
    {
        // Pick source and destination from the pass parity, then count the pass.
        const bool oddPass = ((*it) & 1) != 0;
        DataType *from     = oddPass ? ping : pong;
        DataType *to       = oddPass ? pong : ping;
        (*it)++;

        const int groups = height / v_dist;

        AgenWrapper wrapper;

        // Input stream.
        wrapper.size = sizeof(DataType);
        wrapper.n1   = groups;
        wrapper.n2   = v_dist;
        wrapper.s1   = v_dist * lofst;
        wrapper.s2   = lofst;

        agen reader = init((DVTYPE *)from);
        INIT_AGEN2(reader, wrapper);
        *(*cfgs)++ = extract_agen_cfg(reader);

        // Output stream: identical loop nest on the destination buffer.
        wrapper.size = sizeof(DataType);
        wrapper.n1   = groups;
        wrapper.n2   = v_dist;
        wrapper.s1   = v_dist * lofst;
        wrapper.s2   = lofst;

        agen writer = init((DVTYPE *)to);
        INIT_AGEN2(writer, wrapper);
        *(*cfgs)++ = extract_agen_cfg(writer);
    }

    // Emits the agen configurations for a full merge across lines, reducing
    // v_dist stage by stage until it reaches zero. Each stage is 8-way when
    // v_dist > 8 or v_dist == 4 (v_dist shrinks by 3 bits) and 4-way
    // otherwise (v_dist shrinks by 2 bits); the distance handed to the stage
    // is v_dist divided by half the radix.
    //
    // ping, pong - the two working buffers
    // output     - destination of the final stage when `last` is set
    // it, cfgs, lofst, height - forwarded unchanged to the per-stage helpers
    // v_dist     - starting vertical merge distance
    // reverse    - non-zero: the first stage uses the "reverse" configuration;
    //              every later stage is non-reversed
    // last       - non-zero: the final non-reversed stage (the one after which
    //              v_dist becomes zero) writes to `output` through
    //              bitonic_merge_n_way_transpose_init
    //
    // NOTE(review): with v_dist == 1 the 4-way branch passes a distance of 0
    // to the stage helper, which divides by it - presumably callers never
    // pass 1; confirm.
    void bitonic_merge_init(DataType *ping, DataType *pong, DataType *output, int *it, AgenCFG **cfgs, int lofst,
                            int height, int v_dist, int reverse, int last)
    {
        while (v_dist > 0)
        {
            // log2 of the merge radix for this stage.
            const int radixLog2 = (v_dist > 8 || v_dist == 4) ? 3 : 2;
            const int stageDist = v_dist >> (radixLog2 - 1);
            const int remaining = v_dist >> radixLog2;

            if (reverse)
            {
                bitonic_merge_n_way_reverse_init(ping, pong, it, cfgs, lofst, height, stageDist, 1 << radixLog2);
            }
            else if (last && remaining == 0)
            {
                // Final stage: read from whichever buffer holds the current
                // data and write straight to the caller's output buffer.
                DataType *src = ((*it) & 1) ? ping : pong;
                bitonic_merge_n_way_transpose_init(src, output, it, cfgs, lofst, height, stageDist);
            }
            else
            {
                bitonic_merge_n_way_init(ping, pong, it, cfgs, lofst, height, stageDist);
            }

            v_dist  = remaining;
            reverse = 0;
        }
    }

    // Runs the fixed stage schedule that Execute() selects when Size / vector width == 512.
    //
    // cfgs    : first AgenCFG of the schedule. Every stage is handed the next consecutive
    //           entries: two for a plain stage, three for a *_reverse stage.
    // pmt_ptr : forwarded unchanged to the bitonic_merge_transpose_*_reverse stages.
    //
    // NOTE(review): the stage order presumably has to match the order in which Init()
    // populated cfgs; the *_init helpers are not visible here, so confirm before editing.
    void sort_height_512(AgenCFG *cfgs, PMT_VTYPEX *pmt_ptr)
    {
        constexpr bool kIs16Bit = std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>;

        int height   = 512;
        AgenCFG *cur = cfgs; // next configuration entry that has not been consumed yet

        even_odd_merge_sort(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_4_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_2_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        // 16-bit element types run one additional group of stages before the final stage.
        if constexpr (kIs16Bit)
        {
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;
        }

        // Last stage of the schedule: nothing follows, so the cursor is not advanced again.
        bitonic_merge_8_way_transp(&cur[0], &cur[1], height);
    }

    // Runs the fixed stage schedule that Execute() selects when Size / vector width == 256.
    //
    // cfgs    : first AgenCFG of the schedule. Every stage is handed the next consecutive
    //           entries: two for a plain stage, three for a *_reverse stage.
    // pmt_ptr : forwarded unchanged to the bitonic_merge_transpose_*_reverse stages.
    //
    // NOTE(review): the stage order presumably has to match the order in which Init()
    // populated cfgs; the *_init helpers are not visible here, so confirm before editing.
    void sort_height_256(AgenCFG *cfgs, PMT_VTYPEX *pmt_ptr)
    {
        constexpr bool kIs16Bit = std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>;

        int height   = 256;
        AgenCFG *cur = cfgs; // next configuration entry that has not been consumed yet

        even_odd_merge_sort(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_4_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_2_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        // 16-bit element types run one additional group of stages before the final stage.
        if constexpr (kIs16Bit)
        {
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;
        }

        // Last stage of the schedule: nothing follows, so the cursor is not advanced again.
        bitonic_merge_4_way_transp(&cur[0], &cur[1], height);
    }

    // Runs the fixed stage schedule that Execute() selects when Size / vector width == 128.
    //
    // cfgs    : first AgenCFG of the schedule. Every stage is handed the next consecutive
    //           entries: two for a plain stage, three for a *_reverse stage.
    // pmt_ptr : forwarded unchanged to the bitonic_merge_transpose_*_reverse stages.
    //
    // NOTE(review): the stage order presumably has to match the order in which Init()
    // populated cfgs; the *_init helpers are not visible here, so confirm before editing.
    void sort_height_128(AgenCFG *cfgs, PMT_VTYPEX *pmt_ptr)
    {
        constexpr bool kIs16Bit = std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>;

        int height   = 128;
        AgenCFG *cur = cfgs; // next configuration entry that has not been consumed yet

        even_odd_merge_sort(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_4_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_2_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        // 16-bit element types run one additional group of stages before the final stage.
        if constexpr (kIs16Bit)
        {
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;
        }

        // Last stage of the schedule: nothing follows, so the cursor is not advanced again.
        bitonic_merge_4_way_transp(&cur[0], &cur[1], height);
    }

    // Runs the fixed stage schedule that Execute() selects when Size / vector width == 64.
    //
    // cfgs    : first AgenCFG of the schedule. Every stage is handed the next consecutive
    //           entries: two for a plain stage, three for a *_reverse stage.
    // pmt_ptr : forwarded unchanged to the bitonic_merge_transpose_*_reverse stages.
    //
    // NOTE(review): the stage order presumably has to match the order in which Init()
    // populated cfgs; the *_init helpers are not visible here, so confirm before editing.
    void sort_height_64(AgenCFG *cfgs, PMT_VTYPEX *pmt_ptr)
    {
        constexpr bool kIs16Bit = std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>;

        int height   = 64;
        AgenCFG *cur = cfgs; // next configuration entry that has not been consumed yet

        even_odd_merge_sort(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_4_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_2_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        // 16-bit element types run one additional group of stages before the final stage.
        if constexpr (kIs16Bit)
        {
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;
        }

        // Last stage of the schedule: nothing follows, so the cursor is not advanced again.
        bitonic_merge_8_way_transp(&cur[0], &cur[1], height);
    }

    // Runs the fixed stage schedule that Execute() selects when Size / vector width == 32.
    //
    // cfgs    : first AgenCFG of the schedule. Every stage is handed the next consecutive
    //           entries: two for a plain stage, three for a *_reverse stage.
    // pmt_ptr : forwarded unchanged to the bitonic_merge_transpose_*_reverse stages.
    //
    // NOTE(review): the stage order presumably has to match the order in which Init()
    // populated cfgs; the *_init helpers are not visible here, so confirm before editing.
    void sort_height_32(AgenCFG *cfgs, PMT_VTYPEX *pmt_ptr)
    {
        constexpr bool kIs16Bit = std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>;

        int height   = 32;
        AgenCFG *cur = cfgs; // next configuration entry that has not been consumed yet

        even_odd_merge_sort(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_4_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_8_way_reverse(&cur[0], &cur[1], &cur[2], height);
        cur += 3;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_2_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_4_way(&cur[0], &cur[1], height);
        cur += 2;

        bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
        cur += 3;
        bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
        cur += 2;
        bitonic_merge_8_way(&cur[0], &cur[1], height);
        cur += 2;

        // 16-bit element types run one additional group of stages before the final stage.
        if constexpr (kIs16Bit)
        {
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_8_way(&cur[0], &cur[1], height);
            cur += 2;
        }

        // Last stage of the schedule: nothing follows, so the cursor is not advanced again.
        bitonic_merge_4_way_transp(&cur[0], &cur[1], height);
    }

    // Runs the fixed stage schedule that Execute() selects when Size / vector width == 16.
    //
    // The whole schedule is compiled only for 32-bit element types; for 16-bit element
    // types this function has an empty body.
    //
    // cfgs    : first AgenCFG of the schedule. Every stage is handed the next consecutive
    //           entries: two for a plain stage, three for a *_reverse stage.
    // pmt_ptr : forwarded unchanged to the bitonic_merge_transpose_*_reverse stages.
    //
    // NOTE(review): the stage order presumably has to match the order in which Init()
    // populated cfgs; the *_init helpers are not visible here, so confirm before editing.
    void sort_height_16(AgenCFG *cfgs, PMT_VTYPEX *pmt_ptr)
    {
        constexpr bool kIs32Bit = std::is_same_v<DataType, int32_t> || std::is_same_v<DataType, uint32_t>;

        if constexpr (kIs32Bit)
        {
            int height   = 16;
            AgenCFG *cur = cfgs; // next configuration entry that has not been consumed yet

            even_odd_merge_sort(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_4_way_reverse(&cur[0], &cur[1], &cur[2], height);
            cur += 3;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_2_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_8_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;

            bitonic_merge_transpose_4_way_reverse(&cur[0], &cur[1], &cur[2], height, pmt_ptr);
            cur += 3;
            bitonic_merge_transpose_4_way(&cur[0], &cur[1], height);
            cur += 2;
            bitonic_merge_4_way(&cur[0], &cur[1], height);
            cur += 2;

            // Last stage of the schedule: nothing follows, so the cursor is not advanced again.
            bitonic_merge_4_way_transp(&cur[0], &cur[1], height);
        }
    }

public:
    // Constructor: performs no runtime work. Its only job is to reject unsupported
    // <DataType, Size> combinations at compile time:
    //   - Size must be one of 256, 512, 1024, 2048, 4096, 8192, 16384;
    //   - Size 256 and 512 require a 32-bit DataType;
    //   - Size 16384 requires a 16-bit DataType.
    SortVpu()
    {
        constexpr bool kIs16Bit = std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>;
        constexpr bool kIs32Bit = std::is_same_v<DataType, int32_t> || std::is_same_v<DataType, uint32_t>;

        static_assert(
            Size == 256 || Size == 512 || Size == 1024 || Size == 2048 || Size == 4096 || Size == 8192 || Size == 16384,
            "Unsupported size value for SortVpu template instantiation.");

        // Each check below is written as an implication: "Size is not X, or the type is allowed".
        static_assert(Size != 256 || kIs32Bit,
                      "SortVpu<DataType, 256> only supports 32-bit DataType (int32_t or uint32_t).");

        static_assert(Size != 512 || kIs32Bit,
                      "SortVpu<DataType, 512> only supports 32-bit DataType (int32_t or uint32_t).");

        static_assert(Size != 16384 || kIs16Bit,
                      "SortVpu<DataType, 16384> only supports 16-bit DataType (int16_t or uint16_t).");
    }

    // Prepares context->cfgs for a later Execute() call.
    //
    // src     : input buffer handed to every *_init helper.
    // dst     : output buffer handed to bitonic_merge_init.
    // context : supplies the AgenCFG array that is filled in and the scratch area that is
    //           used as the intermediate buffer.
    //
    // NOTE(review): the *_init helpers receive the address of the local cfg pointer and of
    // the local counter, so they presumably advance both as they emit configurations;
    // their bodies are not visible here, so confirm.
    void Init(DataType *src, DataType *dst, SortContext *context)
    {
        AgenCFG *cfgCursor = context->cfgs;
        DataType *scratch  = reinterpret_cast<DataType *>(context->scratch);

        int vecw   = pva_elementsof(DVTYPEX);
        int lofst  = vecw + 1; // one element more than the vector width
        int height = Size / vecw;
        int iter   = 0;

        even_odd_merge_sort_init(src, scratch, &cfgCursor, lofst, height);

        // Distances 8, 16, 32, ... for as long as they stay below height.
        for (int v_dist = 8; v_dist < height; v_dist <<= 1)
        {
            bitonic_merge_init(src, scratch, dst, &iter, &cfgCursor, lofst, height, v_dist, 1, 0);
        }

        // Distances 1, 2, 4, ... below the vector width; the final flag is set only on the
        // last iteration (h_dist == vecw / 2).
        for (int h_dist = 1; h_dist < vecw; h_dist <<= 1)
        {
            bitonic_merge_transpose_init(src, scratch, &iter, &cfgCursor, lofst, height, h_dist);
            bitonic_merge_init(src, scratch, dst, &iter, &cfgCursor, lofst, height, height / 2, 0,
                               h_dist == vecw / 2);
        }
    }

    // Runs the sort using the configurations stored in context->cfgs.
    //
    // The schedule is chosen at compile time from Size / vector width; heights other than
    // 512, 256, 128, 64, 32 and 16 execute nothing.
    //
    // NOTE(review): PMT_LOAD() is defined outside this chunk; its result is only passed by
    // address to the selected schedule.
    void Execute(SortContext *context)
    {
        constexpr int kHeight = Size / pva_elementsof(DVTYPEX);

        AgenCFG *cfgs       = context->cfgs;
        PMT_VTYPEX pmtValue = PMT_LOAD();

        if constexpr (kHeight == 512)
        {
            sort_height_512(cfgs, &pmtValue);
        }
        else if constexpr (kHeight == 256)
        {
            sort_height_256(cfgs, &pmtValue);
        }
        else if constexpr (kHeight == 128)
        {
            sort_height_128(cfgs, &pmtValue);
        }
        else if constexpr (kHeight == 64)
        {
            sort_height_64(cfgs, &pmtValue);
        }
        else if constexpr (kHeight == 32)
        {
            sort_height_32(cfgs, &pmtValue);
        }
        else if constexpr (kHeight == 16)
        {
            sort_height_16(cfgs, &pmtValue);
        }
    }

    // Minimum input buffer size in bytes:
    // (Size / vector width) * (vector width + 1) elements of DataType.
    static constexpr size_t MIN_INPUT_BUFFER_SIZE =
        sizeof(DataType) * (Size / pva_elementsof(DVTYPEX)) * (pva_elementsof(DVTYPEX) + 1);
    // Minimum output buffer size in bytes:
    // (Size / vector width + 1) * (vector width) elements of DataType.
    static constexpr size_t MIN_OUTPUT_BUFFER_SIZE =
        sizeof(DataType) * pva_elementsof(DVTYPEX) * (Size / pva_elementsof(DVTYPEX) + 1);
};
} // namespace pvaApl

#endif /* PVA_APL_SORT_VPU_HPP */