// pvaAplConv2dVpu.hpp
// Fully qualified name: public/src/primitive/pvaAplConv2dVpu.hpp
// File members: public/src/primitive/pvaAplConv2dVpu.hpp
/*
* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#ifndef PVA_APL_CONV2D_VPU_HPP
#define PVA_APL_CONV2D_VPU_HPP
#include <cupva_device.h>
#include <limits>
#include <type_traits>
namespace pvaApl {
template<int32_t KernelSize, typename DataType, typename KernelType>
class Conv2dVpu
{
private:
// Compile-time guard: the sample type and the coefficient type must occupy the same number of bytes.
static_assert(sizeof(DataType) == sizeof(KernelType), "DataType and KernelType must have the same size.");
template<typename AgenType>
auto dvload_perm(AgenType &ag, vcharx vx) -> decltype(auto)
{
if constexpr (std::is_same_v<DataType, uint8_t>)
{
return static_cast<dvcharx>(dvuchar_load_perm(ag, vx));
}
else if constexpr (std::is_same_v<DataType, int8_t>)
{
return static_cast<dvcharx>(dvchar_load_perm(ag, vx));
}
else if constexpr (std::is_same_v<DataType, uint16_t>)
{
return static_cast<dvshortx>(dvushort_load_perm(ag, vx));
}
else if constexpr (std::is_same_v<DataType, int16_t>)
{
return static_cast<dvshortx>(dvshort_load_perm(ag, vx));
}
else
{
static_assert(!std::is_same_v<DataType, DataType>, "Unsupported type for dvload_perm");
}
}
template<typename AgenType>
auto dvload_perm_coef(AgenType &ag, vcharx vx) -> decltype(auto)
{
if constexpr (std::is_same_v<KernelType, uint8_t>)
{
return static_cast<dvcharx>(dvuchar_load_perm(ag, vx));
}
else if constexpr (std::is_same_v<KernelType, int8_t>)
{
return static_cast<dvcharx>(dvchar_load_perm(ag, vx));
}
else if constexpr (std::is_same_v<KernelType, uint16_t>)
{
return static_cast<dvshortx>(dvushort_load_perm(ag, vx));
}
else if constexpr (std::is_same_v<KernelType, int16_t>)
{
return static_cast<dvshortx>(dvshort_load_perm(ag, vx));
}
else
{
static_assert(!std::is_same_v<KernelType, KernelType>, "Unsupported type for dvload_perm_coef");
}
}
template<typename AgenType>
auto dvload(AgenType &ag) -> decltype(auto)
{
if constexpr (std::is_same_v<DataType, uint8_t>)
{
return static_cast<dvcharx>(dvuchar_load(ag));
}
else if constexpr (std::is_same_v<DataType, int8_t>)
{
return static_cast<dvcharx>(dvchar_load(ag));
}
else if constexpr (std::is_same_v<DataType, uint16_t>)
{
return static_cast<dvshortx>(dvushort_load(ag));
}
else if constexpr (std::is_same_v<DataType, int16_t>)
{
return static_cast<dvshortx>(dvshort_load(ag));
}
else
{
static_assert(!std::is_same_v<DataType, DataType>, "Unsupported type for dvload");
}
}
// Address-generator configurations captured by Init():
//   [0] src_even load, [1] src_odd load, [2] coefficient load, [3] destination store.
AgenCFG m_cfgs[4];
// Loop trip count computed by Init() as niter_tw * niter_th.
int32_t m_niter;
public:
// Rebinds the stored source and destination agen configurations to new base pointers.
//
//   src_even : new base for the config in m_cfgs[0].
//   src_odd  : new base for the config in m_cfgs[1].
//   dst      : new base for the config in m_cfgs[3].
//
// The coefficient config in m_cfgs[2] is not modified.
void Update(DataType *src_even, DataType *src_odd, DataType *dst)
{
    AgenCFG *const cfgSrcEven = &m_cfgs[0];
    AgenCFG *const cfgSrcOdd  = &m_cfgs[1];
    AgenCFG *const cfgDst     = &m_cfgs[3];

    cupvaModifyAgenCfgBase(cfgSrcEven, src_even);
    cupvaModifyAgenCfgBase(cfgSrcOdd, src_odd);
    cupvaModifyAgenCfgBase(cfgDst, dst);
}
// Builds the four address-generator (agen) configurations and the loop trip count for the
// KernelSize / DataType this class was instantiated with, and stores them in m_cfgs[] and
// m_niter.
//
// Parameters (as used by the code below):
//   src_even_tp, src_odd_tp : base pointers handed to init() for the two source load agens.
//   src_even_lp, src_odd_lp : row-step stride of the respective source agen
//                             (presumably the line pitch in elements - TODO confirm).
//   knl_tp, knl_lp          : base pointer of the coefficient load agen and the stride used
//                             when it steps across niter_kh.
//   dst_tp, dst_lp          : base pointer and row-step stride of the store agen.
//   qbits                   : written to the store agen's `round` field.
//   tw, th                  : sizes from which the horizontal / vertical iteration counts are
//                             derived (presumably tile width / height - TODO confirm).
//   cb_even, cb_odd, cb_len : passed to update_agen_cb_start() / update_agen_cb_size() for the
//                             two source agens; cb_len is scaled by the element size
//                             (presumably a circular-buffer start and length - TODO confirm).
//
// m_cfgs layout on exit: [0] src_even load, [1] src_odd load, [2] coefficients, [3] destination.
// Unsupported KernelSize / DataType combinations fail to compile via the trailing static_assert.
void Init(DataType *src_even_tp, int32_t src_even_lp, DataType *src_odd_tp, int32_t src_odd_lp, KernelType *knl_tp,
int32_t knl_lp, DataType *restrict dst_tp, int32_t dst_lp, int32_t qbits, int32_t tw, int32_t th,
DataType *cb_even, DataType *cb_odd, int32_t cb_len)
{
// Saturation limits for the store agen are the numeric range of DataType.
int low_bound = std::numeric_limits<DataType>::min();
int high_bound = std::numeric_limits<DataType>::max();
// Values written to the store agen's sat_opt field: 2 for signed DataType, 3 for unsigned.
constexpr int SIGNED_SATURATION = 2;
constexpr int UNSIGNED_SATURATION = 3;
int sat_opt = std::is_signed_v<DataType> ? SIGNED_SATURATION : UNSIGNED_SATURATION;
if constexpr (KernelSize == 3 && (std::is_same_v<DataType, int8_t> || std::is_same_v<DataType, uint8_t>))
{
// 3x3 Convolution for 8-bit input
// ------------------------------------------------ //
// IMPORTANT: //
// it is caller's job to make the srcs and dst buf's//
// height round up to multiple of 6 //
// ------------------------------------------------ //
// Horizontal count is ceil(tw / 64) for tw >= 1; vertical count is ceil((th / 2) / 3)
// using integer division, for th / 2 >= 1.
int niter_tw = (tw - 1) / 64 + 1;
int niter_th = (((th / 2) - 1) / 3) + 1;
// NOTE(review): niter_kw is not referenced anywhere in this branch.
int niter_kw = 1;
int niter_kh = 2;
AgenWrapper w0, w1, w2, w3;
// Coefficient load agen: 1-D, niter_kh steps of knl_lp.
agen agen_ld_coef = init((dvchar *)knl_tp);
w0.size = sizeof(char);
w0.n1 = niter_kh;
w0.s1 = knl_lp;
INIT_AGEN1(agen_ld_coef, w0);
// src_even load agen: 4-D. 2 steps of 32, 4 steps of src_even_lp, then niter_tw steps of 64
// and niter_th steps of 3 * src_even_lp.
agen agen_ld_src_even = init((dvchar *)src_even_tp);
w1.size = sizeof(char);
w1.n1 = 2;
w1.n2 = 4;
w1.n3 = niter_tw;
w1.n4 = niter_th;
w1.s1 = 32;
w1.s2 = src_even_lp;
w1.s3 = 64;
w1.s4 = src_even_lp * 3;
INIT_AGEN4(agen_ld_src_even, w1);
agen_ld_src_even = update_agen_cb_start(agen_ld_src_even, (intptr_t)cb_even);
agen_ld_src_even = update_agen_cb_size(agen_ld_src_even, cb_len * sizeof(char));
// src_odd load agen: same shape as the src_even agen, using src_odd_lp / cb_odd.
agen agen_ld_src_odd = init((dvchar *)src_odd_tp);
w2.size = sizeof(char);
w2.n1 = 2;
w2.n2 = 4;
w2.n3 = niter_tw;
w2.n4 = niter_th;
w2.s1 = 32;
w2.s2 = src_odd_lp;
w2.s3 = 64;
w2.s4 = src_odd_lp * 3;
INIT_AGEN4(agen_ld_src_odd, w2);
agen_ld_src_odd = update_agen_cb_start(agen_ld_src_odd, (intptr_t)cb_odd);
agen_ld_src_odd = update_agen_cb_size(agen_ld_src_odd, cb_len * sizeof(char));
// Store agen: 3-D. 6 steps of dst_lp, then niter_tw steps of 64 and niter_th steps of 6 * dst_lp.
agen agen_st_dst = init((dvchar *)dst_tp);
w3.size = sizeof(char);
w3.n1 = 6;
w3.n2 = niter_tw;
w3.n3 = niter_th;
w3.s1 = dst_lp;
w3.s2 = 64;
w3.s3 = dst_lp * 6;
INIT_AGEN3(agen_st_dst, w3);
// Rounding and saturation applied by the store agen.
agen_st_dst.round = qbits;
agen_st_dst.sat_lim_lo = low_bound;
agen_st_dst.sat_val_lo = low_bound;
agen_st_dst.sat_lim_hi = high_bound;
agen_st_dst.sat_val_hi = high_bound;
agen_st_dst.sat_opt = sat_opt;
// Persist the configurations and the total trip count.
m_cfgs[0] = extract_agen_cfg(agen_ld_src_even);
m_cfgs[1] = extract_agen_cfg(agen_ld_src_odd);
m_cfgs[2] = extract_agen_cfg(agen_ld_coef);
m_cfgs[3] = extract_agen_cfg(agen_st_dst);
m_niter = niter_tw * niter_th;
}
else if constexpr (KernelSize == 3 && (std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>))
{
// 3x3 Convolution for 16-bit input
// vecw is the element count of dvshortx; it is the horizontal step per iteration.
int vecw = chess_elementsof(dvshortx);
// Horizontal count is ceil(tw / vecw) for tw >= 1; vertical count is ceil((th / 2) / 2)
// using integer division, for th / 2 >= 1.
int niter_tw = (tw - 1) / vecw + 1;
int niter_th = (((th / 2) - 1) / 2) + 1;
int niter_kw = 2;
int niter_kh = 2;
AgenWrapper w0, w1, w2, w3;
// Coefficient load agen: 2-D, niter_kw steps of 4 then niter_kh steps of knl_lp.
agen agen_ld_coef = init((dvshort *)knl_tp);
w0.size = sizeof(short);
w0.n1 = niter_kw;
w0.n2 = niter_kh;
w0.s1 = 4;
w0.s2 = knl_lp;
INIT_AGEN2(agen_ld_coef, w0);
// src_even load agen: 4-D. 2 steps of 4, 3 steps of src_even_lp, then niter_tw steps of vecw
// and niter_th steps of 2 * src_even_lp.
agen agen_ld_src_even = init((dvshort *)src_even_tp);
w1.size = sizeof(short);
w1.n1 = 2;
w1.n2 = 3;
w1.n3 = niter_tw;
w1.n4 = niter_th;
w1.s1 = 4;
w1.s2 = src_even_lp;
w1.s3 = vecw;
w1.s4 = src_even_lp * 2;
INIT_AGEN4(agen_ld_src_even, w1);
agen_ld_src_even = update_agen_cb_start(agen_ld_src_even, (intptr_t)cb_even);
agen_ld_src_even = update_agen_cb_size(agen_ld_src_even, cb_len * sizeof(short));
// src_odd load agen: same shape as the src_even agen, using src_odd_lp / cb_odd.
agen agen_ld_src_odd = init((dvshort *)src_odd_tp);
w2.size = sizeof(short);
w2.n1 = 2;
w2.n2 = 3;
w2.n3 = niter_tw;
w2.n4 = niter_th;
w2.s1 = 4;
w2.s2 = src_odd_lp;
w2.s3 = vecw;
w2.s4 = src_odd_lp * 2;
INIT_AGEN4(agen_ld_src_odd, w2);
agen_ld_src_odd = update_agen_cb_start(agen_ld_src_odd, (intptr_t)cb_odd);
agen_ld_src_odd = update_agen_cb_size(agen_ld_src_odd, cb_len * sizeof(short));
// Store agen: 3-D. 4 steps of dst_lp, then niter_tw steps of vecw and niter_th steps of 4 * dst_lp.
agen agen_st_dst = init((dvshort *restrict)dst_tp);
w3.size = sizeof(short);
w3.n1 = 4;
w3.n2 = niter_tw;
w3.n3 = niter_th;
w3.s1 = dst_lp;
w3.s2 = vecw;
w3.s3 = dst_lp * 4;
INIT_AGEN3(agen_st_dst, w3);
// Rounding and saturation applied by the store agen.
agen_st_dst.round = qbits;
agen_st_dst.sat_lim_lo = low_bound;
agen_st_dst.sat_val_lo = low_bound;
agen_st_dst.sat_lim_hi = high_bound;
agen_st_dst.sat_val_hi = high_bound;
agen_st_dst.sat_opt = sat_opt;
// Persist the configurations and the total trip count.
m_cfgs[0] = extract_agen_cfg(agen_ld_src_even);
m_cfgs[1] = extract_agen_cfg(agen_ld_src_odd);
m_cfgs[2] = extract_agen_cfg(agen_ld_coef);
m_cfgs[3] = extract_agen_cfg(agen_st_dst);
m_niter = niter_tw * niter_th;
}
else if constexpr (KernelSize == 5 && (std::is_same_v<DataType, int8_t> || std::is_same_v<DataType, uint8_t>))
{
// 5x5 Convolution for 8-bit input
// Horizontal count is ceil(tw / 32) for tw >= 1; vertical count is ceil(th / 8) for th >= 1.
int niter_tw = (tw - 1) / 32 + 1;
int niter_th = (th - 1) / 8 + 1;
int niter_kw = 2;
int niter_kh = 3;
AgenWrapper w0, w1, w2, w3;
// Coefficient load agen: 2-D, niter_kw steps of 4 then niter_kh steps of knl_lp.
agen agen_ld_coef = init((dvchar *)knl_tp);
w0.size = sizeof(char);
w0.n1 = niter_kw;
w0.n2 = niter_kh;
w0.s1 = 4; // go to next horizontal 4 taps
w0.s2 = knl_lp; // lp = 4 * KWA
INIT_AGEN2(agen_ld_coef, w0);
// src_even load agen: 4-D. 6 steps of src_even_lp, 2 steps of 4, then niter_tw steps of 32
// and niter_th steps of 4 * src_even_lp.
agen agen_ld_src_even = init((dvchar *)src_even_tp);
w1.size = sizeof(char);
w1.n1 = 6;
w1.n2 = 2; // 5x5 kernel padded to 8x6. 8 / 4 = 2 when horizontally
w1.n3 = niter_tw;
w1.n4 = niter_th;
w1.s1 = src_even_lp;
w1.s2 = 4;
w1.s3 = 32;
w1.s4 = src_even_lp * 4;
INIT_AGEN4(agen_ld_src_even, w1);
agen_ld_src_even = update_agen_cb_start(agen_ld_src_even, (intptr_t)cb_even);
agen_ld_src_even = update_agen_cb_size(agen_ld_src_even, cb_len * sizeof(char));
// src_odd load agen: same shape as the src_even agen, using src_odd_lp / cb_odd.
agen agen_ld_src_odd = init((dvchar *)src_odd_tp);
w2.size = sizeof(char);
w2.n1 = 6;
w2.n2 = 2; // 5x5 kernel padded to 8x6. 8 / 4 = 2 when horizontally
w2.n3 = niter_tw;
w2.n4 = niter_th;
w2.s1 = src_odd_lp;
w2.s2 = 4;
w2.s3 = 32;
w2.s4 = src_odd_lp * 4;
INIT_AGEN4(agen_ld_src_odd, w2);
agen_ld_src_odd = update_agen_cb_start(agen_ld_src_odd, (intptr_t)cb_odd);
agen_ld_src_odd = update_agen_cb_size(agen_ld_src_odd, cb_len * sizeof(char));
// Store agen: 3-D. 8 steps of dst_lp, then niter_tw steps of 32 and niter_th steps of 8 * dst_lp.
agen agen_st_dst = init((dvchar *)dst_tp);
w3.size = sizeof(char);
w3.n1 = 8;
w3.n2 = niter_tw;
w3.n3 = niter_th;
w3.s1 = dst_lp;
w3.s2 = 32;
w3.s3 = dst_lp * 8;
INIT_AGEN3(agen_st_dst, w3);
// Rounding and saturation applied by the store agen.
agen_st_dst.round = qbits;
agen_st_dst.sat_lim_lo = low_bound;
agen_st_dst.sat_val_lo = low_bound;
agen_st_dst.sat_lim_hi = high_bound;
agen_st_dst.sat_val_hi = high_bound;
agen_st_dst.sat_opt = sat_opt;
// Persist the configurations and the total trip count.
m_cfgs[0] = extract_agen_cfg(agen_ld_src_even);
m_cfgs[1] = extract_agen_cfg(agen_ld_src_odd);
m_cfgs[2] = extract_agen_cfg(agen_ld_coef);
m_cfgs[3] = extract_agen_cfg(agen_st_dst);
m_niter = niter_tw * niter_th;
}
else if constexpr (KernelSize == 5 && (std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>))
{
// 5x5 Convolution for 16-bit input
// vecw is the element count of dvshortx; it is the horizontal step per iteration.
int vecw = chess_elementsof(dvshortx);
// Horizontal count is ceil(tw / vecw) for tw >= 1.
int niter_tw = (tw - 1) / vecw + 1;
// NOTE(review): unlike the 8-bit and 3x3 branches this truncates instead of rounding up;
// presumably th is required to be a multiple of 4 here - confirm against callers.
int niter_th = th / 4;
int niter_kw = 4;
int niter_kh = 3;
AgenWrapper w0, w1, w2, w3;
// Coefficient load agen: 2-D, niter_kw steps of 4 then niter_kh steps of knl_lp.
agen agen_ld_coef = init((dvshort *)knl_tp);
w0.size = sizeof(short);
w0.n1 = niter_kw;
w0.n2 = niter_kh;
w0.s1 = 4;
w0.s2 = knl_lp;
INIT_AGEN2(agen_ld_coef, w0);
// src_even load agen: 4-D. 2 steps of 4, 4 steps of src_even_lp, then niter_tw steps of vecw
// and niter_th steps of 2 * src_even_lp.
agen agen_ld_src_even = init((dvshort *)src_even_tp);
w1.size = sizeof(short);
w1.n1 = 2; // 2 for 3x3 or 5x5, 4 pixels apart
w1.n2 = 4; // read 8 rows per iteration, deinterleave into 2 superbanks
w1.n3 = niter_tw;
w1.n4 = niter_th;
w1.s1 = 4;
w1.s2 = src_even_lp;
w1.s3 = vecw;
w1.s4 = src_even_lp * 2;
INIT_AGEN4(agen_ld_src_even, w1);
agen_ld_src_even = update_agen_cb_start(agen_ld_src_even, (intptr_t)cb_even);
agen_ld_src_even = update_agen_cb_size(agen_ld_src_even, cb_len * sizeof(short));
// src_odd load agen: same shape as the src_even agen, using src_odd_lp / cb_odd.
agen agen_ld_src_odd = init((dvshort *)src_odd_tp);
w2.size = sizeof(short);
w2.n1 = 2;
w2.n2 = 4;
w2.n3 = niter_tw;
w2.n4 = niter_th;
w2.s1 = 4;
w2.s2 = src_odd_lp;
w2.s3 = vecw;
w2.s4 = src_odd_lp * 2;
INIT_AGEN4(agen_ld_src_odd, w2);
agen_ld_src_odd = update_agen_cb_start(agen_ld_src_odd, (intptr_t)cb_odd);
agen_ld_src_odd = update_agen_cb_size(agen_ld_src_odd, cb_len * sizeof(short));
// Store agen: 3-D. 4 steps of dst_lp, then niter_tw steps of vecw and niter_th steps of 4 * dst_lp.
agen agen_st_dst = init((dvshort *restrict)dst_tp);
w3.size = sizeof(short);
w3.n1 = 4;
w3.n2 = niter_tw;
w3.n3 = niter_th;
w3.s1 = dst_lp;
w3.s2 = vecw;
w3.s3 = dst_lp * 4;
INIT_AGEN3(agen_st_dst, w3);
// Rounding and saturation applied by the store agen.
agen_st_dst.round = qbits;
agen_st_dst.sat_lim_lo = low_bound;
agen_st_dst.sat_val_lo = low_bound;
agen_st_dst.sat_lim_hi = high_bound;
agen_st_dst.sat_val_hi = high_bound;
agen_st_dst.sat_opt = sat_opt;
// Persist the configurations and the total trip count.
m_cfgs[0] = extract_agen_cfg(agen_ld_src_even);
m_cfgs[1] = extract_agen_cfg(agen_ld_src_odd);
m_cfgs[2] = extract_agen_cfg(agen_ld_coef);
m_cfgs[3] = extract_agen_cfg(agen_st_dst);
m_niter = niter_tw * niter_th;
}
else if constexpr (KernelSize == 7 && (std::is_same_v<DataType, int8_t> || std::is_same_v<DataType, uint8_t>))
{
// 7x7 Convolution for 8-bit input
// Horizontal count is ceil(tw / 32) for tw >= 1; vertical count is ceil(th / 6) for th >= 1.
int niter_tw = (tw - 1) / 32 + 1;
int niter_th = (th - 1) / 6 + 1;
int niter_kw = 2;
int niter_kh = 4;
AgenWrapper w0, w1, w2, w3;
// Coefficient load agen: 2-D, niter_kw steps of 4 then niter_kh steps of knl_lp.
agen agen_ld_coef = init((dvchar *)knl_tp);
w0.size = sizeof(char);
w0.n1 = niter_kw;
w0.n2 = niter_kh;
w0.s1 = 4; // go to next horizontal 4 taps
w0.s2 = knl_lp; // lp = 4 * KWA
INIT_AGEN2(agen_ld_coef, w0);
// src_even load agen: 4-D. 6 steps of src_even_lp, 2 steps of 4, then niter_tw steps of 32
// and niter_th steps of 3 * src_even_lp.
agen agen_ld_src_even = init((dvchar *)src_even_tp);
w1.size = sizeof(char);
w1.n1 = 6;
w1.n2 = 2; // 7x7 kernel padded to 8x8. 8 / 4 = 2 when horizontally
w1.n3 = niter_tw;
w1.n4 = niter_th;
w1.s1 = src_even_lp;
w1.s2 = 4;
w1.s3 = 32;
w1.s4 = src_even_lp * 3;
INIT_AGEN4(agen_ld_src_even, w1);
agen_ld_src_even = update_agen_cb_start(agen_ld_src_even, (intptr_t)cb_even);
agen_ld_src_even = update_agen_cb_size(agen_ld_src_even, cb_len * sizeof(char));
// src_odd load agen: same shape as the src_even agen, using src_odd_lp / cb_odd.
agen agen_ld_src_odd = init((dvchar *)src_odd_tp);
w2.size = sizeof(char);
w2.n1 = 6;
w2.n2 = 2; // 7x7 kernel padded to 8x8. 8 / 4 = 2 when horizontally
w2.n3 = niter_tw;
w2.n4 = niter_th;
w2.s1 = src_odd_lp;
w2.s2 = 4;
w2.s3 = 32;
w2.s4 = src_odd_lp * 3;
INIT_AGEN4(agen_ld_src_odd, w2);
agen_ld_src_odd = update_agen_cb_start(agen_ld_src_odd, (intptr_t)cb_odd);
agen_ld_src_odd = update_agen_cb_size(agen_ld_src_odd, cb_len * sizeof(char));
// Store agen: 3-D. 6 steps of dst_lp, then niter_tw steps of 32 and niter_th steps of 6 * dst_lp.
agen agen_st_dst = init((dvchar *)dst_tp);
w3.size = sizeof(char);
w3.n1 = 6;
w3.n2 = niter_tw;
w3.n3 = niter_th;
w3.s1 = dst_lp;
w3.s2 = 32;
w3.s3 = dst_lp * 6;
INIT_AGEN3(agen_st_dst, w3);
// Rounding and saturation applied by the store agen.
agen_st_dst.round = qbits;
agen_st_dst.sat_lim_lo = low_bound;
agen_st_dst.sat_val_lo = low_bound;
agen_st_dst.sat_lim_hi = high_bound;
agen_st_dst.sat_val_hi = high_bound;
agen_st_dst.sat_opt = sat_opt;
// Persist the configurations and the total trip count.
m_cfgs[0] = extract_agen_cfg(agen_ld_src_even);
m_cfgs[1] = extract_agen_cfg(agen_ld_src_odd);
m_cfgs[2] = extract_agen_cfg(agen_ld_coef);
m_cfgs[3] = extract_agen_cfg(agen_st_dst);
m_niter = niter_tw * niter_th;
}
else if constexpr (KernelSize == 7 && (std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>))
{
// 7x7 Convolution for 16-bit input
// vecw is the element count of dvshortx; it is the horizontal step per iteration.
int vecw = chess_elementsof(dvshortx);
// Horizontal count is ceil(tw / vecw) for tw >= 1.
int niter_tw = (tw - 1) / vecw + 1;
// NOTE(review): unlike the 8-bit and 3x3 branches this truncates instead of rounding up;
// presumably th is required to be a multiple of 4 here - confirm against callers.
int niter_th = th / 4;
int niter_kw = 4;
int niter_kh = 4;
AgenWrapper w0, w1, w2, w3;
// Coefficient load agen: 2-D, niter_kw steps of 4 then niter_kh steps of knl_lp.
agen agen_ld_coef = init((dvshort *)knl_tp);
w0.size = sizeof(short);
w0.n1 = niter_kw;
w0.n2 = niter_kh;
w0.s1 = 4;
w0.s2 = knl_lp;
INIT_AGEN2(agen_ld_coef, w0);
// src_even load agen: 4-D. 3 steps of 4, 5 steps of src_even_lp, then niter_tw steps of vecw
// and niter_th steps of 2 * src_even_lp.
agen agen_ld_src_even = init((dvshort *)src_even_tp);
w1.size = sizeof(short);
w1.n1 = 3;
w1.n2 = 5; // read 10 rows per iteration, deinterleave into 2 superbanks
w1.n3 = niter_tw;
w1.n4 = niter_th;
w1.s1 = 4;
w1.s2 = src_even_lp;
w1.s3 = vecw;
w1.s4 = src_even_lp * 2;
INIT_AGEN4(agen_ld_src_even, w1);
agen_ld_src_even = update_agen_cb_start(agen_ld_src_even, (intptr_t)cb_even);
agen_ld_src_even = update_agen_cb_size(agen_ld_src_even, cb_len * sizeof(short));
// src_odd load agen: same shape as the src_even agen, using src_odd_lp / cb_odd.
agen agen_ld_src_odd = init((dvshort *)src_odd_tp);
w2.size = sizeof(short);
w2.n1 = 3;
w2.n2 = 5;
w2.n3 = niter_tw;
w2.n4 = niter_th;
w2.s1 = 4;
w2.s2 = src_odd_lp;
w2.s3 = vecw;
w2.s4 = src_odd_lp * 2;
INIT_AGEN4(agen_ld_src_odd, w2);
agen_ld_src_odd = update_agen_cb_start(agen_ld_src_odd, (intptr_t)cb_odd);
agen_ld_src_odd = update_agen_cb_size(agen_ld_src_odd, cb_len * sizeof(short));
// Store agen: 3-D. 4 steps of dst_lp, then niter_tw steps of vecw and niter_th steps of 4 * dst_lp.
agen agen_st_dst = init((dvshort *restrict)dst_tp);
w3.size = sizeof(short);
w3.n1 = 4;
w3.n2 = niter_tw;
w3.n3 = niter_th;
w3.s1 = dst_lp;
w3.s2 = vecw;
w3.s3 = dst_lp * 4;
INIT_AGEN3(agen_st_dst, w3);
// Rounding and saturation applied by the store agen.
agen_st_dst.round = qbits;
agen_st_dst.sat_lim_lo = low_bound;
agen_st_dst.sat_val_lo = low_bound;
agen_st_dst.sat_lim_hi = high_bound;
agen_st_dst.sat_val_hi = high_bound;
agen_st_dst.sat_opt = sat_opt;
// Persist the configurations and the total trip count.
m_cfgs[0] = extract_agen_cfg(agen_ld_src_even);
m_cfgs[1] = extract_agen_cfg(agen_ld_src_odd);
m_cfgs[2] = extract_agen_cfg(agen_ld_coef);
m_cfgs[3] = extract_agen_cfg(agen_st_dst);
m_niter = niter_tw * niter_th;
}
else
{
// Dependent condition that is false for every usable DataType, so it only fires when
// none of the branches above matched.
static_assert(std::is_same_v<DataType, void>, "Unsupported combination of KernelSize and DataType");
}
}
void Execute()
{
agen_C agen_ld_coef = init_agen_C_from_cfg(m_cfgs[2]);
agen_A agen_ld_src_even = init_agen_A_from_cfg(m_cfgs[0]);
agen_B agen_ld_src_odd = init_agen_B_from_cfg(m_cfgs[1]);
agen_C agen_st_dst = init_agen_C_from_cfg(m_cfgs[3]);
int niter = m_niter;
if constexpr (KernelSize == 3 && (std::is_same_v<DataType, int8_t> || std::is_same_v<DataType, uint8_t>))
{
constexpr int8_t m_knl_perm[32] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7};
constexpr int8_t m_src_perm[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19};
// 3x3 Convolution for 8-bit input
vcharx knl_perm = zero_extend(*((vuchar *)(m_knl_perm)));
vcharx src_perm = zero_extend(*((vuchar *)(m_src_perm)));
dvcharx vcoef01, vcoef23;
dvcharx vdata0_lo, vdata1_lo, vdata2_lo, vdata3_lo, vdata4_lo, vdata5_lo, vdata6_lo, vdata7_lo;
dvcharx vdata0_hi, vdata1_hi, vdata2_hi, vdata3_hi, vdata4_hi, vdata5_hi, vdata6_hi, vdata7_hi;
dvshortx vacc0_lo, vacc1_lo, vacc2_lo, vacc3_lo, vacc4_lo, vacc5_lo;
dvshortx vacc0_hi, vacc1_hi, vacc2_hi, vacc3_hi, vacc4_hi, vacc5_hi;
chess_separator_scheduler();
vcoef01 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef23 = dvload_perm_coef(agen_ld_coef, knl_perm);
// 16 loads for 8 src rows -> 8 VLIW
// 12 computes for 6 dst rows -> 6 VLIW -> 1.33 VLIW each 64 outputs
// 6 stores for 6 dst rows -> 6 VLIW
for (int i = 0; i < niter; i++) chess_loop_range(8, )
chess_prepare_for_pipelining
{
vdata0_lo = dvload_perm(agen_ld_src_odd, src_perm);
vdata1_lo = dvload_perm(agen_ld_src_even, src_perm);
vdata0_hi = dvload_perm(agen_ld_src_odd, src_perm);
vdata1_hi = dvload_perm(agen_ld_src_even, src_perm);
vdata2_lo = dvload_perm(agen_ld_src_odd, src_perm);
vdata3_lo = dvload_perm(agen_ld_src_even, src_perm);
vdata2_hi = dvload_perm(agen_ld_src_odd, src_perm);
vdata3_hi = dvload_perm(agen_ld_src_even, src_perm);
vdata4_lo = dvload_perm(agen_ld_src_odd, src_perm);
vdata5_lo = dvload_perm(agen_ld_src_even, src_perm);
vdata4_hi = dvload_perm(agen_ld_src_odd, src_perm);
vdata5_hi = dvload_perm(agen_ld_src_even, src_perm);
vdata6_lo = dvload_perm(agen_ld_src_odd, src_perm);
vdata7_lo = dvload_perm(agen_ld_src_even, src_perm);
vdata6_hi = dvload_perm(agen_ld_src_odd, src_perm);
vdata7_hi = dvload_perm(agen_ld_src_even, src_perm);
vfilt4x2x2_bbh(vdata0_lo, vdata1_lo, vcoef01, vacc0_lo, vacc1_lo, 0, vacc0_lo, vacc1_lo);
vfilt4x2x2_bbh(vdata0_hi, vdata1_hi, vcoef01, vacc0_hi, vacc1_hi, 0, vacc0_hi, vacc1_hi);
chess_separator();
vfilt4x2x2_bbh(vdata2_lo, vdata3_lo, vcoef23, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2x2_bbh(vdata2_hi, vdata3_hi, vcoef23, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
chess_separator();
vfilt4x2x2_bbh(vdata2_lo, vdata3_lo, vcoef01, vacc2_lo, vacc3_lo, 0, vacc2_lo, vacc3_lo);
vfilt4x2x2_bbh(vdata2_hi, vdata3_hi, vcoef01, vacc2_hi, vacc3_hi, 0, vacc2_hi, vacc3_hi);
chess_separator();
vfilt4x2x2_bbh(vdata4_lo, vdata5_lo, vcoef23, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2x2_bbh(vdata4_hi, vdata5_hi, vcoef23, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
chess_separator();
vfilt4x2x2_bbh(vdata4_lo, vdata5_lo, vcoef01, vacc4_lo, vacc5_lo, 0, vacc4_lo, vacc5_lo);
vfilt4x2x2_bbh(vdata4_hi, vdata5_hi, vcoef01, vacc4_hi, vacc5_hi, 0, vacc4_hi, vacc5_hi);
chess_separator();
vfilt4x2x2_bbh(vdata6_lo, vdata7_lo, vcoef23, vacc4_lo, vacc5_lo, -1, vacc4_lo, vacc5_lo);
vfilt4x2x2_bbh(vdata6_hi, vdata7_hi, vcoef23, vacc4_hi, vacc5_hi, -1, vacc4_hi, vacc5_hi);
chess_separator();
vstore_i2(vacc0_lo, vacc0_hi, agen_st_dst);
vstore_i2(vacc1_lo, vacc1_hi, agen_st_dst);
vstore_i2(vacc2_lo, vacc2_hi, agen_st_dst);
vstore_i2(vacc3_lo, vacc3_hi, agen_st_dst);
vstore_i2(vacc4_lo, vacc4_hi, agen_st_dst);
vstore_i2(vacc5_lo, vacc5_hi, agen_st_dst);
}
}
else if constexpr (KernelSize == 3 && (std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>))
{
// 3x3 Convolution for 16-bit input
constexpr int8_t m_knl_perm[32] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11};
vcharx knl_perm = zero_extend(*((vuchar *)(m_knl_perm)));
dvshortx vdata0a, vdata0b, vdata1a, vdata1b, vdata2a, vdata2b;
dvshortx vdata3a, vdata3b, vdata4a, vdata4b, vdata5a, vdata5b;
dvshortx vcoef0, vcoef1, vcoef2, vcoef3;
dvintx vacc0_lo, vacc0_hi, vacc1_lo, vacc1_hi, vacc2_lo, vacc2_hi, vacc3_lo, vacc3_hi;
chess_separator_scheduler();
vcoef0 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef1 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef2 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef3 = dvload_perm_coef(agen_ld_coef, knl_perm);
// 12 loads for 6 src rows -> 6 VLIW
// 16 computes for 4 dst rows -> 8 VLIW -> 2 VLIW each 32 outputs
// 4 stores for 4 dst rows -> 4 VLIW
for (int i = 0; i < niter; i++) chess_loop_range(8, )
chess_prepare_for_pipelining
{
vdata0a = dvload(agen_ld_src_odd);
vdata0b = dvload(agen_ld_src_odd);
vdata1a = dvload(agen_ld_src_even);
vdata1b = dvload(agen_ld_src_even);
vdata2a = dvload(agen_ld_src_odd);
vdata2b = dvload(agen_ld_src_odd);
vdata3a = dvload(agen_ld_src_even);
vdata3b = dvload(agen_ld_src_even);
vdata4a = dvload(agen_ld_src_odd);
vdata4b = dvload(agen_ld_src_odd);
vdata5a = dvload(agen_ld_src_even);
vdata5b = dvload(agen_ld_src_even);
vfilt4x2_hhw(vdata0a.lo, vdata0b.lo, vcoef0, vacc0_lo, vacc1_lo, 0, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata0a.hi, vdata0b.hi, vcoef0, vacc0_hi, vacc1_hi, 0, vacc0_hi, vacc1_hi);
chess_separator();
vfilt4x2_hhw(vdata1a.lo, vdata1b.lo, vcoef1, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata1a.hi, vdata1b.hi, vcoef1, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
chess_separator();
vfilt4x2_hhw(vdata2a.lo, vdata2b.lo, vcoef2, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata2a.hi, vdata2b.hi, vcoef2, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
chess_separator();
vfilt4x2_hhw(vdata3a.lo, vdata3b.lo, vcoef3, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata3a.hi, vdata3b.hi, vcoef3, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
chess_separator();
vfilt4x2_hhw(vdata2a.lo, vdata2b.lo, vcoef0, vacc2_lo, vacc3_lo, 0, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata2a.hi, vdata2b.hi, vcoef0, vacc2_hi, vacc3_hi, 0, vacc2_hi, vacc3_hi);
chess_separator();
vfilt4x2_hhw(vdata3a.lo, vdata3b.lo, vcoef1, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata3a.hi, vdata3b.hi, vcoef1, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
chess_separator();
vfilt4x2_hhw(vdata4a.lo, vdata4b.lo, vcoef2, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata4a.hi, vdata4b.hi, vcoef2, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
chess_separator();
vfilt4x2_hhw(vdata5a.lo, vdata5b.lo, vcoef3, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata5a.hi, vdata5b.hi, vcoef3, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
chess_separator();
vstore_i2(vacc0_lo, vacc0_hi, agen_st_dst);
vstore_i2(vacc1_lo, vacc1_hi, agen_st_dst);
vstore_i2(vacc2_lo, vacc2_hi, agen_st_dst);
vstore_i2(vacc3_lo, vacc3_hi, agen_st_dst);
}
}
else if constexpr (KernelSize == 5 && (std::is_same_v<DataType, int8_t> || std::is_same_v<DataType, uint8_t>))
{
// 5x5 Convolution for 8-bit input
constexpr int8_t m_knl_perm[32] = {0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5,
8, 9, 12, 13, 8, 9, 12, 13, 8, 9, 12, 13, 8, 9, 12, 13};
constexpr int8_t m_src_perm[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19};
vcharx knl_perm = zero_extend(*((vuchar *)(m_knl_perm)));
vcharx src_perm = zero_extend(*((vuchar *)(m_src_perm)));
dvcharx vcoef0, vcoef1, vcoef2, vcoef3, vcoef4, vcoef5;
dvcharx chess_storage(DV0) vdata0, chess_storage(DV1) vdata1;
dvcharx chess_storage(DV2) vdata2, chess_storage(DV3) vdata3;
dvcharx chess_storage(DV4) vdata4, chess_storage(DV5) vdata5;
dvcharx chess_storage(DV6) vdata6, chess_storage(DV7) vdata7;
dvcharx chess_storage(DV8) vdata8, chess_storage(DV9) vdata9;
dvcharx chess_storage(DV10) vdata10, chess_storage(DV11) vdata11;
dvcharx chess_storage(DV12) vdata12, chess_storage(DV13) vdata13;
dvcharx chess_storage(DV14) vdata14, chess_storage(DV15) vdata15;
dvshortx vacc0, vacc1, vacc2, vacc3;
dvshortx vacc4, vacc5, vacc6, vacc7;
dvshortx vacc8, vacc9, vacc10, vacc11;
dvshortx vacc12, vacc13, vacc14, vacc15;
chess_separator_scheduler();
// | dvcoef0 | dvcoef1 | //
// | dvcoef2 | dvcoef3 | //
// | dvcoef4 | dvcoef5 | //
vcoef0 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef1 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef2 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef3 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef4 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef5 = dvload_perm_coef(agen_ld_coef, knl_perm);
vdata0 = dvload_perm(agen_ld_src_even, src_perm);
vdata1 = dvload_perm(agen_ld_src_odd, src_perm);
vdata2 = dvload_perm(agen_ld_src_even, src_perm);
vdata3 = dvload_perm(agen_ld_src_odd, src_perm);
vdata4 = dvload_perm(agen_ld_src_even, src_perm);
vdata5 = dvload_perm(agen_ld_src_odd, src_perm);
vdata6 = dvload_perm(agen_ld_src_even, src_perm);
vdata7 = dvload_perm(agen_ld_src_odd, src_perm);
vdata8 = dvload_perm(agen_ld_src_even, src_perm);
vdata9 = dvload_perm(agen_ld_src_odd, src_perm);
vdata10 = dvload_perm(agen_ld_src_even, src_perm);
vdata11 = dvload_perm(agen_ld_src_odd, src_perm);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef0, vacc0, vacc1, 0, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef2, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef4, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef0, vacc2, vacc3, 0, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef2, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef4, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef0, vacc4, vacc5, 0, vacc4, vacc5);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef2, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef4, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef0, vacc6, vacc7, 0, vacc6, vacc7);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef2, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef4, vacc6, vacc7, -1, vacc6, vacc7);
vdata12 = dvload_perm(agen_ld_src_even, src_perm);
vdata13 = dvload_perm(agen_ld_src_odd, src_perm);
vdata14 = dvload_perm(agen_ld_src_even, src_perm);
vdata15 = dvload_perm(agen_ld_src_odd, src_perm);
vdata0 = dvload_perm(agen_ld_src_even, src_perm);
vdata1 = dvload_perm(agen_ld_src_odd, src_perm);
vdata2 = dvload_perm(agen_ld_src_even, src_perm);
vdata3 = dvload_perm(agen_ld_src_odd, src_perm);
vdata4 = dvload_perm(agen_ld_src_even, src_perm);
vdata5 = dvload_perm(agen_ld_src_odd, src_perm);
vdata6 = dvload_perm(agen_ld_src_even, src_perm);
vdata7 = dvload_perm(agen_ld_src_odd, src_perm);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef1, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef3, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef5, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef1, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef3, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef5, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef1, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef3, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef5, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef1, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef3, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef5, vacc6, vacc7, -1, vacc6, vacc7);
vdata8 = dvload_perm(agen_ld_src_even, src_perm);
vdata9 = dvload_perm(agen_ld_src_odd, src_perm);
vdata10 = dvload_perm(agen_ld_src_even, src_perm);
vdata11 = dvload_perm(agen_ld_src_odd, src_perm);
vdata12 = dvload_perm(agen_ld_src_even, src_perm);
vdata13 = dvload_perm(agen_ld_src_odd, src_perm);
vdata14 = dvload_perm(agen_ld_src_even, src_perm);
vdata15 = dvload_perm(agen_ld_src_odd, src_perm);
vdata0 = dvload_perm(agen_ld_src_even, src_perm);
vdata1 = dvload_perm(agen_ld_src_odd, src_perm);
vdata2 = dvload_perm(agen_ld_src_even, src_perm);
vdata3 = dvload_perm(agen_ld_src_odd, src_perm);
vdata4 = dvload_perm(agen_ld_src_even, src_perm);
vdata5 = dvload_perm(agen_ld_src_odd, src_perm);
vstore_i_hb(vacc0, agen_st_dst);
vstore_i_hb(vacc1, agen_st_dst);
vstore_i_hb(vacc2, agen_st_dst);
vstore_i_hb(vacc3, agen_st_dst);
// Need to make sure niter is even because of manual unroll 2. The prediction off will prevent any extra write.
niter = (niter % 2 == 0) ? niter : niter + 1;
// 24 loads -> 12 VLIW
// 24 computes -> 12 VLIW -> 1.5 VLIW per 32 outputs
// 8 stores -> 8 VLIW
for (int i = 0; i < (niter - 2) / 2; i++) chess_loop_range(8, ) //chess_prepare_for_pipelining
{
vstore_i_hb(vacc4, agen_st_dst);
vstore_i_hb(vacc5, agen_st_dst);
vstore_i_hb(vacc6, agen_st_dst);
vstore_i_hb(vacc7, agen_st_dst);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef0, vacc8, vacc9, 0, vacc8, vacc9);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef2, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef4, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef0, vacc10, vacc11, 0, vacc10, vacc11);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef2, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef4, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef0, vacc12, vacc13, 0, vacc12, vacc13);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef2, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef4, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef0, vacc14, vacc15, 0, vacc14, vacc15);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef2, vacc14, vacc15, -1, vacc14, vacc15);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef4, vacc14, vacc15, -1, vacc14, vacc15);
vdata6 = dvload_perm(agen_ld_src_even, src_perm);
vdata7 = dvload_perm(agen_ld_src_odd, src_perm);
vdata8 = dvload_perm(agen_ld_src_even, src_perm);
vdata9 = dvload_perm(agen_ld_src_odd, src_perm);
vdata10 = dvload_perm(agen_ld_src_even, src_perm);
vdata11 = dvload_perm(agen_ld_src_odd, src_perm);
vdata12 = dvload_perm(agen_ld_src_even, src_perm);
vdata13 = dvload_perm(agen_ld_src_odd, src_perm);
vdata14 = dvload_perm(agen_ld_src_even, src_perm);
vdata15 = dvload_perm(agen_ld_src_odd, src_perm);
vdata0 = dvload_perm(agen_ld_src_even, src_perm);
vdata1 = dvload_perm(agen_ld_src_odd, src_perm);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef1, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef3, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef5, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef1, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef3, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef5, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef1, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef3, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef5, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef1, vacc14, vacc15, -1, vacc14, vacc15);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef3, vacc14, vacc15, -1, vacc14, vacc15);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef5, vacc14, vacc15, -1, vacc14, vacc15);
vstore_i_hb(vacc8, agen_st_dst);
vstore_i_hb(vacc9, agen_st_dst);
vstore_i_hb(vacc10, agen_st_dst);
vstore_i_hb(vacc11, agen_st_dst);
vstore_i_hb(vacc12, agen_st_dst);
vstore_i_hb(vacc13, agen_st_dst);
vstore_i_hb(vacc14, agen_st_dst);
vstore_i_hb(vacc15, agen_st_dst);
vdata2 = dvload_perm(agen_ld_src_even, src_perm);
vdata3 = dvload_perm(agen_ld_src_odd, src_perm);
vdata4 = dvload_perm(agen_ld_src_even, src_perm);
vdata5 = dvload_perm(agen_ld_src_odd, src_perm);
vdata6 = dvload_perm(agen_ld_src_even, src_perm);
vdata7 = dvload_perm(agen_ld_src_odd, src_perm);
vdata8 = dvload_perm(agen_ld_src_even, src_perm);
vdata9 = dvload_perm(agen_ld_src_odd, src_perm);
vdata10 = dvload_perm(agen_ld_src_even, src_perm);
vdata11 = dvload_perm(agen_ld_src_odd, src_perm);
vdata12 = dvload_perm(agen_ld_src_even, src_perm);
vdata13 = dvload_perm(agen_ld_src_odd, src_perm);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef0, vacc0, vacc1, 0, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef2, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef4, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef0, vacc2, vacc3, 0, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef2, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef4, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef0, vacc4, vacc5, 0, vacc4, vacc5);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef2, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef4, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef0, vacc6, vacc7, 0, vacc6, vacc7);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef2, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef4, vacc6, vacc7, -1, vacc6, vacc7);
vdata14 = dvload_perm(agen_ld_src_even, src_perm);
vdata15 = dvload_perm(agen_ld_src_odd, src_perm);
vdata0 = dvload_perm(agen_ld_src_even, src_perm);
vdata1 = dvload_perm(agen_ld_src_odd, src_perm);
vdata2 = dvload_perm(agen_ld_src_even, src_perm);
vdata3 = dvload_perm(agen_ld_src_odd, src_perm);
vdata4 = dvload_perm(agen_ld_src_even, src_perm);
vdata5 = dvload_perm(agen_ld_src_odd, src_perm);
vdata6 = dvload_perm(agen_ld_src_even, src_perm);
vdata7 = dvload_perm(agen_ld_src_odd, src_perm);
vdata8 = dvload_perm(agen_ld_src_even, src_perm);
vdata9 = dvload_perm(agen_ld_src_odd, src_perm);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef1, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef3, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef5, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef1, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef3, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef5, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef1, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef3, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef5, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef1, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef3, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef5, vacc6, vacc7, -1, vacc6, vacc7);
vdata10 = dvload_perm(agen_ld_src_even, src_perm);
vdata11 = dvload_perm(agen_ld_src_odd, src_perm);
vdata12 = dvload_perm(agen_ld_src_even, src_perm);
vdata13 = dvload_perm(agen_ld_src_odd, src_perm);
vdata14 = dvload_perm(agen_ld_src_even, src_perm);
vdata15 = dvload_perm(agen_ld_src_odd, src_perm);
vdata0 = dvload_perm(agen_ld_src_even, src_perm);
vdata1 = dvload_perm(agen_ld_src_odd, src_perm);
vdata2 = dvload_perm(agen_ld_src_even, src_perm);
vdata3 = dvload_perm(agen_ld_src_odd, src_perm);
vdata4 = dvload_perm(agen_ld_src_even, src_perm);
vdata5 = dvload_perm(agen_ld_src_odd, src_perm);
vstore_i_hb(vacc0, agen_st_dst);
vstore_i_hb(vacc1, agen_st_dst);
vstore_i_hb(vacc2, agen_st_dst);
vstore_i_hb(vacc3, agen_st_dst);
}
vstore_i_hb(vacc4, agen_st_dst);
vstore_i_hb(vacc5, agen_st_dst);
vstore_i_hb(vacc6, agen_st_dst);
vstore_i_hb(vacc7, agen_st_dst);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef0, vacc8, vacc9, 0, vacc8, vacc9);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef2, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef4, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef0, vacc10, vacc11, 0, vacc10, vacc11);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef2, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef4, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef0, vacc12, vacc13, 0, vacc12, vacc13);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef2, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef4, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef0, vacc14, vacc15, 0, vacc14, vacc15);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef2, vacc14, vacc15, -1, vacc14, vacc15);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef4, vacc14, vacc15, -1, vacc14, vacc15);
vdata6 = dvload_perm(agen_ld_src_even, src_perm);
vdata7 = dvload_perm(agen_ld_src_odd, src_perm);
vdata8 = dvload_perm(agen_ld_src_even, src_perm);
vdata9 = dvload_perm(agen_ld_src_odd, src_perm);
vdata10 = dvload_perm(agen_ld_src_even, src_perm);
vdata11 = dvload_perm(agen_ld_src_odd, src_perm);
vdata12 = dvload_perm(agen_ld_src_even, src_perm);
vdata13 = dvload_perm(agen_ld_src_odd, src_perm);
vdata14 = dvload_perm(agen_ld_src_even, src_perm);
vdata15 = dvload_perm(agen_ld_src_odd, src_perm);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef1, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef3, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef5, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef1, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef3, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef5, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef1, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef3, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef5, vacc12, vacc13, -1, vacc12, vacc13);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef1, vacc14, vacc15, -1, vacc14, vacc15);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef3, vacc14, vacc15, -1, vacc14, vacc15);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef5, vacc14, vacc15, -1, vacc14, vacc15);
vstore_i_hb(vacc8, agen_st_dst);
vstore_i_hb(vacc9, agen_st_dst);
vstore_i_hb(vacc10, agen_st_dst);
vstore_i_hb(vacc11, agen_st_dst);
vstore_i_hb(vacc12, agen_st_dst);
vstore_i_hb(vacc13, agen_st_dst);
vstore_i_hb(vacc14, agen_st_dst);
vstore_i_hb(vacc15, agen_st_dst);
}
else if constexpr (KernelSize == 5 && (std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>))
{
// 5x5 convolution for 16-bit input (DataType is int16_t or uint16_t).
// Every iteration of the loop below loads 8 source rows (two vectors per row),
// accumulates them into four accumulator pairs and issues four stores.
//
// Permutation applied to every coefficient load: indices 0..3 repeated four times,
// followed by indices 16..19 repeated four times.
constexpr int8_t m_knl_perm[32] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19};
vcharx knl_perm = zero_extend(*((vuchar *)(m_knl_perm)));
// vdataNa / vdataNb: the two vectors loaded for source row N (N = 0..7).
dvshortx vdata0a, vdata0b, vdata1a, vdata1b, vdata2a, vdata2b;
dvshortx vdata3a, vdata3b, vdata4a, vdata4b, vdata5a, vdata5b;
dvshortx vdata6a, vdata6b, vdata7a, vdata7b;
// vcoefR0 / vcoefR1: the two coefficient vectors used with row offset R (R = 0..5).
dvshortx vcoef00, vcoef01, vcoef10, vcoef11, vcoef20, vcoef21;
dvshortx vcoef30, vcoef31, vcoef40, vcoef41, vcoef50, vcoef51;
// Accumulators, kept separately for the .lo and .hi halves of the data vectors.
dvintx vacc0_lo, vacc1_lo, vacc0_hi, vacc1_hi;
dvintx vacc2_lo, vacc3_lo, vacc2_hi, vacc3_hi;
// Coefficients are loaded once, outside the loop, in the order 00, 01, 10, 11, ..., 50, 51.
// NOTE(review): six coefficient row-pairs for a 5-row kernel suggests that each call
// produces two vertically adjacent outputs - confirm against the coefficient buffer layout.
vcoef00 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef01 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef10 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef11 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef20 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef21 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef30 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef31 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef40 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef41 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef50 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef51 = dvload_perm_coef(agen_ld_coef, knl_perm);
// Per-iteration operation counts:
// 16 loads for 8 src rows -> 8 VLIW
// 48 computes for 4 dst rows -> 24 VLIW -> 6 VLIW each 32 outputs
// 4 stores for 4 dst rows -> 4 VLIW
// chess_loop_range(3, ) presumably promises the compiler at least 3 iterations -
// TODO confirm that callers guarantee niter >= 3.
for (int i = 0; i < niter; i++) chess_loop_range(3, )
chess_prepare_for_pipelining
{
// Rows 0, 2, 4, 6 come from the even source address generator and rows 1, 3, 5, 7
// from the odd one; each row is read as two consecutive vectors (a, then b).
vdata0a = dvload(agen_ld_src_even);
vdata0b = dvload(agen_ld_src_even);
vdata1a = dvload(agen_ld_src_odd);
vdata1b = dvload(agen_ld_src_odd);
vdata2a = dvload(agen_ld_src_even);
vdata2b = dvload(agen_ld_src_even);
vdata3a = dvload(agen_ld_src_odd);
vdata3b = dvload(agen_ld_src_odd);
vdata4a = dvload(agen_ld_src_even);
vdata4b = dvload(agen_ld_src_even);
vdata5a = dvload(agen_ld_src_odd);
vdata5b = dvload(agen_ld_src_odd);
vdata6a = dvload(agen_ld_src_even);
vdata6b = dvload(agen_ld_src_even);
vdata7a = dvload(agen_ld_src_odd);
vdata7b = dvload(agen_ld_src_odd);
// vacc0/vacc1, .lo halves: rows 0..5 with coefficient pairs 0..5.
// The first call of each accumulator pair passes 0 as the sixth argument and every
// later call passes -1; presumably 0 starts a fresh accumulation and -1 adds onto the
// running value - confirm against the vfilt4x2_hhw intrinsic documentation.
// NOTE(review): the second data operand of every vcoefN1 call is vdataNa.lo, in both
// the .lo and the .hi sequences; presumably that operand only meets zero-valued taps
// for a 5-wide kernel - confirm against the coefficient layout.
vfilt4x2_hhw(vdata0a.lo, vdata0b.lo, vcoef00, vacc0_lo, vacc1_lo, 0, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata0b.lo, vdata0a.lo, vcoef01, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata1a.lo, vdata1b.lo, vcoef10, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata1b.lo, vdata1a.lo, vcoef11, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata2a.lo, vdata2b.lo, vcoef20, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata2b.lo, vdata2a.lo, vcoef21, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata3a.lo, vdata3b.lo, vcoef30, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata3b.lo, vdata3a.lo, vcoef31, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata4a.lo, vdata4b.lo, vcoef40, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata4b.lo, vdata4a.lo, vcoef41, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata5a.lo, vdata5b.lo, vcoef50, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata5b.lo, vdata5a.lo, vcoef51, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
// vacc0/vacc1, .hi halves: same rows and coefficients as above.
vfilt4x2_hhw(vdata0a.hi, vdata0b.hi, vcoef00, vacc0_hi, vacc1_hi, 0, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata0b.hi, vdata0a.lo, vcoef01, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata1a.hi, vdata1b.hi, vcoef10, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata1b.hi, vdata1a.lo, vcoef11, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata2a.hi, vdata2b.hi, vcoef20, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata2b.hi, vdata2a.lo, vcoef21, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata3a.hi, vdata3b.hi, vcoef30, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata3b.hi, vdata3a.lo, vcoef31, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata4a.hi, vdata4b.hi, vcoef40, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata4b.hi, vdata4a.lo, vcoef41, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata5a.hi, vdata5b.hi, vcoef50, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata5b.hi, vdata5a.lo, vcoef51, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
// vacc2/vacc3, .lo halves: rows 2..7 with the same coefficient sequence.
vfilt4x2_hhw(vdata2a.lo, vdata2b.lo, vcoef00, vacc2_lo, vacc3_lo, 0, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata2b.lo, vdata2a.lo, vcoef01, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata3a.lo, vdata3b.lo, vcoef10, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata3b.lo, vdata3a.lo, vcoef11, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata4a.lo, vdata4b.lo, vcoef20, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata4b.lo, vdata4a.lo, vcoef21, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata5a.lo, vdata5b.lo, vcoef30, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata5b.lo, vdata5a.lo, vcoef31, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata6a.lo, vdata6b.lo, vcoef40, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata6b.lo, vdata6a.lo, vcoef41, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata7a.lo, vdata7b.lo, vcoef50, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata7b.lo, vdata7a.lo, vcoef51, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
// vacc2/vacc3, .hi halves: rows 2..7 with the same coefficient sequence.
vfilt4x2_hhw(vdata2a.hi, vdata2b.hi, vcoef00, vacc2_hi, vacc3_hi, 0, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata2b.hi, vdata2a.lo, vcoef01, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata3a.hi, vdata3b.hi, vcoef10, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata3b.hi, vdata3a.lo, vcoef11, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata4a.hi, vdata4b.hi, vcoef20, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata4b.hi, vdata4a.lo, vcoef21, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata5a.hi, vdata5b.hi, vcoef30, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata5b.hi, vdata5a.lo, vcoef31, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata6a.hi, vdata6b.hi, vcoef40, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata6b.hi, vdata6a.lo, vcoef41, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata7a.hi, vdata7b.hi, vcoef50, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata7b.hi, vdata7a.lo, vcoef51, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
// One store per accumulator, each writing its .lo and .hi parts through the
// destination address generator, in the order vacc0, vacc1, vacc2, vacc3.
vstore_i2(vacc0_lo, vacc0_hi, agen_st_dst);
vstore_i2(vacc1_lo, vacc1_hi, agen_st_dst);
vstore_i2(vacc2_lo, vacc2_hi, agen_st_dst);
vstore_i2(vacc3_lo, vacc3_hi, agen_st_dst);
}
}
else if constexpr (KernelSize == 7 && (std::is_same_v<DataType, int8_t> || std::is_same_v<DataType, uint8_t>))
{
// 7x7 convolution for 8-bit input (DataType is int8_t or uint8_t).
// The code is a hand-scheduled pipeline: a prologue that produces the first output
// group, a loop whose body is manually unrolled by 2 (two output groups per pass),
// and an epilogue that repeats the first half of the loop body for the last group.
// Each output group is six accumulators and six stores.
//
// Permutation applied to every coefficient load: indices {0, 1, 4, 5} repeated four
// times, followed by indices {8, 9, 12, 13} repeated four times.
constexpr int8_t m_knl_perm[32] = {0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5,
8, 9, 12, 13, 8, 9, 12, 13, 8, 9, 12, 13, 8, 9, 12, 13};
// Permutation applied to every source load: indices 0..15 followed by indices 4..19.
constexpr int8_t m_src_perm[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19};
vcharx knl_perm = zero_extend(*((vuchar *)(m_knl_perm)));
vcharx src_perm = zero_extend(*((vuchar *)(m_src_perm)));
dvcharx vcoef0, vcoef1, vcoef2, vcoef3, vcoef4, vcoef5, vcoef6, vcoef7;
// Each data vector is tied to a specific storage location (DV0..DV15) through
// chess_storage, so the register assignment of the schedule below is fixed by hand.
dvcharx chess_storage(DV0) vdata0, chess_storage(DV1) vdata1;
dvcharx chess_storage(DV2) vdata2, chess_storage(DV3) vdata3;
dvcharx chess_storage(DV4) vdata4, chess_storage(DV5) vdata5;
dvcharx chess_storage(DV6) vdata6, chess_storage(DV7) vdata7;
dvcharx chess_storage(DV8) vdata8, chess_storage(DV9) vdata9;
dvcharx chess_storage(DV10) vdata10, chess_storage(DV11) vdata11;
dvcharx chess_storage(DV12) vdata12, chess_storage(DV13) vdata13;
dvcharx chess_storage(DV14) vdata14, chess_storage(DV15) vdata15;
// vacc0..vacc5 hold one output group, vacc6..vacc11 the alternate group.
dvshortx vacc0, vacc1, vacc2, vacc3;
dvshortx vacc4, vacc5, vacc6, vacc7;
dvshortx vacc8, vacc9, vacc10, vacc11;
chess_separator_scheduler();
// Coefficient vectors, in load order, arranged as two columns:
// | dvcoef0 | dvcoef1 | //
// | dvcoef2 | dvcoef3 | //
// | dvcoef4 | dvcoef5 | //
// | dvcoef6 | dvcoef7 | //
vcoef0 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef1 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef2 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef3 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef4 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef5 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef6 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef7 = dvload_perm_coef(agen_ld_coef, knl_perm);
// ---- Prologue: first output group (vacc0..vacc5) ----
// Source loads alternate between the odd and the even address generator, odd first.
vdata0 = dvload_perm(agen_ld_src_odd, src_perm);
vdata1 = dvload_perm(agen_ld_src_even, src_perm);
vdata2 = dvload_perm(agen_ld_src_odd, src_perm);
vdata3 = dvload_perm(agen_ld_src_even, src_perm);
vdata4 = dvload_perm(agen_ld_src_odd, src_perm);
vdata5 = dvload_perm(agen_ld_src_even, src_perm);
vdata6 = dvload_perm(agen_ld_src_odd, src_perm);
vdata7 = dvload_perm(agen_ld_src_even, src_perm);
vdata8 = dvload_perm(agen_ld_src_odd, src_perm);
vdata9 = dvload_perm(agen_ld_src_even, src_perm);
vdata10 = dvload_perm(agen_ld_src_odd, src_perm);
vdata11 = dvload_perm(agen_ld_src_even, src_perm);
// First pass: left coefficient column (vcoef0/2/4/6). Each accumulator pair takes four
// consecutive data pairs; the next accumulator pair starts one data pair later.
// The first call of each accumulator pair passes 0 as the sixth argument and later
// calls pass -1; presumably 0 starts a fresh accumulation and -1 adds onto the running
// value - confirm against the vfilt4x2x2_bbh intrinsic documentation.
vfilt4x2x2_bbh(vdata0, vdata1, vcoef0, vacc0, vacc1, 0, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef2, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef4, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef6, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef0, vacc2, vacc3, 0, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef2, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef4, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef6, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef0, vacc4, vacc5, 0, vacc4, vacc5);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef2, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef4, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef6, vacc4, vacc5, -1, vacc4, vacc5);
// Reload the data registers for the second pass; the data vectors are reused in a
// rotating fashion (12..15, then 0..7).
vdata12 = dvload_perm(agen_ld_src_odd, src_perm);
vdata13 = dvload_perm(agen_ld_src_even, src_perm);
vdata14 = dvload_perm(agen_ld_src_odd, src_perm);
vdata15 = dvload_perm(agen_ld_src_even, src_perm);
vdata0 = dvload_perm(agen_ld_src_odd, src_perm);
vdata1 = dvload_perm(agen_ld_src_even, src_perm);
vdata2 = dvload_perm(agen_ld_src_odd, src_perm);
vdata3 = dvload_perm(agen_ld_src_even, src_perm);
vdata4 = dvload_perm(agen_ld_src_odd, src_perm);
vdata5 = dvload_perm(agen_ld_src_even, src_perm);
vdata6 = dvload_perm(agen_ld_src_odd, src_perm);
vdata7 = dvload_perm(agen_ld_src_even, src_perm);
// Second pass: right coefficient column (vcoef1/3/5/7), accumulated onto the same
// accumulators (all calls pass -1).
vfilt4x2x2_bbh(vdata12, vdata13, vcoef1, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef3, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef5, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef7, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef1, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef3, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef5, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef7, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef1, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef3, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef5, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef7, vacc4, vacc5, -1, vacc4, vacc5);
// Preload the data used by the first pass of the next output group (vdata8..15, 0..3).
vdata8 = dvload_perm(agen_ld_src_odd, src_perm);
vdata9 = dvload_perm(agen_ld_src_even, src_perm);
vdata10 = dvload_perm(agen_ld_src_odd, src_perm);
vdata11 = dvload_perm(agen_ld_src_even, src_perm);
vdata12 = dvload_perm(agen_ld_src_odd, src_perm);
vdata13 = dvload_perm(agen_ld_src_even, src_perm);
vdata14 = dvload_perm(agen_ld_src_odd, src_perm);
vdata15 = dvload_perm(agen_ld_src_even, src_perm);
vdata0 = dvload_perm(agen_ld_src_odd, src_perm);
vdata1 = dvload_perm(agen_ld_src_even, src_perm);
vdata2 = dvload_perm(agen_ld_src_odd, src_perm);
vdata3 = dvload_perm(agen_ld_src_even, src_perm);
// Only the first half of the group is stored here; vacc3..vacc5 are stored at the
// top of the loop body (or of the epilogue when the loop does not run).
vstore_i_hb(vacc0, agen_st_dst);
vstore_i_hb(vacc1, agen_st_dst);
vstore_i_hb(vacc2, agen_st_dst);
// niter must be even because of the manual 2x unroll, so an odd niter is rounded up. Predication off will prevent any extra write.
niter = (niter % 2 == 0) ? niter : niter + 1;
// Counts per output group (the unrolled loop body holds two groups, i.e. 48 loads,
// 48 computes and 12 stores):
// 24 loads -> 12 VLIW
// 24 computes -> 12 VLIW -> 2 VLIW per 32 outputs
// 6 stores -> 6 VLIW
// The prologue and the epilogue each produce one group, so the loop covers the
// remaining niter - 2 groups, two per pass.
// chess_loop_range(6, ) presumably promises the compiler at least 6 passes -
// TODO confirm that callers guarantee a large enough niter.
for (int i = 0; i < (niter - 2) / 2; i++) chess_loop_range(6, ) //chess_prepare_for_pipelining
{
// Finish storing the previous group (vacc3..vacc5).
vstore_i_hb(vacc3, agen_st_dst);
vstore_i_hb(vacc4, agen_st_dst);
vstore_i_hb(vacc5, agen_st_dst);
// ---- First group of this pass: accumulators vacc6..vacc11 ----
// First pass over the left coefficient column, using the data preloaded earlier.
vfilt4x2x2_bbh(vdata8, vdata9, vcoef0, vacc6, vacc7, 0, vacc6, vacc7);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef2, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef4, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef6, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef0, vacc8, vacc9, 0, vacc8, vacc9);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef2, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef4, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef6, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef0, vacc10, vacc11, 0, vacc10, vacc11);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef2, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef4, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef6, vacc10, vacc11, -1, vacc10, vacc11);
// Load the data for the second pass (vdata4..15).
vdata4 = dvload_perm(agen_ld_src_odd, src_perm);
vdata5 = dvload_perm(agen_ld_src_even, src_perm);
vdata6 = dvload_perm(agen_ld_src_odd, src_perm);
vdata7 = dvload_perm(agen_ld_src_even, src_perm);
vdata8 = dvload_perm(agen_ld_src_odd, src_perm);
vdata9 = dvload_perm(agen_ld_src_even, src_perm);
vdata10 = dvload_perm(agen_ld_src_odd, src_perm);
vdata11 = dvload_perm(agen_ld_src_even, src_perm);
vdata12 = dvload_perm(agen_ld_src_odd, src_perm);
vdata13 = dvload_perm(agen_ld_src_even, src_perm);
vdata14 = dvload_perm(agen_ld_src_odd, src_perm);
vdata15 = dvload_perm(agen_ld_src_even, src_perm);
// Second pass over the right coefficient column, accumulating onto vacc6..vacc11.
vfilt4x2x2_bbh(vdata4, vdata5, vcoef1, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef3, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef5, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef7, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef1, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef3, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef5, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef7, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef1, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef3, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef5, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef7, vacc10, vacc11, -1, vacc10, vacc11);
// Store the whole group vacc6..vacc11.
vstore_i_hb(vacc6, agen_st_dst);
vstore_i_hb(vacc7, agen_st_dst);
vstore_i_hb(vacc8, agen_st_dst);
vstore_i_hb(vacc9, agen_st_dst);
vstore_i_hb(vacc10, agen_st_dst);
vstore_i_hb(vacc11, agen_st_dst);
// ---- Second group of this pass: accumulators vacc0..vacc5 ----
// Same load / filter sequence as the prologue.
vdata0 = dvload_perm(agen_ld_src_odd, src_perm);
vdata1 = dvload_perm(agen_ld_src_even, src_perm);
vdata2 = dvload_perm(agen_ld_src_odd, src_perm);
vdata3 = dvload_perm(agen_ld_src_even, src_perm);
vdata4 = dvload_perm(agen_ld_src_odd, src_perm);
vdata5 = dvload_perm(agen_ld_src_even, src_perm);
vdata6 = dvload_perm(agen_ld_src_odd, src_perm);
vdata7 = dvload_perm(agen_ld_src_even, src_perm);
vdata8 = dvload_perm(agen_ld_src_odd, src_perm);
vdata9 = dvload_perm(agen_ld_src_even, src_perm);
vdata10 = dvload_perm(agen_ld_src_odd, src_perm);
vdata11 = dvload_perm(agen_ld_src_even, src_perm);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef0, vacc0, vacc1, 0, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef2, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef4, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef6, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef0, vacc2, vacc3, 0, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef2, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef4, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef6, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef0, vacc4, vacc5, 0, vacc4, vacc5);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef2, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef4, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef6, vacc4, vacc5, -1, vacc4, vacc5);
vdata12 = dvload_perm(agen_ld_src_odd, src_perm);
vdata13 = dvload_perm(agen_ld_src_even, src_perm);
vdata14 = dvload_perm(agen_ld_src_odd, src_perm);
vdata15 = dvload_perm(agen_ld_src_even, src_perm);
vdata0 = dvload_perm(agen_ld_src_odd, src_perm);
vdata1 = dvload_perm(agen_ld_src_even, src_perm);
vdata2 = dvload_perm(agen_ld_src_odd, src_perm);
vdata3 = dvload_perm(agen_ld_src_even, src_perm);
vdata4 = dvload_perm(agen_ld_src_odd, src_perm);
vdata5 = dvload_perm(agen_ld_src_even, src_perm);
vdata6 = dvload_perm(agen_ld_src_odd, src_perm);
vdata7 = dvload_perm(agen_ld_src_even, src_perm);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef1, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef3, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef5, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef7, vacc0, vacc1, -1, vacc0, vacc1);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef1, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef3, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef5, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef7, vacc2, vacc3, -1, vacc2, vacc3);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef1, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef3, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef5, vacc4, vacc5, -1, vacc4, vacc5);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef7, vacc4, vacc5, -1, vacc4, vacc5);
// Preload the data for the first pass of the following group.
vdata8 = dvload_perm(agen_ld_src_odd, src_perm);
vdata9 = dvload_perm(agen_ld_src_even, src_perm);
vdata10 = dvload_perm(agen_ld_src_odd, src_perm);
vdata11 = dvload_perm(agen_ld_src_even, src_perm);
vdata12 = dvload_perm(agen_ld_src_odd, src_perm);
vdata13 = dvload_perm(agen_ld_src_even, src_perm);
vdata14 = dvload_perm(agen_ld_src_odd, src_perm);
vdata15 = dvload_perm(agen_ld_src_even, src_perm);
vdata0 = dvload_perm(agen_ld_src_odd, src_perm);
vdata1 = dvload_perm(agen_ld_src_even, src_perm);
vdata2 = dvload_perm(agen_ld_src_odd, src_perm);
vdata3 = dvload_perm(agen_ld_src_even, src_perm);
// Store the first half of this group; vacc3..vacc5 follow at the top of the next
// pass or in the epilogue.
vstore_i_hb(vacc0, agen_st_dst);
vstore_i_hb(vacc1, agen_st_dst);
vstore_i_hb(vacc2, agen_st_dst);
}
// ---- Epilogue: identical to the first half of the loop body ----
// Store the pending vacc3..vacc5, then compute and store the final group vacc6..vacc11.
vstore_i_hb(vacc3, agen_st_dst);
vstore_i_hb(vacc4, agen_st_dst);
vstore_i_hb(vacc5, agen_st_dst);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef0, vacc6, vacc7, 0, vacc6, vacc7);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef2, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef4, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef6, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef0, vacc8, vacc9, 0, vacc8, vacc9);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef2, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef4, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef6, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef0, vacc10, vacc11, 0, vacc10, vacc11);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef2, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata0, vdata1, vcoef4, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata2, vdata3, vcoef6, vacc10, vacc11, -1, vacc10, vacc11);
vdata4 = dvload_perm(agen_ld_src_odd, src_perm);
vdata5 = dvload_perm(agen_ld_src_even, src_perm);
vdata6 = dvload_perm(agen_ld_src_odd, src_perm);
vdata7 = dvload_perm(agen_ld_src_even, src_perm);
vdata8 = dvload_perm(agen_ld_src_odd, src_perm);
vdata9 = dvload_perm(agen_ld_src_even, src_perm);
vdata10 = dvload_perm(agen_ld_src_odd, src_perm);
vdata11 = dvload_perm(agen_ld_src_even, src_perm);
vdata12 = dvload_perm(agen_ld_src_odd, src_perm);
vdata13 = dvload_perm(agen_ld_src_even, src_perm);
vdata14 = dvload_perm(agen_ld_src_odd, src_perm);
vdata15 = dvload_perm(agen_ld_src_even, src_perm);
vfilt4x2x2_bbh(vdata4, vdata5, vcoef1, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef3, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef5, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef7, vacc6, vacc7, -1, vacc6, vacc7);
vfilt4x2x2_bbh(vdata6, vdata7, vcoef1, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef3, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef5, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef7, vacc8, vacc9, -1, vacc8, vacc9);
vfilt4x2x2_bbh(vdata8, vdata9, vcoef1, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata10, vdata11, vcoef3, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata12, vdata13, vcoef5, vacc10, vacc11, -1, vacc10, vacc11);
vfilt4x2x2_bbh(vdata14, vdata15, vcoef7, vacc10, vacc11, -1, vacc10, vacc11);
vstore_i_hb(vacc6, agen_st_dst);
vstore_i_hb(vacc7, agen_st_dst);
vstore_i_hb(vacc8, agen_st_dst);
vstore_i_hb(vacc9, agen_st_dst);
vstore_i_hb(vacc10, agen_st_dst);
vstore_i_hb(vacc11, agen_st_dst);
}
else if constexpr (KernelSize == 7 && (std::is_same_v<DataType, int16_t> || std::is_same_v<DataType, uint16_t>))
{
// 7x7 Convolution for 16-bit input
constexpr int8_t m_knl_perm[32] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19};
vcharx knl_perm = zero_extend(*((vuchar *)(m_knl_perm)));
dvshortx vdata0a, vdata0b, vdata0c, vdata1a, vdata1b, vdata1c;
dvshortx vdata2a, vdata2b, vdata2c, vdata3a, vdata3b, vdata3c;
dvshortx vdata4a, vdata4b, vdata4c, vdata5a, vdata5b, vdata5c;
dvshortx vdata6a, vdata6b, vdata6c, vdata7a, vdata7b, vdata7c;
dvshortx vdata8a, vdata8b, vdata8c, vdata9a, vdata9b, vdata9c;
dvshortx vcoef00, vcoef01, vcoef10, vcoef11, vcoef20, vcoef21;
dvshortx vcoef30, vcoef31, vcoef40, vcoef41, vcoef50, vcoef51;
dvshortx vcoef60, vcoef61, vcoef70, vcoef71;
dvintx vacc0_lo, vacc1_lo, vacc0_hi, vacc1_hi;
dvintx vacc2_lo, vacc3_lo, vacc2_hi, vacc3_hi;
vcoef00 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef01 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef10 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef11 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef20 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef21 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef30 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef31 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef40 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef41 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef50 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef51 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef60 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef61 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef70 = dvload_perm_coef(agen_ld_coef, knl_perm);
vcoef71 = dvload_perm_coef(agen_ld_coef, knl_perm);
// Per-iteration operation counts for the loop below:
// 30 loads for 10 src rows -> 15 VLIW
// 64 computes for 4 dst rows -> 32 VLIW -> 8 VLIW per 32 outputs
// 4 stores for 4 dst rows -> 4 VLIW
for (int i = 0; i < niter; i++) chess_loop_range(3, )
chess_prepare_for_pipelining
{
vdata0a = dvload(agen_ld_src_odd);
vdata0b = dvload(agen_ld_src_odd);
vdata0c = dvload(agen_ld_src_odd);
vdata1a = dvload(agen_ld_src_even);
vdata1b = dvload(agen_ld_src_even);
vdata1c = dvload(agen_ld_src_even);
vdata2a = dvload(agen_ld_src_odd);
vdata2b = dvload(agen_ld_src_odd);
vdata2c = dvload(agen_ld_src_odd);
vdata3a = dvload(agen_ld_src_even);
vdata3b = dvload(agen_ld_src_even);
vdata3c = dvload(agen_ld_src_even);
vdata4a = dvload(agen_ld_src_odd);
vdata4b = dvload(agen_ld_src_odd);
vdata4c = dvload(agen_ld_src_odd);
vdata5a = dvload(agen_ld_src_even);
vdata5b = dvload(agen_ld_src_even);
vdata5c = dvload(agen_ld_src_even);
vdata6a = dvload(agen_ld_src_odd);
vdata6b = dvload(agen_ld_src_odd);
vdata6c = dvload(agen_ld_src_odd);
vdata7a = dvload(agen_ld_src_even);
vdata7b = dvload(agen_ld_src_even);
vdata7c = dvload(agen_ld_src_even);
vdata8a = dvload(agen_ld_src_odd);
vdata8b = dvload(agen_ld_src_odd);
vdata8c = dvload(agen_ld_src_odd);
vdata9a = dvload(agen_ld_src_even);
vdata9b = dvload(agen_ld_src_even);
vdata9c = dvload(agen_ld_src_even);
vfilt4x2_hhw(vdata0a.lo, vdata0b.lo, vcoef00, vacc0_lo, vacc1_lo, 0, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata0b.lo, vdata0c.lo, vcoef01, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata1a.lo, vdata1b.lo, vcoef10, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata1b.lo, vdata1c.lo, vcoef11, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata2a.lo, vdata2b.lo, vcoef20, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata2b.lo, vdata2c.lo, vcoef21, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata3a.lo, vdata3b.lo, vcoef30, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata3b.lo, vdata3c.lo, vcoef31, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata4a.lo, vdata4b.lo, vcoef40, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata4b.lo, vdata4c.lo, vcoef41, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata5a.lo, vdata5b.lo, vcoef50, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata5b.lo, vdata5c.lo, vcoef51, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata6a.lo, vdata6b.lo, vcoef60, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata6b.lo, vdata6c.lo, vcoef61, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata7a.lo, vdata7b.lo, vcoef70, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata7b.lo, vdata7c.lo, vcoef71, vacc0_lo, vacc1_lo, -1, vacc0_lo, vacc1_lo);
vfilt4x2_hhw(vdata0a.hi, vdata0b.hi, vcoef00, vacc0_hi, vacc1_hi, 0, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata0b.hi, vdata0c.hi, vcoef01, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata1a.hi, vdata1b.hi, vcoef10, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata1b.hi, vdata1c.hi, vcoef11, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata2a.hi, vdata2b.hi, vcoef20, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata2b.hi, vdata2c.hi, vcoef21, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata3a.hi, vdata3b.hi, vcoef30, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata3b.hi, vdata3c.hi, vcoef31, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata4a.hi, vdata4b.hi, vcoef40, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata4b.hi, vdata4c.hi, vcoef41, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata5a.hi, vdata5b.hi, vcoef50, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata5b.hi, vdata5c.hi, vcoef51, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata6a.hi, vdata6b.hi, vcoef60, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata6b.hi, vdata6c.hi, vcoef61, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata7a.hi, vdata7b.hi, vcoef70, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata7b.hi, vdata7c.hi, vcoef71, vacc0_hi, vacc1_hi, -1, vacc0_hi, vacc1_hi);
vfilt4x2_hhw(vdata2a.lo, vdata2b.lo, vcoef00, vacc2_lo, vacc3_lo, 0, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata2b.lo, vdata2c.lo, vcoef01, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata3a.lo, vdata3b.lo, vcoef10, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata3b.lo, vdata3c.lo, vcoef11, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata4a.lo, vdata4b.lo, vcoef20, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata4b.lo, vdata4c.lo, vcoef21, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata5a.lo, vdata5b.lo, vcoef30, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata5b.lo, vdata5c.lo, vcoef31, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata6a.lo, vdata6b.lo, vcoef40, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata6b.lo, vdata6c.lo, vcoef41, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata7a.lo, vdata7b.lo, vcoef50, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata7b.lo, vdata7c.lo, vcoef51, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata8a.lo, vdata8b.lo, vcoef60, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata8b.lo, vdata8c.lo, vcoef61, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata9a.lo, vdata9b.lo, vcoef70, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata9b.lo, vdata9c.lo, vcoef71, vacc2_lo, vacc3_lo, -1, vacc2_lo, vacc3_lo);
vfilt4x2_hhw(vdata2a.hi, vdata2b.hi, vcoef00, vacc2_hi, vacc3_hi, 0, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata2b.hi, vdata2c.hi, vcoef01, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata3a.hi, vdata3b.hi, vcoef10, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata3b.hi, vdata3c.hi, vcoef11, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata4a.hi, vdata4b.hi, vcoef20, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata4b.hi, vdata4c.hi, vcoef21, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata5a.hi, vdata5b.hi, vcoef30, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata5b.hi, vdata5c.hi, vcoef31, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata6a.hi, vdata6b.hi, vcoef40, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata6b.hi, vdata6c.hi, vcoef41, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata7a.hi, vdata7b.hi, vcoef50, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata7b.hi, vdata7c.hi, vcoef51, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata8a.hi, vdata8b.hi, vcoef60, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata8b.hi, vdata8c.hi, vcoef61, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata9a.hi, vdata9b.hi, vcoef70, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vfilt4x2_hhw(vdata9b.hi, vdata9c.hi, vcoef71, vacc2_hi, vacc3_hi, -1, vacc2_hi, vacc3_hi);
vstore_i2(vacc0_lo, vacc0_hi, agen_st_dst);
vstore_i2(vacc1_lo, vacc1_hi, agen_st_dst);
vstore_i2(vacc2_lo, vacc2_hi, agen_st_dst);
vstore_i2(vacc3_lo, vacc3_hi, agen_st_dst);
}
}
else
{
static_assert(std::is_same_v<DataType, void>, "Unsupported combination of KernelSize and DataType");
}
}
};
} // namespace pvaApl
#endif /* PVA_APL_CONV2D_VPU_HPP */