vp9[loongarch]: Optimize fdct4x4/8x8_lsx

Add LSX implementations of:
1. vpx_fdct4x4_lsx
2. vpx_fdct8x8_lsx

Bug: webm:1755
Change-Id: If283fc08f9bedcbecd2c4052adb210f8fe00d4f0
@@ -587,7 +587,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT,
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
 
 #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
-static const FuncInfo dct_lsx_func_info[2] = {
+static const FuncInfo dct_lsx_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_lsx>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+  { &fdct_wrapper<vpx_fdct8x8_lsx>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
   { &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
     16, 1 },
   { &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_lsx>,
@@ -596,7 +598,7 @@ static const FuncInfo dct_lsx_func_info[2] = {
 
 INSTANTIATE_TEST_SUITE_P(
     LSX, TransDCT,
-    ::testing::Combine(::testing::Range(0, 2),
+    ::testing::Combine(::testing::Range(0, 4),
                        ::testing::Values(dct_lsx_func_info),
                        ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
 #endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
@@ -768,4 +768,11 @@ INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT,
                                          &vpx_idct8x8_64_add_vsx,
                                          0, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_lsx,
+                                                      &vpx_idct8x8_64_add_c, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
@@ -11,6 +11,20 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
 
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  { \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \
+ \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \
+    _t0 = __lsx_vilvl_h(_s1, _s0); \
+    _t1 = __lsx_vilvh_h(_s1, _s0); \
+    _t2 = __lsx_vilvl_h(_s3, _s2); \
+    _t3 = __lsx_vilvh_h(_s3, _s2); \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \
+  }
+
 #if !CONFIG_VP9_HIGHBITDEPTH
 void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                         int32_t src_stride) {
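Note (not part of the patch): LSX_TRANSPOSE4x4_H transposes a 4x4 tile of int16 coefficients spread across the four vector registers; the vilvl/vilvh interleaves gather matching lanes of the rows, and the vpickev/vpickod steps split them back out as columns. A scalar sketch of the net data movement, assuming the 4x4 block sits in the low four lanes of each register:

/* Scalar sketch only -- illustrates the effect of LSX_TRANSPOSE4x4_H,
 * not its implementation. */
static void transpose4x4_sketch(const int16_t in[4][4], int16_t out[4][4]) {
  int i, j;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) {
      out[j][i] = in[i][j]; /* row i of the input becomes column i */
    }
  }
}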
@@ -240,6 +254,84 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
   __lsx_vst(in7, output, 240);
 }
 
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+  in3 = __lsx_vldx(input, src_stride6);
+
+  /* fdct4 pre-process */
+  {
+    __m128i vec, mask;
+    __m128i zero = __lsx_vldi(0);
+
+    mask = __lsx_vinsgr2vr_b(zero, 1, 0);
+    DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
+              in3);
+    vec = __lsx_vseqi_h(in0, 0);
+    vec = __lsx_vxori_b(vec, 255);
+    vec = __lsx_vand_v(mask, vec);
+    in0 = __lsx_vadd_h(in0, vec);
+  }
+
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in2, output, 16);
+}
+
+void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+  int16_t *input_tmp = (int16_t *)input;
+
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
+            in2);
+  in3 = __lsx_vldx(input_tmp, src_stride6);
+  input_tmp += src_stride4;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
+            in6);
+  in7 = __lsx_vldx(input_tmp, src_stride6);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+            in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+            in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in1, output, 16);
+  __lsx_vst(in2, output, 32);
+  __lsx_vst(in3, output, 48);
+  __lsx_vst(in4, output, 64);
+  __lsx_vst(in5, output, 80);
+  __lsx_vst(in6, output, 96);
+  __lsx_vst(in7, output, 112);
+}
+
 void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
                        int32_t src_stride) {
   int32_t i;
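A reading aid, not part of the patch: around its two VP9_FDCT4 passes, vpx_fdct4x4_lsx pre-scales the residuals by 16 (vslli_h by 4), bumps the very first sample by 1 when it is nonzero (the vinsgr2vr/vseqi/vand mask trick), and rounds the final coefficients with (x + 1) >> 2 (vaddi_hu, then vsrai_h). vpx_fdct8x8_lsx follows the same shape but pre-scales by 4 and halves the result toward zero via SRLI_AVE_S_4V_H (see the note after the header hunk below). A scalar sketch of just that wrapper logic, with the transform passes elided:

/* Scalar sketch of the scaling/rounding in vpx_fdct4x4_lsx (illustrative only;
 * the two 1-D FDCT4 passes and the transposes are elided). */
static void fdct4x4_scaling_sketch(const int16_t *input, int16_t *output,
                                   int stride) {
  int16_t buf[16];
  int i;
  for (i = 0; i < 16; ++i)
    buf[i] = (int16_t)(input[(i / 4) * stride + (i % 4)] * 16);
  if (buf[0]) buf[0] += 1; /* same effect as the vector mask trick */
  /* ... column FDCT4, transpose, row FDCT4, transpose ... */
  for (i = 0; i < 16; ++i) output[i] = (int16_t)((buf[i] + 1) >> 2);
}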
@@ -14,6 +14,105 @@
 #include "vpx_dsp/loongarch/txfm_macros_lsx.h"
 #include "vpx_dsp/txfm_common.h"
 
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
+  { \
+    __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
+    __m128i vec4_m, vec5_m, vec6_m, vec7_m; \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \
+ \
+    LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+    DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+    vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \
+    cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \
+    vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+    vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \
+    cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \
+    cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \
+    vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+    DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \
+              vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
+              vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \
+  }
+
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+                  out3, out4, out5, out6, out7) \
+  { \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
+    __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
+ \
+    /* FDCT stage1 */ \
+    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
+    x1_m = __lsx_vpackev_h(x1_m, x0_m); \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
+ \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
+    x2_m = __lsx_vneg_h(x2_m); \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
+ \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
+    x2_m = __lsx_vreplvei_h(coeff_m, 2); \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
+ \
+    /* stage2 */ \
+    s1_m = __lsx_vilvl_h(s5_m, s6_m); \
+    s0_m = __lsx_vilvh_h(s5_m, s6_m); \
+ \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
+ \
+    /* stage3 */ \
+    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+    /* stage4 */ \
+    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
+    x1_m = __lsx_vpackev_h(x0_m, x1_m); \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
+ \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
+ \
+    x1_m = __lsx_vreplvei_h(coeff_m, 5); \
+    x0_m = __lsx_vneg_h(x0_m); \
+    x0_m = __lsx_vpackev_h(x1_m, x0_m); \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
+    x2_m = __lsx_vreplvei_h(coeff_m, 6); \
+    x3_m = __lsx_vneg_h(x3_m); \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
+  }
+
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
+  { \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+    DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \
+              vec1_m, vec2_m, vec3_m); \
+    DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \
+              vec5_m, vec6_m, vec7_m); \
+    DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \
+              in3, in0, in1, in2, in3); \
+    DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \
+              in7, in4, in5, in6, in7); \
+  }
+
 #define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
   { \
     __m128i tp0_m, tp1_m; \
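A note on SRLI_AVE_S_4V_H, not part of the patch: __lsx_vsrli_h by 15 leaves only the sign bit of each 16-bit lane (1 for negative values, 0 otherwise), and __lsx_vavg_h then averages that with the original lane, i.e. (in + sign) >> 1. That is a divide-by-two that rounds toward zero, sketched per lane below:

#include <stdint.h>

/* Scalar sketch (illustrative only) of the per-lane operation applied by
 * SRLI_AVE_S_4V_H to all eight vectors. */
static int16_t halve_toward_zero(int16_t v) {
  const int16_t sign = (int16_t)((uint16_t)v >> 15); /* 1 if v < 0, else 0 */
  return (int16_t)((v + sign) >> 1);                 /* equivalent to v / 2 */
}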
@@ -573,13 +573,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 } else {
   add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4 neon sse2 msa/;
+  specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
 
   add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct4x4_1 sse2 neon/;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+  specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
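With the lsx specializations registered here, the generated dispatch header (vpx_dsp_rtcd.h) can route the generic entry points to the new kernels; callers do not change. A hedged usage sketch, assuming a non-high-bit-depth LoongArch build where the LSX variant is selected (forward_transform_block is a hypothetical helper, not part of libvpx):

#include "./vpx_dsp_rtcd.h" /* generated from vpx_dsp_rtcd_defs.pl */

/* vpx_fdct8x8() is expected to resolve to vpx_fdct8x8_lsx when the LSX
 * specialization is picked, and to vpx_fdct8x8_c otherwise. */
static void forward_transform_block(const int16_t *residual, int stride,
                                    tran_low_t *coeffs) {
  vpx_fdct8x8(residual, coeffs, stride);
}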