vp9[loongarch]: Optimize fdct4x4/8x8_lsx
1. vpx_fdct4x4_lsx
2. vpx_fdct8x8_lsx

Bug: webm:1755
Change-Id: If283fc08f9bedcbecd2c4052adb210f8fe00d4f0
This commit is contained in:
@@ -587,7 +587,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT,
|
||||
#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
|
||||
|
||||
#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
|
||||
static const FuncInfo dct_lsx_func_info[2] = {
|
||||
static const FuncInfo dct_lsx_func_info[4] = {
|
||||
{ &fdct_wrapper<vpx_fdct4x4_lsx>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
|
||||
{ &fdct_wrapper<vpx_fdct8x8_lsx>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
|
||||
{ &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
|
||||
16, 1 },
|
||||
{ &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_lsx>,
|
||||
@@ -596,7 +598,7 @@ static const FuncInfo dct_lsx_func_info[2] = {
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
LSX, TransDCT,
|
||||
::testing::Combine(::testing::Range(0, 2),
|
||||
::testing::Combine(::testing::Range(0, 4),
|
||||
::testing::Values(dct_lsx_func_info),
|
||||
::testing::Values(0), ::testing::Values(VPX_BITS_8)));
|
||||
#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
@@ -768,4 +768,11 @@ INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT,
|
||||
&vpx_idct8x8_64_add_vsx,
|
||||
0, VPX_BITS_8)));
|
||||
#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
// Instantiates the FwdTrans8x8DCT suite for LSX: the forward transform under
// test is vpx_fdct8x8_lsx, paired with the C inverse vpx_idct8x8_64_add_c and
// VPX_BITS_8. Only built when HAVE_LSX is set and both high bit depth and
// hardware emulation are disabled.
#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT,
|
||||
::testing::Values(make_tuple(&vpx_fdct8x8_lsx,
|
||||
&vpx_idct8x8_64_add_c, 0,
|
||||
VPX_BITS_8)));
|
||||
#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
} // namespace
|
||||
|
||||
@@ -11,6 +11,20 @@
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
|
||||
|
||||
/*
 * Transposes 4x4 blocks of 16-bit elements held in four LSX vectors.
 *
 * Rows _in0.._in3 are interleaved pairwise (0 with 2, 1 with 3), the results
 * are interleaved again, and the 64-bit halves are then picked into
 * _out0.._out3. The low and the high four halfwords of the inputs are each
 * handled as a separate 4x4 block.
 *
 * All inputs are read into temporaries before any output is written, so the
 * same variables may be passed as inputs and outputs.
 */
#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m128i _lo02, _lo13, _hi02, _hi13; \
    __m128i _q0, _q1, _q2, _q3; \
    \
    /* First pass: interleave rows 0/2 and rows 1/3. */ \
    _lo02 = __lsx_vilvl_h(_in2, _in0); \
    _lo13 = __lsx_vilvl_h(_in3, _in1); \
    _hi02 = __lsx_vilvh_h(_in2, _in0); \
    _hi13 = __lsx_vilvh_h(_in3, _in1); \
    \
    /* Second pass: interleave the two partial results. */ \
    _q0 = __lsx_vilvl_h(_lo13, _lo02); \
    _q1 = __lsx_vilvh_h(_lo13, _lo02); \
    _q2 = __lsx_vilvl_h(_hi13, _hi02); \
    _q3 = __lsx_vilvh_h(_hi13, _hi02); \
    \
    /* Gather even / odd 64-bit halves into the output rows. */ \
    _out0 = __lsx_vpickev_d(_q2, _q0); \
    _out2 = __lsx_vpickev_d(_q3, _q1); \
    _out1 = __lsx_vpickod_d(_q2, _q0); \
    _out3 = __lsx_vpickod_d(_q3, _q1); \
  }
|
||||
|
||||
#if !CONFIG_VP9_HIGHBITDEPTH
|
||||
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
|
||||
int32_t src_stride) {
|
||||
@@ -240,6 +254,84 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
|
||||
__lsx_vst(in7, output, 240);
|
||||
}
|
||||
|
||||
void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
|
||||
int32_t src_stride) {
|
||||
__m128i in0, in1, in2, in3;
|
||||
|
||||
int32_t src_stride2 = src_stride << 1;
|
||||
int32_t src_stride4 = src_stride2 << 1;
|
||||
int32_t src_stride6 = src_stride4 + src_stride2;
|
||||
|
||||
in0 = __lsx_vld(input, 0);
|
||||
DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
|
||||
in3 = __lsx_vldx(input, src_stride6);
|
||||
|
||||
/* fdct4 pre-process */
|
||||
{
|
||||
__m128i vec, mask;
|
||||
__m128i zero = __lsx_vldi(0);
|
||||
|
||||
mask = __lsx_vinsgr2vr_b(zero, 1, 0);
|
||||
DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
|
||||
in3);
|
||||
vec = __lsx_vseqi_h(in0, 0);
|
||||
vec = __lsx_vxori_b(vec, 255);
|
||||
vec = __lsx_vand_v(mask, vec);
|
||||
in0 = __lsx_vadd_h(in0, vec);
|
||||
}
|
||||
|
||||
VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
|
||||
LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
|
||||
VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
|
||||
LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
|
||||
DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
|
||||
DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
|
||||
DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
|
||||
__lsx_vst(in0, output, 0);
|
||||
__lsx_vst(in2, output, 16);
|
||||
}
|
||||
|
||||
void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
|
||||
int32_t src_stride) {
|
||||
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
|
||||
int32_t src_stride2 = src_stride << 1;
|
||||
int32_t src_stride4 = src_stride2 << 1;
|
||||
int32_t src_stride6 = src_stride4 + src_stride2;
|
||||
int16_t *input_tmp = (int16_t *)input;
|
||||
|
||||
in0 = __lsx_vld(input_tmp, 0);
|
||||
DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
|
||||
in2);
|
||||
in3 = __lsx_vldx(input_tmp, src_stride6);
|
||||
input_tmp += src_stride4;
|
||||
in4 = __lsx_vld(input_tmp, 0);
|
||||
DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
|
||||
in6);
|
||||
in7 = __lsx_vldx(input_tmp, src_stride6);
|
||||
|
||||
DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
|
||||
DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
|
||||
|
||||
VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
|
||||
in5, in6, in7);
|
||||
LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
|
||||
in4, in5, in6, in7);
|
||||
VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
|
||||
in5, in6, in7);
|
||||
LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
|
||||
in4, in5, in6, in7);
|
||||
SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
|
||||
|
||||
__lsx_vst(in0, output, 0);
|
||||
__lsx_vst(in1, output, 16);
|
||||
__lsx_vst(in2, output, 32);
|
||||
__lsx_vst(in3, output, 48);
|
||||
__lsx_vst(in4, output, 64);
|
||||
__lsx_vst(in5, output, 80);
|
||||
__lsx_vst(in6, output, 96);
|
||||
__lsx_vst(in7, output, 112);
|
||||
}
|
||||
|
||||
void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
|
||||
int32_t src_stride) {
|
||||
int32_t i;
|
||||
|
||||
@@ -14,6 +14,105 @@
|
||||
#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
|
||||
#include "vpx_dsp/txfm_common.h"
|
||||
|
||||
/*
 * One 1-D pass of the 4-point forward DCT on four LSX vectors of 16-bit lanes.
 *
 * in0..in3 go through LSX_BUTTERFLY_4_H, the interleaved results are
 * multiplied against constant pairs taken from coeff_m with __lsx_vdp2_w_h,
 * and the products are shifted right by DCT_CONST_BITS and narrowed back to
 * 16 bits by __lsx_vssrarni_h_w. out0..out3 are assigned only by the final
 * statement; callers in this change pass the same variables as inputs and
 * outputs.
 *
 * coeff_m halfwords, lowest first (assuming little-endian lane order):
 *   11585, -11585, 15137, 6270, -15137, 0, 0, 0
 * NOTE(review): these look like +/-cospi_16_64, +/-cospi_8_64 and cospi_24_64
 * from vpx_dsp/txfm_common.h - confirm.
 *
 * cnst2_m is rebuilt twice and cnst3_m is shared between both uses, so the
 * statement order below must be kept.
 */
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
|
||||
{ \
|
||||
__m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
|
||||
__m128i vec0_m, vec1_m, vec2_m, vec3_m; \
|
||||
__m128i vec4_m, vec5_m, vec6_m, vec7_m; \
|
||||
__m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \
|
||||
\
|
||||
LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
|
||||
DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
|
||||
DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \
|
||||
cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
|
||||
vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \
|
||||
DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \
|
||||
cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \
|
||||
vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
|
||||
\
|
||||
vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \
|
||||
cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \
|
||||
cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \
|
||||
vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
|
||||
\
|
||||
DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \
|
||||
vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
|
||||
vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \
|
||||
}
|
||||
|
||||
/*
 * One 1-D pass of the 8-point forward DCT on eight LSX vectors of 16-bit lanes.
 *
 * Stage 1 runs LSX_BUTTERFLY_8_H on in0..in7 and LSX_BUTTERFLY_4_H on the
 * first four results, then produces out0, out2, out4 and out6. Stages 2-4
 * work on the remaining butterfly results (s4_m..s7_m) and produce out1,
 * out3, out5 and out7. Each output comes from DOT_SHIFT_RIGHT_PCK_H applied
 * to an interleaved pair of vectors and a packed pair of constants.
 *
 * coeff_m halfwords, lowest first (assuming little-endian lane order):
 *   11585, -11585, 15137, 6270, 16069, 3196, 13623, 9102
 * NOTE(review): these look like the cospi_*_64 constants from
 * vpx_dsp/txfm_common.h - confirm.
 *
 * The s*_m and x*_m temporaries are reused across stages (stage 2 still reads
 * the x0_m / x1_m constants built in stage 1), so the statement order below
 * must be kept. Callers in this change pass the same variables as inputs and
 * outputs.
 */
#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
|
||||
out3, out4, out5, out6, out7) \
|
||||
{ \
|
||||
__m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
|
||||
__m128i s7_m, x0_m, x1_m, x2_m, x3_m; \
|
||||
__m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
|
||||
\
|
||||
/* FDCT stage1 */ \
|
||||
LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
|
||||
s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
|
||||
LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
|
||||
DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
|
||||
DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
|
||||
DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
|
||||
x1_m = __lsx_vpackev_h(x1_m, x0_m); \
|
||||
DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
|
||||
\
|
||||
DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
|
||||
x2_m = __lsx_vneg_h(x2_m); \
|
||||
x2_m = __lsx_vpackev_h(x3_m, x2_m); \
|
||||
DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
|
||||
\
|
||||
DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
|
||||
x2_m = __lsx_vreplvei_h(coeff_m, 2); \
|
||||
x2_m = __lsx_vpackev_h(x2_m, x3_m); \
|
||||
DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
|
||||
\
|
||||
/* stage2 */ \
|
||||
s1_m = __lsx_vilvl_h(s5_m, s6_m); \
|
||||
s0_m = __lsx_vilvh_h(s5_m, s6_m); \
|
||||
\
|
||||
DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
|
||||
DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
|
||||
\
|
||||
/* stage3 */ \
|
||||
LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
|
||||
\
|
||||
/* stage4 */ \
|
||||
DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
|
||||
DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
|
||||
\
|
||||
DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
|
||||
x1_m = __lsx_vpackev_h(x0_m, x1_m); \
|
||||
DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
|
||||
\
|
||||
DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
|
||||
x2_m = __lsx_vpackev_h(x3_m, x2_m); \
|
||||
DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
|
||||
\
|
||||
x1_m = __lsx_vreplvei_h(coeff_m, 5); \
|
||||
x0_m = __lsx_vneg_h(x0_m); \
|
||||
x0_m = __lsx_vpackev_h(x1_m, x0_m); \
|
||||
DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
|
||||
x2_m = __lsx_vreplvei_h(coeff_m, 6); \
|
||||
x3_m = __lsx_vneg_h(x3_m); \
|
||||
x2_m = __lsx_vpackev_h(x2_m, x3_m); \
|
||||
DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
|
||||
}
|
||||
|
||||
/*
 * In-place scaling of eight vectors of 16-bit lanes.
 *
 * For each argument x, a vector holding x's top bit per lane (logical shift
 * right by 15) is formed, and x is then replaced by __lsx_vavg_h of that
 * vector and x. All eight shifts are evaluated before the first average is
 * written back.
 * NOTE(review): with a truncating signed average this is x / 2 rounded
 * toward zero - confirm against the __lsx_vavg_h definition.
 */
#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
  { \
    __m128i sign0_m, sign1_m, sign2_m, sign3_m; \
    __m128i sign4_m, sign5_m, sign6_m, sign7_m; \
    \
    /* Top bit of every lane: 1 for negative values, 0 otherwise. */ \
    sign0_m = __lsx_vsrli_h(in0, 15); \
    sign1_m = __lsx_vsrli_h(in1, 15); \
    sign2_m = __lsx_vsrli_h(in2, 15); \
    sign3_m = __lsx_vsrli_h(in3, 15); \
    sign4_m = __lsx_vsrli_h(in4, 15); \
    sign5_m = __lsx_vsrli_h(in5, 15); \
    sign6_m = __lsx_vsrli_h(in6, 15); \
    sign7_m = __lsx_vsrli_h(in7, 15); \
    \
    /* Average each value with its own sign bit. */ \
    in0 = __lsx_vavg_h(sign0_m, in0); \
    in1 = __lsx_vavg_h(sign1_m, in1); \
    in2 = __lsx_vavg_h(sign2_m, in2); \
    in3 = __lsx_vavg_h(sign3_m, in3); \
    in4 = __lsx_vavg_h(sign4_m, in4); \
    in5 = __lsx_vavg_h(sign5_m, in5); \
    in6 = __lsx_vavg_h(sign6_m, in6); \
    in7 = __lsx_vavg_h(sign7_m, in7); \
  }
|
||||
|
||||
#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
|
||||
{ \
|
||||
__m128i tp0_m, tp1_m; \
|
||||
|
||||
@@ -573,13 +573,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
} else {
|
||||
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vpx_fdct4x4 neon sse2 msa/;
|
||||
specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
|
||||
|
||||
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vpx_fdct4x4_1 sse2 neon/;
|
||||
|
||||
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
|
||||
specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64";
|
||||
|
||||
add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
|
||||
|
||||
Reference in New Issue
Block a user