diff --git a/test/avg_test.cc b/test/avg_test.cc
index 612aff018..c570bbc22 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -234,11 +234,11 @@ class SatdTest : public ::testing::Test,
 
 typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
                                   const tran_low_t *dqcoeff, int block_size);
-typedef std::tr1::tuple<int, BlockErrorFunc> BlockErrorTestParam;
+typedef std::tr1::tuple<int, BlockErrorFunc> BlockErrorTestFPParam;
 
-class BlockErrorTest
+class BlockErrorTestFP
     : public ::testing::Test,
-      public ::testing::WithParamInterface<BlockErrorTestParam> {
+      public ::testing::WithParamInterface<BlockErrorTestFPParam> {
  protected:
   virtual void SetUp() {
     txfm_size_ = GET_PARAM(0);
@@ -367,21 +367,21 @@ TEST_P(SatdTest, Random) {
   Check(expected);
 }
 
-TEST_P(BlockErrorTest, MinValue) {
+TEST_P(BlockErrorTestFP, MinValue) {
   const int64_t kMin = -32640;
   const int64_t expected = kMin * kMin * txfm_size_;
   FillConstant(kMin, 0);
   Check(expected);
 }
 
-TEST_P(BlockErrorTest, MaxValue) {
+TEST_P(BlockErrorTestFP, MaxValue) {
   const int64_t kMax = 32640;
   const int64_t expected = kMax * kMax * txfm_size_;
   FillConstant(kMax, 0);
   Check(expected);
 }
 
-TEST_P(BlockErrorTest, Random) {
+TEST_P(BlockErrorTestFP, Random) {
   int64_t expected;
   switch (txfm_size_) {
     case 16: expected = 2051681432; break;
@@ -410,7 +410,7 @@ INSTANTIATE_TEST_CASE_P(C, SatdTest,
                                           make_tuple(1024, &vpx_satd_c)));
 
 INSTANTIATE_TEST_CASE_P(
-    C, BlockErrorTest,
+    C, BlockErrorTestFP,
     ::testing::Values(make_tuple(16, &vp9_block_error_fp_c),
                       make_tuple(64, &vp9_block_error_fp_c),
                       make_tuple(256, &vp9_block_error_fp_c),
@@ -447,7 +447,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
                                           make_tuple(1024, &vpx_satd_sse2)));
 
 INSTANTIATE_TEST_CASE_P(
-    SSE2, BlockErrorTest,
+    SSE2, BlockErrorTestFP,
     ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2),
                       make_tuple(64, &vp9_block_error_fp_sse2),
                       make_tuple(256, &vp9_block_error_fp_sse2),
@@ -488,7 +488,7 @@ INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
 // in place.
 #if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
-    NEON, BlockErrorTest,
+    NEON, BlockErrorTestFP,
     ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
                       make_tuple(64, &vp9_block_error_fp_neon),
                       make_tuple(256, &vp9_block_error_fp_neon),
diff --git a/test/test.mk b/test/test.mk
index c5ed65385..6504edbe9 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -157,7 +157,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
diff --git a/test/vp9_error_block_test.cc b/test/vp9_block_error_test.cc
similarity index 72%
rename from test/vp9_error_block_test.cc
rename to test/vp9_block_error_test.cc
index 74436c09e..0b4d1df99 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_block_error_test.cc
@@ -23,36 +23,36 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
 const int kNumIterations = 1000;
 
-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff,
+                                     const tran_low_t *dqcoeff,
+                                     intptr_t block_size, int64_t *ssz,
+                                     int bps);
+
+typedef std::tr1::tuple<HBDBlockErrorFunc, HBDBlockErrorFunc, vpx_bit_depth_t>
+    BlockErrorParam;
+
+typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
                                   const tran_low_t *dqcoeff,
-                                  intptr_t block_size, int64_t *ssz, int bps);
+                                  intptr_t block_size, int64_t *ssz);
 
-typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
-    ErrorBlockParam;
-
-// wrapper for 8-bit block error functions without a 'bps' param.
-typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff,
-                                        const tran_low_t *dqcoeff,
-                                        intptr_t block_size, int64_t *ssz);
-template <HighBdBlockError8bit fn>
-int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff,
-                                    const tran_low_t *dqcoeff,
-                                    intptr_t block_size, int64_t *ssz,
-                                    int bps) {
-  EXPECT_EQ(8, bps);
+template <BlockErrorFunc fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+                              const tran_low_t *dqcoeff, intptr_t block_size,
+                              int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
   return fn(coeff, dqcoeff, block_size, ssz);
 }
 
-class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
+class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
  public:
-  virtual ~ErrorBlockTest() {}
+  virtual ~BlockErrorTest() {}
   virtual void SetUp() {
     error_block_op_ = GET_PARAM(0);
     ref_error_block_op_ = GET_PARAM(1);
@@ -63,11 +63,11 @@ class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
 
  protected:
   vpx_bit_depth_t bit_depth_;
-  ErrorBlockFunc error_block_op_;
-  ErrorBlockFunc ref_error_block_op_;
+  HBDBlockErrorFunc error_block_op_;
+  HBDBlockErrorFunc ref_error_block_op_;
 };
 
-TEST_P(ErrorBlockTest, OperationCheck) {
+TEST_P(BlockErrorTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
@@ -110,7 +110,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
       << "First failed at test case " << first_failure;
 }
 
-TEST_P(ErrorBlockTest, ExtremeValues) {
+TEST_P(BlockErrorTest, ExtremeValues) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
@@ -171,29 +171,28 @@
 using std::tr1::make_tuple;
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, ErrorBlockTest,
-    ::testing::Values(
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_10),
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_12),
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_8),
-        make_tuple(
-            &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_sse2>,
-            &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
-            VPX_BITS_8)));
+const BlockErrorParam sse2_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_10),
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_12),
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_8),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&BlockError8BitWrapper<vp9_block_error_sse2>,
+             &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest,
+                        ::testing::ValuesIn(sse2_block_error_tests));
 #endif  // HAVE_SSE2
 
-#if HAVE_AVX
+#if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
-    AVX, ErrorBlockTest,
-    ::testing::Values(make_tuple(
-        &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_avx>,
-        &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
-        VPX_BITS_8)));
-#endif  // HAVE_AVX
-
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+    AVX2, BlockErrorTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_avx2>,
+                                 &BlockError8BitWrapper<vp9_block_error_c>,
+                                 VPX_BITS_8)));
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 57af79d5b..77bebc7b9 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -130,9 +130,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp9_highbd_block_error sse2/;
 
-  add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
-
   add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
   specialize qw/vp9_block_error_fp sse2/;
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0500e6025..1b82b29d4 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -284,22 +284,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
   return error;
 }
 
-int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
-                                      const tran_low_t *dqcoeff,
-                                      intptr_t block_size, int64_t *ssz) {
-  // Note that the C versions of these 2 functions (vp9_block_error and
-  // vp9_highbd_block_error_8bit are the same, but the optimized assembly
-  // routines are not compatible in the non high bitdepth configuration, so
-  // they still cannot share the same name.
-  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
-}
-
 static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
                                                const tran_low_t *dqcoeff,
                                                intptr_t block_size,
                                                int64_t *ssz, int bd) {
   if (bd == 8) {
-    return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+    return vp9_block_error(coeff, dqcoeff, block_size, ssz);
   } else {
     return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
   }
@@ -1130,16 +1120,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
           ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                so->neighbors, cpi->sf.use_fast_coef_costing);
           tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
-#if CONFIG_VP9_HIGHBITDEPTH
-          distortion +=
-              vp9_highbd_block_error_8bit(
-                  coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >>
-              2;
-#else
           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >>
                         2;
-#endif
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
diff --git a/vp9/encoder/x86/vp9_highbd_error_avx.asm b/vp9/encoder/x86/vp9_highbd_error_avx.asm
deleted file mode 100644
index e476323e1..000000000
--- a/vp9/encoder/x86/vp9_highbd_error_avx.asm
+++ /dev/null
@@ -1,261 +0,0 @@
-;
-;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM avx
-cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
-  vzeroupper
-
-  ; If only one iteration is required, then handle this as a special case.
-  ; It is the most frequent case, so we can have a significant gain here
-  ; by not setting up a loop and accumulators.
-  cmp       sizeq, 16
-  jne .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Common case of size == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  ; Load input vectors
-  mova      xm0, [dqcq]
-  packssdw  xm0, [dqcq+16]
-  mova      xm2, [uqcq]
-  packssdw  xm2, [uqcq+16]
-
-  mova      xm1, [dqcq+32]
-  packssdw  xm1, [dqcq+48]
-  mova      xm3, [uqcq+32]
-  packssdw  xm3, [uqcq+48]
-
-  ; Compute the errors.
-  psubw     xm0, xm2
-  psubw     xm1, xm3
-
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  pmaddwd   xm2, xm2
-  pmaddwd   xm3, xm3
-
-  pmaddwd   xm0, xm0
-  pmaddwd   xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd     xm2, xm3
-  paddd     xm0, xm1
-
-  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
-  pxor      xm5, xm5
-
-  pblendw   xm3, xm5, xm2, 0x33  ; Zero extended low of a pair of 32 bits
-  psrlq     xm2, 32              ; Zero extended high of a pair of 32 bits
-
-  pblendw   xm1, xm5, xm0, 0x33  ; Zero extended low of a pair of 32 bits
-  psrlq     xm0, 32              ; Zero extended high of a pair of 32 bits
-
-  paddq     xm2, xm3
-  paddq     xm0, xm1
-
-  psrldq    xm3, xm2, 8
-  psrldq    xm1, xm0, 8
-
-  paddq     xm2, xm3
-  paddq     xm0, xm1
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm0
-  movq      [sszq], xm2
-%else
-  movd      eax, xm0
-  pextrd    edx, xm0, 1
-  movq      [sszd], xm2
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, speculative low precision
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ALIGN 16
-.generic:
-  pxor      xm4, xm4  ; sse accumulator
-  pxor      xm5, xm5  ; overflow detection register for xm4
-  pxor      xm6, xm6  ; ssz accumulator
-  pxor      xm7, xm7  ; overflow detection register for xm6
-  lea       uqcq, [uqcq+sizeq*4]
-  lea       dqcq, [dqcq+sizeq*4]
-  neg       sizeq
-
-  ; Push the negative size as the high precision code might need it
-  push      sizeq
-
-.loop:
-  ; Load input vectors
-  mova      xm0, [dqcq+sizeq*4]
-  packssdw  xm0, [dqcq+sizeq*4+16]
-  mova      xm2, [uqcq+sizeq*4]
-  packssdw  xm2, [uqcq+sizeq*4+16]
-
-  mova      xm1, [dqcq+sizeq*4+32]
-  packssdw  xm1, [dqcq+sizeq*4+48]
-  mova      xm3, [uqcq+sizeq*4+32]
-  packssdw  xm3, [uqcq+sizeq*4+48]
-
-  add       sizeq, 16
-
-  ; Compute the squared errors.
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  psubw     xm0, xm2
-  pmaddwd   xm2, xm2
-  pmaddwd   xm0, xm0
-
-  psubw     xm1, xm3
-  pmaddwd   xm3, xm3
-  pmaddwd   xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd     xm2, xm3
-  paddd     xm0, xm1
-
-  ; We accumulate using 32 bit arithmetic, but detect potential overflow
-  ; by checking if the MSB of the accumulators have ever been a set bit.
-  ; If yes, we redo the whole compute at the end on higher precision, but
-  ; this happens extremely rarely, so we still achieve a net gain.
-  paddd     xm4, xm0
-  paddd     xm6, xm2
-  por       xm5, xm4  ; OR in the accumulator for overflow detection
-  por       xm7, xm6  ; OR in the accumulator for overflow detection
-
-  jnz .loop
-
-  ; Add pairs horizontally (still only on 32 bits)
-  phaddd    xm4, xm4
-  por       xm5, xm4  ; OR in the accumulator for overflow detection
-  phaddd    xm6, xm6
-  por       xm7, xm6  ; OR in the accumulator for overflow detection
-
-  ; Check for possibility of overflow by testing if bit 32 of each dword lane
-  ; have ever been set. If they were not, then there was no overflow and the
-  ; final sum will fit in 32 bits. If overflow happened, then
-  ; we redo the whole computation on higher precision.
-  por       xm7, xm5
-  pmovmskb  r4, xm7
-  test      r4, 0x8888
-  jnz .highprec
-
-  phaddd    xm4, xm4
-  phaddd    xm6, xm6
-  pmovzxdq  xm4, xm4
-  pmovzxdq  xm6, xm6
-
-  ; Restore stack
-  pop       sizeq
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm4
-  movq      [sszq], xm6
-%else
-  movd      eax, xm4
-  pextrd    edx, xm4, 1
-  movq      [sszd], xm6
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, high precision case
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-.highprec:
-  pxor      xm4, xm4  ; sse accumulator
-  pxor      xm5, xm5  ; dedicated zero register
-  pxor      xm6, xm6  ; ssz accumulator
-  pop       sizeq
-
-.loophp:
-  mova      xm0, [dqcq+sizeq*4]
-  packssdw  xm0, [dqcq+sizeq*4+16]
-  mova      xm2, [uqcq+sizeq*4]
-  packssdw  xm2, [uqcq+sizeq*4+16]
-
-  mova      xm1, [dqcq+sizeq*4+32]
-  packssdw  xm1, [dqcq+sizeq*4+48]
-  mova      xm3, [uqcq+sizeq*4+32]
-  packssdw  xm3, [uqcq+sizeq*4+48]
-
-  add       sizeq, 16
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw     xm0, xm2
-  pmaddwd   xm2, xm2
-  pmaddwd   xm0, xm0
-
-  psubw     xm1, xm3
-  pmaddwd   xm3, xm3
-  pmaddwd   xm1, xm1
-
-  ; accumulate in 64bit
-  punpckldq xm7, xm0, xm5
-  punpckhdq xm0, xm5
-  paddq     xm4, xm7
-
-  punpckldq xm7, xm2, xm5
-  punpckhdq xm2, xm5
-  paddq     xm6, xm7
-
-  punpckldq xm7, xm1, xm5
-  punpckhdq xm1, xm5
-  paddq     xm4, xm7
-
-  punpckldq xm7, xm3, xm5
-  punpckhdq xm3, xm5
-  paddq     xm6, xm7
-
-  paddq     xm4, xm0
-  paddq     xm4, xm1
-  paddq     xm6, xm2
-  paddq     xm6, xm3
-
-  jnz .loophp
-
-  ; Accumulate horizontally
-  movhlps   xm5, xm4
-  movhlps   xm7, xm6
-  paddq     xm4, xm5
-  paddq     xm6, xm7
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm4
-  movq      [sszq], xm6
-%else
-  movd      eax, xm4
-  pextrd    edx, xm4, 1
-  movq      [sszd], xm6
-%endif
-  RET
-
-END
diff --git a/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/vp9/encoder/x86/vp9_highbd_error_sse2.asm
deleted file mode 100644
index f3b8f0194..000000000
--- a/vp9/encoder/x86/vp9_highbd_error_sse2.asm
+++ /dev/null
@@ -1,98 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM sse2
-cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
-  pxor      m4, m4  ; sse accumulator
-  pxor      m6, m6  ; ssz accumulator
-  pxor      m5, m5  ; dedicated zero register
-  lea       uqcq, [uqcq+sizeq*4]
-  lea       dqcq, [dqcq+sizeq*4]
-  neg       sizeq
-
-  ALIGN 16
-
-.loop:
-  mova      m0, [dqcq+sizeq*4]
-  packssdw  m0, [dqcq+sizeq*4+mmsize]
-  mova      m2, [uqcq+sizeq*4]
-  packssdw  m2, [uqcq+sizeq*4+mmsize]
-
-  mova      m1, [dqcq+sizeq*4+mmsize*2]
-  packssdw  m1, [dqcq+sizeq*4+mmsize*3]
-  mova      m3, [uqcq+sizeq*4+mmsize*2]
-  packssdw  m3, [uqcq+sizeq*4+mmsize*3]
-
-  add       sizeq, mmsize
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw     m0, m2
-  pmaddwd   m2, m2
-  pmaddwd   m0, m0
-
-  psubw     m1, m3
-  pmaddwd   m3, m3
-  pmaddwd   m1, m1
-
-  ; accumulate in 64bit
-  punpckldq m7, m0, m5
-  punpckhdq m0, m5
-  paddq     m4, m7
-
-  punpckldq m7, m2, m5
-  punpckhdq m2, m5
-  paddq     m6, m7
-
-  punpckldq m7, m1, m5
-  punpckhdq m1, m5
-  paddq     m4, m7
-
-  punpckldq m7, m3, m5
-  punpckhdq m3, m5
-  paddq     m6, m7
-
-  paddq     m4, m0
-  paddq     m4, m1
-  paddq     m6, m2
-  paddq     m6, m3
-
-  jnz .loop
-
-  ; accumulate horizontally and store in return value
-  movhlps   m5, m4
-  movhlps   m7, m6
-  paddq     m4, m5
-  paddq     m6, m7
-
-%if ARCH_X86_64
-  movq      rax, m4
-  movq      [sszq], m6
-%else
-  mov       eax, sszm
-  pshufd    m5, m4, 0x1
-  movq      [eax], m6
-  movd      eax, m4
-  movd      edx, m5
-%endif
-  RET
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index a54e99e2c..e73535543 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -108,10 +108,6 @@ endif
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-endif
 
 ifeq ($(ARCH_X86_64),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
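
Note on the wrapper pattern used in test/vp9_block_error_test.cc above: BlockError8BitWrapper adapts the four-argument 8-bit block-error signature to the five-argument high-bit-depth signature the parameterized test expects, asserting that bps is 8 and otherwise forwarding the call. Below is a minimal standalone sketch of that idea, not libvpx code: tran_low_t is a stand-in typedef, and block_error is a toy reference that mirrors what vp9_block_error_c computes (the sum of squared coeff/dqcoeff differences, with the sum of squared coefficients returned through *ssz).

#include <cassert>
#include <cstdint>
#include <iostream>

typedef int32_t tran_low_t;  // stand-in for libvpx's tran_low_t

// Toy 8-bit block error: returns the sum of squared coeff/dqcoeff
// differences and writes the sum of squared coefficients to *ssz.
int64_t block_error(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                    intptr_t block_size, int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  for (intptr_t i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += static_cast<int64_t>(coeff[i]) * coeff[i];
  }
  *ssz = sqcoeff;
  return error;
}

// A non-type template parameter wraps the 4-argument function so it can be
// stored where the 5-argument high-bit-depth signature is expected.
template <int64_t (*fn)(const tran_low_t *, const tran_low_t *, intptr_t,
                        int64_t *)>
int64_t BlockError8BitWrapper(const tran_low_t *coeff,
                              const tran_low_t *dqcoeff, intptr_t block_size,
                              int64_t *ssz, int bps) {
  assert(bps == 8);  // the wrapper is only meaningful for 8-bit input
  (void)bps;
  return fn(coeff, dqcoeff, block_size, ssz);
}

int main() {
  const tran_low_t coeff[4] = { 10, -20, 30, -40 };
  const tran_low_t dqcoeff[4] = { 8, -21, 33, -40 };
  int64_t ssz = 0;
  // Call through the wrapper exactly as a parameterized test would.
  const int64_t err =
      BlockError8BitWrapper<block_error>(coeff, dqcoeff, 4, &ssz, 8);
  std::cout << "error=" << err << " ssz=" << ssz << "\n";  // error=14 ssz=3000
  return 0;
}

Because the 8-bit definition matches what the high-bit-depth path computes at bd == 8 (the comment removed from vp9_rdopt.c makes the same point about the C versions), the dispatch in vp9_rdopt.c can call vp9_block_error directly once vp9_highbd_block_error_8bit is gone.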
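
The deleted vp9_highbd_error_avx.asm above documents a speculative low-precision strategy: accumulate squared errors in 32-bit lanes, OR the running sums into a tracking register after every addition, and redo the whole computation at higher precision only if bit 31 was ever observed set. A scalar C++ sketch of that overflow-detection idea, illustrative only and not the SIMD routine, assuming 16-bit inputs so that each square fits in 31 bits:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Sum of squares of 16-bit values, accumulated speculatively in 32 bits.
// Each square is below 2^31, and the accumulator is OR-tracked after every
// addition, so a set bit 31 reliably flags the first time the running sum
// leaves the 31-bit range; only then is the sum redone in 64 bits.
int64_t SumOfSquares(const int16_t *v, size_t n) {
  uint32_t acc = 0;
  uint32_t msb_tracker = 0;
  for (size_t i = 0; i < n; ++i) {
    acc += static_cast<uint32_t>(static_cast<int32_t>(v[i]) * v[i]);
    msb_tracker |= acc;
  }
  if (!(msb_tracker & 0x80000000u)) return acc;  // fast path: never overflowed
  int64_t acc64 = 0;  // rare slow path: full 64-bit precision
  for (size_t i = 0; i < n; ++i) {
    acc64 += static_cast<int64_t>(v[i]) * v[i];
  }
  return acc64;
}

int main() {
  std::vector<int16_t> small(64, 100);      // stays comfortably within 31 bits
  std::vector<int16_t> large(4096, 32000);  // forces the 64-bit fallback
  std::printf("%lld %lld\n",
              static_cast<long long>(SumOfSquares(small.data(), small.size())),
              static_cast<long long>(SumOfSquares(large.data(), large.size())));
  return 0;
}

The detection is sound because every added term is below 2^31: the running sum cannot wrap past 2^32 without first passing through a value whose bit 31 is set, which the OR tracker records.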