VPX: Add vp9_highbd_quantize_fp_32x32_avx2().

~4x faster than vp9_highbd_quantize_fp_32x32_c() for full
calculations.

Bug: b/237714063

Change-Id: Iff2182b8e7b1ac79811e33080d1f6cac6679382d
This commit is contained in:
Scott LaVarnway
2022-08-03 08:01:25 -07:00
parent a55e248349
commit 2e61a623d4
3 changed files with 75 additions and 0 deletions

View File

@@ -583,6 +583,9 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>,
&QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16,
true),
make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
32, true),
make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
VPX_BITS_8, 16, false),
make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,

View File

@@ -199,6 +199,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_quantize_fp avx2/;
add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
specialize qw/vp9_highbd_quantize_fp_32x32 avx2/;
# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";

View File

@@ -366,4 +366,75 @@ void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = get_max_eob(eob_max);
}
static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
const __m256i *round, const __m256i *quant, const __m256i *dequant,
const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
const __m256i abs_coeff = _mm256_abs_epi32(coeff);
const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
const __m256i tmp_rnd =
_mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
const __m256i abs_dq =
_mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
const __m256i q = _mm256_sign_epi32(abs_q, coeff);
const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
_mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
_mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
*eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
}
void vp9_highbd_quantize_fp_32x32_avx2(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
const int step = 8;
__m256i round, quant, dequant, thr;
__m256i eob_max = _mm256_setzero_si256();
(void)scan;
coeff_ptr += n_coeffs;
iscan += n_coeffs;
qcoeff_ptr += n_coeffs;
dqcoeff_ptr += n_coeffs;
n_coeffs = -n_coeffs;
// Setup global values
highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
&dequant);
thr = _mm256_srli_epi32(dequant, 2);
// Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
// calculating the zbin mask.
thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1));
quant = _mm256_slli_epi32(quant, 1);
round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1);
highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
iscan + n_coeffs, qcoeff_ptr + n_coeffs,
dqcoeff_ptr + n_coeffs, &eob_max);
n_coeffs += step;
// remove dc constants
dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
quant = _mm256_permute2x128_si256(quant, quant, 0x31);
round = _mm256_permute2x128_si256(round, round, 0x31);
thr = _mm256_permute2x128_si256(thr, thr, 0x31);
// AC only loop
while (n_coeffs < 0) {
highbd_quantize_fp_32x32(
&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
n_coeffs += step;
}
*eob_ptr = get_max_eob(eob_max);
}
#endif // CONFIG_VP9_HIGHBITDEPTH