SSE2/SSSE3 optimizations and unit test for sub_pixel_avg_variance().

Encoding of bus @ 1500kbps (first 50 frames) goes from 3min57 to
3min35, i.e. approximately a 10.5% speedup. Note that the SIMD versions
which use a bilinear filter (x_offset & 7 || y_offset & 7) aren't
perfectly interleaved, and can probably be improved further in the
future. I've marked this with a few TODOs/FIXMEs in the code.

Change-Id: I5c9e900c0f0d32e431a50fecae213b510b2549f9
This commit is contained in:
Ronald S. Bultje
2013-06-20 15:59:48 -07:00
parent 8fb6c58191
commit 1e6a32f1af
4 changed files with 543 additions and 63 deletions

View File

@@ -76,6 +76,34 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
// Scalar reference model for the compound ("avg") sub-pixel variance
// functions under test.
//
// ref          reference pixels, read as (1 << l2h) + 1 rows of
//              (1 << l2w) + 1 bytes each (row stride is width + 1).
// src          source block, (1 << l2w) x (1 << l2h), stride = width.
// second_pred  second predictor, same layout as src; it is averaged
//              (rounding up) with the interpolated reference.
// l2w, l2h     log2 of the block width and height.
// xoff, yoff   horizontal / vertical bilinear weights in 1/16 units.
// sse_ptr      receives the sum of squared differences.
//
// Returns sse - (sum * sum >> (l2w + l2h)), i.e. the SSE with the squared
// sum of differences, divided by the pixel count, subtracted.
static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
                                            const uint8_t *src,
                                            const uint8_t *second_pred,
                                            int l2w, int l2h,
                                            int xoff, int yoff,
                                            unsigned int *sse_ptr) {
  const int width = 1 << l2w;
  const int height = 1 << l2h;
  const int ref_stride = width + 1;
  int sum = 0;
  unsigned int sum_sq = 0;

  for (int row = 0; row < height; ++row) {
    const uint8_t *const top = ref + ref_stride * row;
    const uint8_t *const bottom = top + ref_stride;
    const uint8_t *const src_row = src + width * row;
    const uint8_t *const sec_row = second_pred + width * row;

    for (int col = 0; col < width; ++col) {
      const int top_left = top[col];
      const int top_right = top[col + 1];
      const int bottom_left = bottom[col];
      const int bottom_right = bottom[col + 1];

      // Bilinear interpolation in 1/16 pel steps: first horizontally on both
      // rows, then vertically between the two horizontal results.
      const int top_h =
          top_left + (((top_right - top_left) * xoff + 8) >> 4);
      const int bottom_h =
          bottom_left + (((bottom_right - bottom_left) * xoff + 8) >> 4);
      const int interp = top_h + (((bottom_h - top_h) * yoff + 8) >> 4);

      // Rounded average with the second predictor, then the residual.
      const int averaged = (interp + sec_row[col] + 1) >> 1;
      const int diff = averaged - src_row[col];

      sum += diff;
      sum_sq += diff * diff;
    }
  }

  *sse_ptr = sum_sq;
  return sum_sq - ((static_cast<int64_t>(sum) * sum) >> (l2w + l2h));
}
template<typename VarianceFunctionType>
class VarianceTest :
public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
@@ -174,6 +202,7 @@ class SubpelVarianceTest :
rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
sec_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
@@ -182,14 +211,16 @@ class SubpelVarianceTest :
// Frees the pixel buffers held by the fixture (second predictor, reference
// and source), each of which is an array released with delete[].
virtual void TearDown() {
  delete[] sec_;
  delete[] ref_;
  delete[] src_;
}
protected:
void RefTest();
ACMRandom rnd;
uint8_t* src_;
uint8_t* ref_;
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
int width_, log2width_;
int height_, log2height_;
int block_size_;
@@ -217,6 +248,29 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
}
}
// RefTest() specialization for the compound-average sub-pixel variance
// functions. For every (x, y) offset pair in [0, 16) x [0, 16) it fills the
// source, second-predictor and reference buffers with random bytes, runs the
// function under test and subpel_avg_variance_ref() on the same data, and
// expects both the SSE and the returned variance to match.
template<>
void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
  // Number of reference bytes that get randomized for each offset pair.
  const int ref_size = block_size_ + width_ + height_ + 1;

  for (int xoff = 0; xoff < 16; ++xoff) {
    for (int yoff = 0; yoff < 16; ++yoff) {
      // src_ and sec_ are drawn alternately; the order of the Rand8() calls
      // determines the generated data, so it is kept as is.
      for (int i = 0; i < block_size_; ++i) {
        src_[i] = rnd.Rand8();
        sec_[i] = rnd.Rand8();
      }
      for (int i = 0; i < ref_size; ++i) {
        ref_[i] = rnd.Rand8();
      }

      unsigned int sse1;
      unsigned int sse2;
      // The reference buffer is passed with a stride argument of width_ + 1,
      // matching the row stride subpel_avg_variance_ref() is written for.
      const unsigned int var1 =
          subpel_variance_(ref_, width_ + 1, xoff, yoff,
                           src_, width_, &sse1, sec_);
      const unsigned int var2 =
          subpel_avg_variance_ref(ref_, src_, sec_,
                                  log2width_, log2height_,
                                  xoff, yoff, &sse2);

      EXPECT_EQ(sse1, sse2) << "at position " << xoff << ", " << yoff;
      EXPECT_EQ(var1, var2) << "at position " << xoff << ", " << yoff;
    }
  }
}
// -----------------------------------------------------------------------------
// VP8 test cases.
@@ -283,10 +337,12 @@ namespace vp9 {
#if CONFIG_VP9_ENCODER
// Concrete fixture types for the VP9 variance tests: one per function-pointer
// type (whole-pixel variance, sub-pixel variance, and sub-pixel variance with
// a second predictor averaged in).
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
// Value-parameterized test bodies; each simply forwards to the fixture method
// of the same purpose. The parameters are supplied by the
// INSTANTIATE_TEST_CASE_P() blocks in this file.
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP9VarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
// Runs the RefTest() specialization for vp9_subp_avg_variance_fn_t.
TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
@@ -360,6 +416,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(6, 5, subpel_variance64x32_c),
make_tuple(6, 6, subpel_variance64x64_c)));
// Function-pointer aliases for the plain C vp9_sub_pixel_avg_variance*
// implementations, one per block size from 4x4 up to 64x64.
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
vp9_sub_pixel_avg_variance4x4_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
vp9_sub_pixel_avg_variance4x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
vp9_sub_pixel_avg_variance8x4_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
vp9_sub_pixel_avg_variance8x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
vp9_sub_pixel_avg_variance8x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
vp9_sub_pixel_avg_variance16x8_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
vp9_sub_pixel_avg_variance16x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
vp9_sub_pixel_avg_variance16x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
vp9_sub_pixel_avg_variance32x16_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
vp9_sub_pixel_avg_variance32x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
vp9_sub_pixel_avg_variance32x64_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
vp9_sub_pixel_avg_variance64x32_c;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
vp9_sub_pixel_avg_variance64x64_c;
// Registers the C implementations with VP9SubpelAvgVarianceTest under the
// instantiation name "C". The two integers in each tuple equal log2 of the
// width and height in the function's name (e.g. 4x8 -> (2, 3)); presumably the
// fixture reads them as log2width_ / log2height_ -- confirm against SetUp().
INSTANTIATE_TEST_CASE_P(
C, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
make_tuple(2, 3, subpel_avg_variance4x8_c),
make_tuple(3, 2, subpel_avg_variance8x4_c),
make_tuple(3, 3, subpel_avg_variance8x8_c),
make_tuple(3, 4, subpel_avg_variance8x16_c),
make_tuple(4, 3, subpel_avg_variance16x8_c),
make_tuple(4, 4, subpel_avg_variance16x16_c),
make_tuple(4, 5, subpel_avg_variance16x32_c),
make_tuple(5, 4, subpel_avg_variance32x16_c),
make_tuple(5, 5, subpel_avg_variance32x32_c),
make_tuple(5, 6, subpel_avg_variance32x64_c),
make_tuple(6, 5, subpel_avg_variance64x32_c),
make_tuple(6, 6, subpel_avg_variance64x64_c)));
#if HAVE_MMX
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
@@ -446,6 +544,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(5, 6, subpel_variance32x64_sse2),
make_tuple(6, 5, subpel_variance64x32_sse2),
make_tuple(6, 6, subpel_variance64x64_sse2)));
// Function-pointer aliases for the SIMD vp9_sub_pixel_avg_variance*
// implementations covered by the "SSE2" instantiation below. Note that the
// two 4-pixel-wide sizes (4x4, 4x8) bind to symbols with an _sse suffix,
// while every other size binds to an _sse2 symbol.
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
vp9_sub_pixel_avg_variance4x4_sse;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
vp9_sub_pixel_avg_variance4x8_sse;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
vp9_sub_pixel_avg_variance8x4_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
vp9_sub_pixel_avg_variance8x8_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
vp9_sub_pixel_avg_variance8x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
vp9_sub_pixel_avg_variance16x8_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
vp9_sub_pixel_avg_variance16x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
vp9_sub_pixel_avg_variance16x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
vp9_sub_pixel_avg_variance32x16_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
vp9_sub_pixel_avg_variance32x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
vp9_sub_pixel_avg_variance32x64_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
vp9_sub_pixel_avg_variance64x32_sse2;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
vp9_sub_pixel_avg_variance64x64_sse2;
// Registers the implementations above with VP9SubpelAvgVarianceTest under the
// instantiation name "SSE2". The two integers in each tuple equal log2 of the
// width and height in the function's name (e.g. 8x16 -> (3, 4)).
INSTANTIATE_TEST_CASE_P(
SSE2, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
make_tuple(2, 3, subpel_avg_variance4x8_sse),
make_tuple(3, 2, subpel_avg_variance8x4_sse2),
make_tuple(3, 3, subpel_avg_variance8x8_sse2),
make_tuple(3, 4, subpel_avg_variance8x16_sse2),
make_tuple(4, 3, subpel_avg_variance16x8_sse2),
make_tuple(4, 4, subpel_avg_variance16x16_sse2),
make_tuple(4, 5, subpel_avg_variance16x32_sse2),
make_tuple(5, 4, subpel_avg_variance32x16_sse2),
make_tuple(5, 5, subpel_avg_variance32x32_sse2),
make_tuple(5, 6, subpel_avg_variance32x64_sse2),
make_tuple(6, 5, subpel_avg_variance64x32_sse2),
make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
#endif
#if HAVE_SSSE3
@@ -490,6 +630,48 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(5, 6, subpel_variance32x64_ssse3),
make_tuple(6, 5, subpel_variance64x32_ssse3),
make_tuple(6, 6, subpel_variance64x64_ssse3)));
// Function-pointer aliases for the SSSE3 vp9_sub_pixel_avg_variance*
// implementations, one per block size from 4x4 up to 64x64.
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
vp9_sub_pixel_avg_variance4x4_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
vp9_sub_pixel_avg_variance4x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
vp9_sub_pixel_avg_variance8x4_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
vp9_sub_pixel_avg_variance8x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
vp9_sub_pixel_avg_variance8x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
vp9_sub_pixel_avg_variance16x8_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
vp9_sub_pixel_avg_variance16x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
vp9_sub_pixel_avg_variance16x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
vp9_sub_pixel_avg_variance32x16_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
vp9_sub_pixel_avg_variance32x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
vp9_sub_pixel_avg_variance32x64_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
vp9_sub_pixel_avg_variance64x32_ssse3;
const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
vp9_sub_pixel_avg_variance64x64_ssse3;
// Registers the SSSE3 implementations with VP9SubpelAvgVarianceTest under the
// instantiation name "SSSE3". The two integers in each tuple equal log2 of the
// width and height in the function's name (e.g. 64x32 -> (6, 5)).
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9SubpelAvgVarianceTest,
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
#endif
#endif // CONFIG_VP9_ENCODER