diff --git a/test/convolve_test.cc b/test/convolve_test.cc index b66da683c..90f9579a3 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -14,12 +14,15 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_filter.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -945,7 +948,7 @@ void wrap_convolve8_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, + vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -957,7 +960,7 @@ void wrap_convolve8_avg_horiz_sse2_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -969,7 +972,7 @@ void wrap_convolve8_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -981,7 +984,7 @@ void wrap_convolve8_avg_vert_sse2_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -993,7 +996,7 @@ void wrap_convolve8_sse2_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1005,7 +1008,7 @@ void wrap_convolve8_avg_sse2_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1017,7 +1020,7 @@ void wrap_convolve8_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1029,7 +1032,7 @@ void wrap_convolve8_avg_horiz_sse2_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1041,7 +1044,7 @@ void wrap_convolve8_vert_sse2_10(const uint8_t *src, ptrdiff_t 
src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1053,7 +1056,7 @@ void wrap_convolve8_avg_vert_sse2_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1065,7 +1068,7 @@ void wrap_convolve8_sse2_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1077,7 +1080,7 @@ void wrap_convolve8_avg_sse2_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1089,7 +1092,7 @@ void wrap_convolve8_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1101,7 +1104,7 @@ void wrap_convolve8_avg_horiz_sse2_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1113,7 +1116,7 @@ void wrap_convolve8_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1125,7 +1128,7 @@ void wrap_convolve8_avg_vert_sse2_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1137,7 +1140,7 @@ void wrap_convolve8_sse2_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1149,7 +1152,7 @@ void wrap_convolve8_avg_sse2_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1162,7 +1165,7 @@ void 
wrap_convolve_copy_c_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1174,7 +1177,7 @@ void wrap_convolve_avg_c_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1186,7 +1189,7 @@ void wrap_convolve8_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1198,7 +1201,7 @@ void wrap_convolve8_avg_horiz_c_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1210,7 +1213,7 @@ void wrap_convolve8_vert_c_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1222,7 +1225,7 @@ void wrap_convolve8_avg_vert_c_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1234,7 +1237,7 @@ void wrap_convolve8_c_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1246,7 +1249,7 @@ void wrap_convolve8_avg_c_8(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 8); } @@ -1258,7 +1261,7 @@ void wrap_convolve_copy_c_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1270,7 +1273,7 @@ void wrap_convolve_avg_c_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1282,7 +1285,7 @@ void wrap_convolve8_horiz_c_10(const uint8_t *src, ptrdiff_t 
src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1294,7 +1297,7 @@ void wrap_convolve8_avg_horiz_c_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1306,7 +1309,7 @@ void wrap_convolve8_vert_c_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1318,7 +1321,7 @@ void wrap_convolve8_avg_vert_c_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1330,7 +1333,7 @@ void wrap_convolve8_c_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1342,7 +1345,7 @@ void wrap_convolve8_avg_c_10(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 10); } @@ -1354,7 +1357,7 @@ void wrap_convolve_copy_c_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1366,7 +1369,7 @@ void wrap_convolve_avg_c_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1378,7 +1381,7 @@ void wrap_convolve8_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1390,7 +1393,7 @@ void wrap_convolve8_avg_horiz_c_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1402,7 +1405,7 @@ void wrap_convolve8_vert_c_12(const uint8_t *src, ptrdiff_t src_stride, const 
int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1414,7 +1417,7 @@ void wrap_convolve8_avg_vert_c_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1426,7 +1429,7 @@ void wrap_convolve8_c_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1438,7 +1441,7 @@ void wrap_convolve8_avg_c_12(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { - vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, + vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, filter_x_stride, filter_y, filter_y_stride, w, h, 12); } @@ -1504,10 +1507,10 @@ INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values( #else const ConvolveFunctions convolve8_c( - vp9_convolve_copy_c, vp9_convolve_avg_c, - vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c, - vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c, - vp9_convolve8_c, vp9_convolve8_avg_c, 0); + vpx_convolve_copy_c, vpx_convolve_avg_c, + vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c, + vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c, + vpx_convolve8_c, vpx_convolve8_avg_c, 0); INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_c), @@ -1585,13 +1588,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( #else const ConvolveFunctions convolve8_sse2( #if CONFIG_USE_X86INC - vp9_convolve_copy_sse2, vp9_convolve_avg_sse2, + vpx_convolve_copy_sse2, vpx_convolve_avg_sse2, #else - vp9_convolve_copy_c, vp9_convolve_avg_c, + vpx_convolve_copy_c, vpx_convolve_avg_c, #endif // CONFIG_USE_X86INC - vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2, - vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2, - vp9_convolve8_sse2, vp9_convolve8_avg_sse2, 0); + vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2, + vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2, + vpx_convolve8_sse2, vpx_convolve8_avg_sse2, 0); INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_sse2), @@ -1612,10 +1615,10 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values( #if HAVE_SSSE3 const ConvolveFunctions convolve8_ssse3( - vp9_convolve_copy_c, vp9_convolve_avg_c, - vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3, - vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3, - vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3, 0); + vpx_convolve_copy_c, vpx_convolve_avg_c, + vpx_convolve8_horiz_ssse3, vpx_convolve8_avg_horiz_ssse3, + vpx_convolve8_vert_ssse3, vpx_convolve8_avg_vert_ssse3, + vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3, 0); INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_ssse3), @@ -1635,10 +1638,10 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( #if HAVE_AVX2 && HAVE_SSSE3 const ConvolveFunctions convolve8_avx2( - vp9_convolve_copy_c, 
vp9_convolve_avg_c, - vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3, - vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3, - vp9_convolve8_avx2, vp9_convolve8_avg_ssse3, 0); + vpx_convolve_copy_c, vpx_convolve_avg_c, + vpx_convolve8_horiz_avx2, vpx_convolve8_avg_horiz_ssse3, + vpx_convolve8_vert_avx2, vpx_convolve8_avg_vert_ssse3, + vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, 0); INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_avx2), @@ -1659,16 +1662,16 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values( #if HAVE_NEON #if HAVE_NEON_ASM const ConvolveFunctions convolve8_neon( - vp9_convolve_copy_neon, vp9_convolve_avg_neon, - vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, - vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, - vp9_convolve8_neon, vp9_convolve8_avg_neon, 0); + vpx_convolve_copy_neon, vpx_convolve_avg_neon, + vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon, + vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, + vpx_convolve8_neon, vpx_convolve8_avg_neon, 0); #else // HAVE_NEON const ConvolveFunctions convolve8_neon( - vp9_convolve_copy_neon, vp9_convolve_avg_neon, - vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, - vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, - vp9_convolve8_neon, vp9_convolve8_avg_neon, 0); + vpx_convolve_copy_neon, vpx_convolve_avg_neon, + vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon, + vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, + vpx_convolve8_neon, vpx_convolve8_avg_neon, 0); #endif // HAVE_NEON_ASM INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( @@ -1689,10 +1692,10 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( #if HAVE_DSPR2 const ConvolveFunctions convolve8_dspr2( - vp9_convolve_copy_dspr2, vp9_convolve_avg_dspr2, - vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2, - vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2, - vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2, 0); + vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, + vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2, + vpx_convolve8_vert_dspr2, vpx_convolve8_avg_vert_dspr2, + vpx_convolve8_dspr2, vpx_convolve8_avg_dspr2, 0); INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_dspr2), @@ -1712,10 +1715,10 @@ INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values( #if HAVE_MSA const ConvolveFunctions convolve8_msa( - vp9_convolve_copy_msa, vp9_convolve_avg_msa, - vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_msa, - vp9_convolve8_vert_msa, vp9_convolve8_avg_vert_msa, - vp9_convolve8_msa, vp9_convolve8_avg_msa, 0); + vpx_convolve_copy_msa, vpx_convolve_avg_msa, + vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa, + vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa, + vpx_convolve8_msa, vpx_convolve8_avg_msa, 0); INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_msa), diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/vp9/common/arm/neon/vp9_convolve8_avg_neon.c deleted file mode 100644 index dd569d348..000000000 --- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.c +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <stddef.h> -#include <arm_neon.h> - -#include "./vpx_config.h" -#include "vpx_ports/mem.h" - -void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); -void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - -static INLINE int32x4_t MULTIPLY_BY_Q0( - int16x4_t dsrc0, - int16x4_t dsrc1, - int16x4_t dsrc2, - int16x4_t dsrc3, - int16x4_t dsrc4, - int16x4_t dsrc5, - int16x4_t dsrc6, - int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void vp9_convolve8_avg_horiz_neon( - uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, - int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, - int h) { - int width; - uint8_t *s, *d; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - if (x_step_q4 != 16) { - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - return; - } - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), - vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - src += 7; - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 =
vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w; - width > 0; - width -= 4, src += 4, dst += 4) { // loop_horiz - s = src; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(src + 64); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), - vreinterpret_u16_u32(d31u32)); - d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), - vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(src + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), - vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(src + 64 + src_stride * 2); - - d = dst; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, - d18s16, d19s16, d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, - d19s16, d23s16, d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, d27s16, d25s16, q0s16); - - __builtin_prefetch(src + 64 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), - vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - d = dst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += 
dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - src += src_stride * 4 - w - 7; - dst += dst_stride * 4 - w; - } - return; -} - -void vp9_convolve8_avg_vert_neon( - uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, - int y_step_q4, - int w, - int h) { - int height; - uint8_t *s, *d; - uint8x8_t d2u8, d3u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - uint8x16_t q1u8, q3u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - if (y_step_q4 != 16) { - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - return; - } - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - d -= dst_stride * 3; - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(s); - 
__builtin_prefetch(s + src_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, - d20s16, d21s16, d22s16, d24s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, - d21s16, d22s16, d24s16, d26s16, q0s16); - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, d26s16, d27s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - q1u8 = vcombine_u8(d2u8, d3u8); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.c b/vp9/common/arm/neon/vp9_convolve8_neon.c deleted file mode 100644 index 5c555c458..000000000 --- a/vp9/common/arm/neon/vp9_convolve8_neon.c +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <stddef.h> -#include <arm_neon.h> - -#include "./vpx_config.h" -#include "vpx_ports/mem.h" - -void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); -void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - -static INLINE int32x4_t MULTIPLY_BY_Q0( - int16x4_t dsrc0, - int16x4_t dsrc1, - int16x4_t dsrc2, - int16x4_t dsrc3, - int16x4_t dsrc4, - int16x4_t dsrc5, - int16x4_t dsrc6, - int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void vp9_convolve8_horiz_neon( - uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, - int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, - int h) { - int width; - uint8_t *s, *d, *psrc, *pdst; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - if (x_step_q4 != 16) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - return; - } - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4, - src += src_stride * 4, - dst += dst_stride * 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), - vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - __builtin_prefetch(src + src_stride * 6); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 =
vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w, psrc = src + 7, pdst = dst; - width > 0; - width -= 4, psrc += 4, pdst += 4) { // loop_horiz - s = psrc; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(psrc + 64); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), - vreinterpret_u16_u32(d31u32)); - d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), - vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(psrc + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), - vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(psrc + 64 + src_stride * 2); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, - d18s16, d19s16, d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, - d19s16, d23s16, d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, d27s16, d25s16, q0s16); - - __builtin_prefetch(psrc + 60 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), - vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); - d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); - - d = pdst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - } - return; -} - -void vp9_convolve8_vert_neon( - uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, // 
unused - int x_step_q4, // unused - const int16_t *filter_y, - int y_step_q4, - int w, - int h) { - int height; - uint8_t *s, *d; - uint32x2_t d2u32, d3u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - if (y_step_q4 != 16) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - return; - } - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, - d20s16, d21s16, d22s16, d24s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, - d21s16, d22s16, d24s16, d26s16, q0s16); - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, d26s16, d27s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u32 = 
vreinterpret_u32_u8(vqmovn_u16(q1u16)); - d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/vp9/common/arm/neon/vp9_convolve_avg_neon.c b/vp9/common/arm/neon/vp9_convolve_avg_neon.c deleted file mode 100644 index 3a3db353e..000000000 --- a/vp9/common/arm/neon/vp9_convolve_avg_neon.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <stddef.h> -#include <arm_neon.h> - -void vp9_convolve_avg_neon( - const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, - int filter_x_stride, - const int16_t *filter_y, - int filter_y_stride, - int w, - int h) { - uint8_t *d; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint32x2_t d0u32, d2u32; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; - - d = dst; - if (w > 32) { // avg64 - for (; h > 0; h -= 1) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - q10u8 = vld1q_u8(d + 32); - q11u8 = vld1q_u8(d + 48); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // avg32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - d += dst_stride; - q10u8 = vld1q_u8(d); - q11u8 = vld1q_u8(d + 16); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // avg16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - q2u8 = vld1q_u8(d); - d += dst_stride; - q3u8 = vld1q_u8(d); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q2u8); - q1u8 = vrhaddq_u8(q1u8, q3u8); - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // avg8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; - d1u8 = vld1_u8(src); - src += src_stride; - d2u8 = vld1_u8(d); - d += dst_stride; - d3u8 = vld1_u8(d); - d += dst_stride; - - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - q0u8 = vrhaddq_u8(q0u8, q1u8); -
vst1_u8(dst, vget_low_u8(q0u8)); - dst += dst_stride; - vst1_u8(dst, vget_high_u8(q0u8)); - dst += dst_stride; - } - } else { // avg4 - for (; h > 0; h -= 2) { - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0); - src += src_stride; - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1); - src += src_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0); - d += dst_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); - d += dst_stride; - - d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), - vreinterpret_u8_u32(d2u32)); - - d0u32 = vreinterpret_u32_u8(d0u8); - vst1_lane_u32((uint32_t *)dst, d0u32, 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, d0u32, 1); - dst += dst_stride; - } - } - return; -} diff --git a/vp9/common/arm/neon/vp9_copy_neon.c b/vp9/common/arm/neon/vp9_copy_neon.c deleted file mode 100644 index f334abe11..000000000 --- a/vp9/common/arm/neon/vp9_copy_neon.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <stddef.h> -#include <arm_neon.h> - -void vp9_convolve_copy_neon( - const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, - int filter_x_stride, - const int16_t *filter_y, - int filter_y_stride, - int w, - int h) { - uint8x8_t d0u8, d2u8; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; - - if (w > 32) { // copy64 - for (; h > 0; h--) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // copy32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // copy16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // copy8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; - d2u8 = vld1_u8(src); - src += src_stride; - - vst1_u8(dst, d0u8); - dst += dst_stride; - vst1_u8(dst, d2u8); - dst += dst_stride; - } - } else { // copy4 - for (; h > 0; h--) { - *(uint32_t *)dst = *(const uint32_t *)src; - src += src_stride; - dst += dst_stride; - } - } - return; -} diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index 371738aba..0285be155 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -11,9 +11,10 @@ #ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ #define VP9_COMMON_VP9_ENTROPYMODE_H_ -#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/vpx_filter.h" #ifdef
__cplusplus extern "C" { diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index 40d6a0d6a..efa24bc67 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" @@ -20,13 +21,6 @@ extern "C" { #endif -#define FILTER_BITS 7 - -#define SUBPEL_BITS 4 -#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) -#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) -#define SUBPEL_TAPS 8 - #define EIGHTTAP 0 #define EIGHTTAP_SMOOTH 1 #define EIGHTTAP_SHARP 2 @@ -36,9 +30,8 @@ extern "C" { // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) #define SWITCHABLE 4 /* should be the last one */ -typedef uint8_t INTERP_FILTER; -typedef int16_t InterpKernel[SUBPEL_TAPS]; +typedef uint8_t INTERP_FILTER; extern const InterpKernel *vp9_filter_kernels[4]; diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 2aa8ee978..1a4ea2f04 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -15,6 +15,9 @@ #include "./vpx_config.h" #include "vpx_dsp/txfm_common.h" +#if CONFIG_VP9_HIGHBITDEPTH +#include "vpx_dsp/vpx_dsp_common.h" +#endif // CONFIG_VP9_HIGHBITDEPTH #include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 6d38fabd1..db9971da7 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -16,7 +16,6 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 7f63f5a30..9bc62900a 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -11,8 +11,10 @@ #ifndef VP9_COMMON_VP9_RECONINTER_H_ #define VP9_COMMON_VP9_RECONINTER_H_ -#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus extern "C" { diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index d87a0430b..46f08a7bd 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -11,6 +11,9 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#if CONFIG_VP9_HIGHBITDEPTH +#include "vpx_dsp/vpx_dsp_common.h" +#endif // CONFIG_VP9_HIGHBITDEPTH #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/vpx_once.h" diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index ee99d7a5e..883733f55 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -54,12 +54,6 @@ if ($opts{arch} eq "x86_64") { $avx2_x86_64 = 'avx2'; } -# optimizations which depend on multiple features -$avx2_ssse3 = ''; -if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) { - $avx2_ssse3 = 'avx2'; -} - # # post proc # @@ -87,33 +81,6 @@ add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, specialize qw/vp9_filter_by_weight8x8 sse2 msa/; } -# -# Sub Pixel Filters -# -add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc"; - 
-add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3"; - -add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3"; - -add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3"; - -add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2 msa/; - -add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/; - -add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/; - # # dct # diff --git a/vp9/common/vp9_scale.c b/vp9/common/vp9_scale.c index 6db8f9caa..8f5c72e7c 100644 --- a/vp9/common/vp9_scale.c +++ b/vp9/common/vp9_scale.c @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_scale.h" +#include "vpx_dsp/vpx_filter.h" static INLINE int scaled_x(int val, const struct scale_factors *sf) { return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT); @@ -81,85 +82,85 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, if (sf->x_step_q4 == 16) { if (sf->y_step_q4 == 16) { // No scaling in either direction. - sf->predict[0][0][0] = vp9_convolve_copy; - sf->predict[0][0][1] = vp9_convolve_avg; - sf->predict[0][1][0] = vp9_convolve8_vert; - sf->predict[0][1][1] = vp9_convolve8_avg_vert; - sf->predict[1][0][0] = vp9_convolve8_horiz; - sf->predict[1][0][1] = vp9_convolve8_avg_horiz; + sf->predict[0][0][0] = vpx_convolve_copy; + sf->predict[0][0][1] = vpx_convolve_avg; + sf->predict[0][1][0] = vpx_convolve8_vert; + sf->predict[0][1][1] = vpx_convolve8_avg_vert; + sf->predict[1][0][0] = vpx_convolve8_horiz; + sf->predict[1][0][1] = vpx_convolve8_avg_horiz; } else { // No scaling in x direction. Must always scale in the y direction. 
- sf->predict[0][0][0] = vp9_convolve8_vert; - sf->predict[0][0][1] = vp9_convolve8_avg_vert; - sf->predict[0][1][0] = vp9_convolve8_vert; - sf->predict[0][1][1] = vp9_convolve8_avg_vert; - sf->predict[1][0][0] = vp9_convolve8; - sf->predict[1][0][1] = vp9_convolve8_avg; + sf->predict[0][0][0] = vpx_convolve8_vert; + sf->predict[0][0][1] = vpx_convolve8_avg_vert; + sf->predict[0][1][0] = vpx_convolve8_vert; + sf->predict[0][1][1] = vpx_convolve8_avg_vert; + sf->predict[1][0][0] = vpx_convolve8; + sf->predict[1][0][1] = vpx_convolve8_avg; } } else { if (sf->y_step_q4 == 16) { // No scaling in the y direction. Must always scale in the x direction. - sf->predict[0][0][0] = vp9_convolve8_horiz; - sf->predict[0][0][1] = vp9_convolve8_avg_horiz; - sf->predict[0][1][0] = vp9_convolve8; - sf->predict[0][1][1] = vp9_convolve8_avg; - sf->predict[1][0][0] = vp9_convolve8_horiz; - sf->predict[1][0][1] = vp9_convolve8_avg_horiz; + sf->predict[0][0][0] = vpx_convolve8_horiz; + sf->predict[0][0][1] = vpx_convolve8_avg_horiz; + sf->predict[0][1][0] = vpx_convolve8; + sf->predict[0][1][1] = vpx_convolve8_avg; + sf->predict[1][0][0] = vpx_convolve8_horiz; + sf->predict[1][0][1] = vpx_convolve8_avg_horiz; } else { // Must always scale in both directions. - sf->predict[0][0][0] = vp9_convolve8; - sf->predict[0][0][1] = vp9_convolve8_avg; - sf->predict[0][1][0] = vp9_convolve8; - sf->predict[0][1][1] = vp9_convolve8_avg; - sf->predict[1][0][0] = vp9_convolve8; - sf->predict[1][0][1] = vp9_convolve8_avg; + sf->predict[0][0][0] = vpx_convolve8; + sf->predict[0][0][1] = vpx_convolve8_avg; + sf->predict[0][1][0] = vpx_convolve8; + sf->predict[0][1][1] = vpx_convolve8_avg; + sf->predict[1][0][0] = vpx_convolve8; + sf->predict[1][0][1] = vpx_convolve8_avg; } } // 2D subpel motion always gets filtered in both directions - sf->predict[1][1][0] = vp9_convolve8; - sf->predict[1][1][1] = vp9_convolve8_avg; + sf->predict[1][1][0] = vpx_convolve8; + sf->predict[1][1][1] = vpx_convolve8_avg; #if CONFIG_VP9_HIGHBITDEPTH if (use_highbd) { if (sf->x_step_q4 == 16) { if (sf->y_step_q4 == 16) { // No scaling in either direction. - sf->highbd_predict[0][0][0] = vp9_highbd_convolve_copy; - sf->highbd_predict[0][0][1] = vp9_highbd_convolve_avg; - sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert; - sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert; - sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz; - sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz; + sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz; } else { // No scaling in x direction. Must always scale in the y direction. 
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_vert; - sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_vert; - sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert; - sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert; - sf->highbd_predict[1][0][0] = vp9_highbd_convolve8; - sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg; + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg; } } else { if (sf->y_step_q4 == 16) { // No scaling in the y direction. Must always scale in the x direction. - sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_horiz; - sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_horiz; - sf->highbd_predict[0][1][0] = vp9_highbd_convolve8; - sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg; - sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz; - sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz; + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz; } else { // Must always scale in both directions. - sf->highbd_predict[0][0][0] = vp9_highbd_convolve8; - sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg; - sf->highbd_predict[0][1][0] = vp9_highbd_convolve8; - sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg; - sf->highbd_predict[1][0][0] = vp9_highbd_convolve8; - sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg; + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg; } } // 2D subpel motion always gets filtered in both directions. - sf->highbd_predict[1][1][0] = vp9_highbd_convolve8; - sf->highbd_predict[1][1][1] = vp9_highbd_convolve8_avg; + sf->highbd_predict[1][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg; } #endif } diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index a1601a72f..5e9104107 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -12,7 +12,7 @@ #define VP9_COMMON_VP9_SCALE_H_ #include "vp9/common/vp9_mv.h" -#include "vp9/common/vp9_convolve.h" +#include "vpx_dsp/vpx_convolve.h" #ifdef __cplusplus extern "C" { diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index c6d3bf19e..ecebe1efb 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -12,6 +12,7 @@ #include <stdlib.h> // qsort() #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vpx_dsp/bitreader_buffer.h" diff --git a/vp9/encoder/vp9_blockiness.c b/vp9/encoder/vp9_blockiness.c index b8629bd3b..fc3eac6c7 100644 --- a/vp9/encoder/vp9_blockiness.c +++ b/vp9/encoder/vp9_blockiness.c @@ -8,12 +8,14 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_convolve.h" #include "vp9/common/vp9_filter.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" static int horizontal_filter(const uint8_t *s) { diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 08134e152..f1d73790a 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -10,6 +10,7 @@ #include #include +#include "./vpx_dsp_rtcd.h" #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_reconinter.h" @@ -336,12 +337,12 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, } if (decision == FILTER_BLOCK) { - vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, + vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } else { // COPY_BLOCK - vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, + vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 53e747c26..8aee22756 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -12,11 +12,12 @@ #include #include -#include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vpx/internal/vpx_psnr.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" #include "vpx_scale/vpx_scale.h" @@ -2580,18 +2581,18 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, #if CONFIG_VP9_HIGHBITDEPTH if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, kernel[x_q4 & 0xf], 16 * src_w / dst_w, kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, 16 / factor, bd); } else { - vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, kernel[x_q4 & 0xf], 16 * src_w / dst_w, kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, 16 / factor); } #else - vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, kernel[x_q4 & 0xf], 16 * src_w / dst_w, kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, 16 / factor); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 34bf8a6a1..00f7dcdf3 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1504,15 +1504,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vp9_highbd_convolve_copy(best_pred->data, best_pred->stride, + vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, 0, NULL, 0, bw, bh, xd->bd); else - vp9_convolve_copy(best_pred->data, best_pred->stride, + vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, 0, NULL, 0, bw, bh); #else - vp9_convolve_copy(best_pred->data, best_pred->stride, + vpx_convolve_copy(best_pred->data, best_pred->stride, 
this_mode_pred->data, this_mode_pred->stride, NULL, 0, NULL, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1577,15 +1577,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vp9_highbd_convolve_copy(best_pred->data, best_pred->stride, + vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, bw, bh, xd->bd); else - vp9_convolve_copy(best_pred->data, best_pred->stride, + vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, bw, bh); #else - vp9_convolve_copy(best_pred->data, best_pred->stride, + vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index f46cad804..e5ae9594c 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -15,6 +15,9 @@ #include #include +#if CONFIG_VP9_HIGHBITDEPTH +#include "vpx_dsp/vpx_dsp_common.h" +#endif // CONFIG_VP9_HIGHBITDEPTH #include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_resize.h" diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 78ea63fa1..339bc8b6e 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -13,14 +13,10 @@ VP9_COMMON_SRCS-yes += vp9_iface_common.h VP9_COMMON_SRCS-yes += common/vp9_ppflags.h VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c VP9_COMMON_SRCS-yes += common/vp9_blockd.c -VP9_COMMON_SRCS-yes += common/vp9_convolve.c -VP9_COMMON_SRCS-yes += common/vp9_convolve.h VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c VP9_COMMON_SRCS-yes += common/vp9_entropy.c VP9_COMMON_SRCS-yes += common/vp9_entropymode.c VP9_COMMON_SRCS-yes += common/vp9_entropymv.c -VP9_COMMON_SRCS-yes += common/vp9_filter.c -VP9_COMMON_SRCS-yes += common/vp9_filter.h VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h VP9_COMMON_SRCS-yes += common/vp9_idct.c @@ -31,6 +27,8 @@ VP9_COMMON_SRCS-yes += common/vp9_entropy.h VP9_COMMON_SRCS-yes += common/vp9_entropymode.h VP9_COMMON_SRCS-yes += common/vp9_entropymv.h VP9_COMMON_SRCS-yes += common/vp9_enums.h +VP9_COMMON_SRCS-yes += common/vp9_filter.h +VP9_COMMON_SRCS-yes += common/vp9_filter.c VP9_COMMON_SRCS-yes += common/vp9_idct.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h VP9_COMMON_SRCS-yes += common/vp9_thread_common.h @@ -64,33 +62,16 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-yes += common/vp9_scan.c VP9_COMMON_SRCS-yes += common/vp9_scan.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm -VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c -VP9_COMMON_SRCS-$(HAVE_SSSE3) += 
common/x86/vp9_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif -ifeq ($(CONFIG_USE_X86INC),yes) -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm -endif - -ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm -endif - # common (c) VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c @@ -113,15 +94,6 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c endif # common (msa) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c @@ -151,11 +123,6 @@ endif # neon with assembly and intrinsics implementations. If both are available # prefer assembly. ifeq ($(HAVE_NEON_ASM), yes) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c @@ -167,11 +134,6 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) else ifeq ($(HAVE_NEON), yes) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c diff --git a/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/vpx_dsp/arm/vpx_convolve8_avg_neon.c new file mode 100644 index 000000000..5464250e6 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_avg_neon.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +static INLINE int32x4_t MULTIPLY_BY_Q0( + int16x4_t dsrc0, + int16x4_t dsrc1, + int16x4_t dsrc2, + int16x4_t dsrc3, + int16x4_t dsrc4, + int16x4_t dsrc5, + int16x4_t dsrc6, + int16x4_t dsrc7, + int16x8_t q0s16) { + int32x4_t qdst; + int16x4_t d0s16, d1s16; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + + qdst = vmull_lane_s16(dsrc0, d0s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); + qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); + return qdst; +} + +void vpx_convolve8_avg_horiz_neon( + const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, + int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, + int h) { + int width; + const uint8_t *s; + uint8_t *d; + uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; + uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; + uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + uint16x8x2_t q0x2u16; + uint8x8x2_t d0x2u8, d1x2u8; + uint32x2x2_t d0x2u32; + uint16x4x2_t d0x2u16, d1x2u16; + uint32x4x2_t q0x2u32; + + if (x_step_q4 != 16) { + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + return; +} + + q0s16 = vld1q_s16(filter_x); + + src -= 3; // adjust for taps + for (; h > 0; h -= 4) { // loop_horiz_v + s = src; + d24u8 = vld1_u8(s); + s += src_stride; + d25u8 = vld1_u8(s); + s += src_stride; + d26u8 = vld1_u8(s); + s += src_stride; + d27u8 = vld1_u8(s); + + q12u8 = vcombine_u8(d24u8, d25u8); + q13u8 = vcombine_u8(d26u8, d27u8); + + q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), + vreinterpretq_u16_u8(q13u8)); + d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); + d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); + d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); + d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); + d0x2u8 = vtrn_u8(d24u8, d25u8); + d1x2u8 = vtrn_u8(d26u8, d27u8); + + __builtin_prefetch(src + src_stride * 4); + __builtin_prefetch(src + src_stride * 5); + + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); + q10u16 = vmovl_u8(d1x2u8.val[0]); + q11u16 = vmovl_u8(d1x2u8.val[1]); + + src += 7; + d16u16 = vget_low_u16(q8u16); + d17u16 = vget_high_u16(q8u16); + d18u16 =
vget_low_u16(q9u16); + d19u16 = vget_high_u16(q9u16); + q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 + q9u16 = vcombine_u16(d17u16, d19u16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w; + width > 0; + width -= 4, src += 4, dst += 4) { // loop_horiz + s = src; + d28u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d29u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d31u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d30u32 = vld1_dup_u32((const uint32_t *)s); + + __builtin_prefetch(src + 64); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), + vreinterpret_u16_u32(d31u32)); + d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), + vreinterpret_u16_u32(d30u32)); + d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 + vreinterpret_u8_u16(d1x2u16.val[0])); // d29 + d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 + vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + + __builtin_prefetch(src + 64 + src_stride); + + q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); + q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), + vreinterpretq_u32_u8(q15u8)); + + d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); + d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); + q12u16 = vmovl_u8(d28u8); + q13u16 = vmovl_u8(d29u8); + + __builtin_prefetch(src + 64 + src_stride * 2); + + d = dst; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); + d += dst_stride; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, + d18s16, d19s16, d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, + d19s16, d23s16, d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, d27s16, d25s16, q0s16); + + __builtin_prefetch(src + 64 + src_stride * 3); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), + vreinterpret_u16_u8(d3u8)); + d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), + vreinterpret_u32_u16(d0x2u16.val[1])); + d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), + vreinterpret_u8_u32(d0x2u32.val[1])); + + q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); + + q1u8 = vrhaddq_u8(q1u8, q3u8); + + d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); + d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); 
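/* Rounding recap for the sequence above: FILTER_BITS == 7, so
 * vqrshrun_n_s32(sum, 7) narrows each 32-bit accumulator to
 * clamp((sum + 64) >> 7, 0, 255), and vrhaddq_u8(a, b) then forms the
 * rounded average (a + b + 1) >> 1 against the destination pixels,
 * matching ROUND_POWER_OF_TWO(dst[x] + pixel, 1) in the scalar
 * vpx_convolve8_avg_horiz_c() reference. */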
+ + d = dst; + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + + q8u16 = q9u16; + d20s16 = d23s16; + q11u16 = q12u16; + q9u16 = q13u16; + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + } + src += src_stride * 4 - w - 7; + dst += dst_stride * 4 - w; + } + return; +} + +void vpx_convolve8_avg_vert_neon( + const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, + int y_step_q4, + int w, + int h) { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t d2u8, d3u8; + uint32x2_t d2u32, d3u32, d6u32, d7u32; + uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; + uint8x16_t q1u8, q3u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + + if (y_step_q4 != 16) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + return; + } + + src -= src_stride * 3; + q0s16 = vld1q_s16(filter_y); + for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h + s = src; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); + s += src_stride; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); + s += src_stride; + d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); + s += src_stride; + + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); + q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d = dst; + for (height = h; height > 0; height -= 4) { // loop_vert + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); + s += src_stride; + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); + s += src_stride; + + q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); + q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); + + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); + d += dst_stride; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); + d -= dst_stride * 3; + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + 
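/* The vertical pass keeps an 8-row window of source pixels in registers:
 * each inner-loop iteration filters four output rows, then the loop tail
 * (q8u16 = q10u16; ... d22s16 = d25s16;) slides the window down by four
 * rows so only four new rows are loaded per iteration instead of eight. */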
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + __builtin_prefetch(s); + __builtin_prefetch(s + src_stride); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, + d20s16, d21s16, d22s16, d24s16, q0s16); + __builtin_prefetch(s + src_stride * 2); + __builtin_prefetch(s + src_stride * 3); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, + d21s16, d22s16, d24s16, d26s16, q0s16); + __builtin_prefetch(d); + __builtin_prefetch(d + dst_stride); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, d26s16, d27s16, q0s16); + __builtin_prefetch(d + dst_stride * 2); + __builtin_prefetch(d + dst_stride * 3); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, d27s16, d25s16, q0s16); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + q1u8 = vcombine_u8(d2u8, d3u8); + q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); + + q1u8 = vrhaddq_u8(q1u8, q3u8); + + d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); + d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); + + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + d += dst_stride; + + q8u16 = q10u16; + d18s16 = d22s16; + d19s16 = d24s16; + q10u16 = q13u16; + d22s16 = d25s16; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm similarity index 92% rename from vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm rename to vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm index 4d85846f0..a19f97db7 100644 --- a/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm +++ b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm @@ -17,10 +17,10 @@ ; VP9_FILTER_WEIGHT == 128 ; VP9_FILTER_SHIFT == 7 - EXPORT |vp9_convolve8_avg_horiz_neon| - EXPORT |vp9_convolve8_avg_vert_neon| - IMPORT |vp9_convolve8_avg_horiz_c| - IMPORT |vp9_convolve8_avg_vert_c| + EXPORT |vpx_convolve8_avg_horiz_neon| + EXPORT |vpx_convolve8_avg_vert_neon| + IMPORT |vpx_convolve8_avg_horiz_c| + IMPORT |vpx_convolve8_avg_vert_c| ARM REQUIRE8 PRESERVE8 @@ -51,10 +51,10 @@ ; sp[]int w ; sp[]int h -|vp9_convolve8_avg_horiz_neon| PROC +|vpx_convolve8_avg_horiz_neon| PROC ldr r12, [sp, #4] ; x_step_q4 cmp r12, #16 - bne vp9_convolve8_avg_horiz_c + bne vpx_convolve8_avg_horiz_c push {r4-r10, lr} @@ -78,7 +78,7 @@ mov r10, r6 ; w loop counter -vp9_convolve8_avg_loop_horiz_v +vpx_convolve8_avg_loop_horiz_v vld1.8 {d24}, [r0], r1 vld1.8 {d25}, [r0], r1 vld1.8 {d26}, [r0], r1 @@ -101,7 +101,7 @@ vp9_convolve8_avg_loop_horiz_v add r0, r0, #3 -vp9_convolve8_avg_loop_horiz +vpx_convolve8_avg_loop_horiz add r5, r0, #64 vld1.32 {d28[]}, [r0], r1 @@ -170,23 +170,23 @@ vp9_convolve8_avg_loop_horiz vmov q9, q13 subs r6, r6, #4 ; w -= 4 - bgt vp9_convolve8_avg_loop_horiz + bgt vpx_convolve8_avg_loop_horiz ; outer loop mov r6, r10 ; restore w counter add r0, r0, r9 ; src += src_stride * 4 - w add r2, r2, r12 ; dst += dst_stride * 4 - w subs r7, r7, #4 ; h -= 4 - bgt vp9_convolve8_avg_loop_horiz_v + bgt vpx_convolve8_avg_loop_horiz_v pop {r4-r10, pc} ENDP -|vp9_convolve8_avg_vert_neon| PROC +|vpx_convolve8_avg_vert_neon| PROC ldr r12, [sp, #12] cmp r12, #16 - bne vp9_convolve8_avg_vert_c + 
bne vpx_convolve8_avg_vert_c push {r4-r8, lr} @@ -203,7 +203,7 @@ vp9_convolve8_avg_loop_horiz lsl r1, r1, #1 lsl r3, r3, #1 -vp9_convolve8_avg_loop_vert_h +vpx_convolve8_avg_loop_vert_h mov r4, r0 add r7, r0, r1, asr #1 mov r5, r2 @@ -223,7 +223,7 @@ vp9_convolve8_avg_loop_vert_h vmovl.u8 q10, d20 vmovl.u8 q11, d22 -vp9_convolve8_avg_loop_vert +vpx_convolve8_avg_loop_vert ; always process a 4x4 block at a time vld1.u32 {d24[0]}, [r7], r1 vld1.u32 {d26[0]}, [r4], r1 @@ -288,13 +288,13 @@ vp9_convolve8_avg_loop_vert vmov d22, d25 subs r12, r12, #4 ; h -= 4 - bgt vp9_convolve8_avg_loop_vert + bgt vpx_convolve8_avg_loop_vert ; outer loop add r0, r0, #4 add r2, r2, #4 subs r6, r6, #4 ; w -= 4 - bgt vp9_convolve8_avg_loop_vert_h + bgt vpx_convolve8_avg_loop_vert_h pop {r4-r8, pc} diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c new file mode 100644 index 000000000..6f634b3c7 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +static INLINE int32x4_t MULTIPLY_BY_Q0( + int16x4_t dsrc0, + int16x4_t dsrc1, + int16x4_t dsrc2, + int16x4_t dsrc3, + int16x4_t dsrc4, + int16x4_t dsrc5, + int16x4_t dsrc6, + int16x4_t dsrc7, + int16x8_t q0s16) { + int32x4_t qdst; + int16x4_t d0s16, d1s16; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + + qdst = vmull_lane_s16(dsrc0, d0s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); + qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); + return qdst; +} + +void vpx_convolve8_horiz_neon( + const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, + int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, + int h) { + int width; + const uint8_t *s, *psrc; + uint8_t *d, *pdst; + uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; + uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; + uint8x16_t q12u8, q13u8, q14u8, q15u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + uint16x8x2_t q0x2u16; + uint8x8x2_t d0x2u8, d1x2u8; + uint32x2x2_t d0x2u32; +
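/* MULTIPLY_BY_Q0() above expands the 8-tap dot product
 *   sum = s[0]*f[0] + s[1]*f[1] + ... + s[7]*f[7]
 * into one vmull_lane_s16 plus seven vmlal_lane_s16, with all eight
 * filter taps held in a single int16x8_t (q0s16) and selected by lane,
 * so four adjacent outputs accumulate per call. */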
uint16x4x2_t d0x2u16, d1x2u16; + uint32x4x2_t q0x2u32; + + if (x_step_q4 != 16) { + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + return; + } + + q0s16 = vld1q_s16(filter_x); + + src -= 3; // adjust for taps + for (; h > 0; h -= 4, + src += src_stride * 4, + dst += dst_stride * 4) { // loop_horiz_v + s = src; + d24u8 = vld1_u8(s); + s += src_stride; + d25u8 = vld1_u8(s); + s += src_stride; + d26u8 = vld1_u8(s); + s += src_stride; + d27u8 = vld1_u8(s); + + q12u8 = vcombine_u8(d24u8, d25u8); + q13u8 = vcombine_u8(d26u8, d27u8); + + q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), + vreinterpretq_u16_u8(q13u8)); + d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); + d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); + d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); + d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); + d0x2u8 = vtrn_u8(d24u8, d25u8); + d1x2u8 = vtrn_u8(d26u8, d27u8); + + __builtin_prefetch(src + src_stride * 4); + __builtin_prefetch(src + src_stride * 5); + __builtin_prefetch(src + src_stride * 6); + + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); + q10u16 = vmovl_u8(d1x2u8.val[0]); + q11u16 = vmovl_u8(d1x2u8.val[1]); + + d16u16 = vget_low_u16(q8u16); + d17u16 = vget_high_u16(q8u16); + d18u16 = vget_low_u16(q9u16); + d19u16 = vget_high_u16(q9u16); + q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 + q9u16 = vcombine_u16(d17u16, d19u16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w, psrc = src + 7, pdst = dst; + width > 0; + width -= 4, psrc += 4, pdst += 4) { // loop_horiz + s = psrc; + d28u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d29u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d31u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d30u32 = vld1_dup_u32((const uint32_t *)s); + + __builtin_prefetch(psrc + 64); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), + vreinterpret_u16_u32(d31u32)); + d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), + vreinterpret_u16_u32(d30u32)); + d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 + vreinterpret_u8_u16(d1x2u16.val[0])); // d29 + d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 + vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + + __builtin_prefetch(psrc + 64 + src_stride); + + q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); + q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), + vreinterpretq_u32_u8(q15u8)); + + d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); + d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); + q12u16 = vmovl_u8(d28u8); + q13u16 = vmovl_u8(d29u8); + + __builtin_prefetch(psrc + 64 + src_stride * 2); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, + d18s16, d19s16, d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, + d19s16, 
d23s16, d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, d27s16, d25s16, q0s16); + + __builtin_prefetch(psrc + 60 + src_stride * 3); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), + vreinterpret_u16_u8(d3u8)); + d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), + vreinterpret_u32_u16(d0x2u16.val[1])); + d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), + vreinterpret_u8_u32(d0x2u32.val[1])); + + d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); + d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); + + d = pdst; + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + + q8u16 = q9u16; + d20s16 = d23s16; + q11u16 = q12u16; + q9u16 = q13u16; + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + } + } + return; +} + +void vpx_convolve8_vert_neon( + const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, + int y_step_q4, + int w, + int h) { + int height; + const uint8_t *s; + uint8_t *d; + uint32x2_t d2u32, d3u32; + uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + + if (y_step_q4 != 16) { + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + return; + } + + src -= src_stride * 3; + q0s16 = vld1q_s16(filter_y); + for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h + s = src; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); + s += src_stride; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); + s += src_stride; + d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); + s += src_stride; + + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); + q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d = dst; + for (height = h; height > 0; height -= 4) { // loop_vert + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); + s += src_stride; + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); + s += 
src_stride; + + q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); + q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + __builtin_prefetch(d); + __builtin_prefetch(d + dst_stride); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, + d20s16, d21s16, d22s16, d24s16, q0s16); + __builtin_prefetch(d + dst_stride * 2); + __builtin_prefetch(d + dst_stride * 3); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, + d21s16, d22s16, d24s16, d26s16, q0s16); + __builtin_prefetch(s); + __builtin_prefetch(s + src_stride); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, d26s16, d27s16, q0s16); + __builtin_prefetch(s + src_stride * 2); + __builtin_prefetch(s + src_stride * 3); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, d27s16, d25s16, q0s16); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); + d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); + + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + d += dst_stride; + + q8u16 = q10u16; + d18s16 = d22s16; + d19s16 = d24s16; + q10u16 = q13u16; + d22s16 = d25s16; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm similarity index 92% rename from vp9/common/arm/neon/vp9_convolve8_neon_asm.asm rename to vpx_dsp/arm/vpx_convolve8_neon_asm.asm index 184c3ad67..bc530de41 100644 --- a/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm +++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm @@ -17,10 +17,10 @@ ; VP9_FILTER_WEIGHT == 128 ; VP9_FILTER_SHIFT == 7 - EXPORT |vp9_convolve8_horiz_neon| - EXPORT |vp9_convolve8_vert_neon| - IMPORT |vp9_convolve8_horiz_c| - IMPORT |vp9_convolve8_vert_c| + EXPORT |vpx_convolve8_horiz_neon| + EXPORT |vpx_convolve8_vert_neon| + IMPORT |vpx_convolve8_horiz_c| + IMPORT |vpx_convolve8_vert_c| ARM REQUIRE8 PRESERVE8 @@ -51,10 +51,10 @@ ; sp[]int w ; sp[]int h -|vp9_convolve8_horiz_neon| PROC +|vpx_convolve8_horiz_neon| PROC ldr r12, [sp, #4] ; x_step_q4 cmp r12, #16 - bne vp9_convolve8_horiz_c + bne vpx_convolve8_horiz_c push {r4-r10, lr} @@ -78,7 +78,7 @@ mov r10, r6 ; w loop counter -vp9_convolve8_loop_horiz_v +vpx_convolve8_loop_horiz_v vld1.8 {d24}, [r0], r1 vld1.8 {d25}, [r0], r1 vld1.8 {d26}, [r0], r1 @@ -101,7 +101,7 @@ vp9_convolve8_loop_horiz_v add r0, r0, #3 -vp9_convolve8_loop_horiz +vpx_convolve8_loop_horiz add r5, r0, #64 vld1.32 {d28[]}, [r0], r1 @@ -159,23 +159,23 @@ vp9_convolve8_loop_horiz vmov q9, q13 subs r6, r6, #4 ; w -= 4 - bgt vp9_convolve8_loop_horiz + bgt vpx_convolve8_loop_horiz ; outer loop mov r6, r10 ; restore w counter add r0, r0, r9 ; src += src_stride * 4 - w add r2, r2, r12 ; dst += dst_stride * 4 - w subs r7, r7, #4 ; h -= 4 - bgt 
vp9_convolve8_loop_horiz_v + bgt vpx_convolve8_loop_horiz_v pop {r4-r10, pc} ENDP -|vp9_convolve8_vert_neon| PROC +|vpx_convolve8_vert_neon| PROC ldr r12, [sp, #12] cmp r12, #16 - bne vp9_convolve8_vert_c + bne vpx_convolve8_vert_c push {r4-r8, lr} @@ -192,7 +192,7 @@ vp9_convolve8_loop_horiz lsl r1, r1, #1 lsl r3, r3, #1 -vp9_convolve8_loop_vert_h +vpx_convolve8_loop_vert_h mov r4, r0 add r7, r0, r1, asr #1 mov r5, r2 @@ -212,7 +212,7 @@ vp9_convolve8_loop_vert_h vmovl.u8 q10, d20 vmovl.u8 q11, d22 -vp9_convolve8_loop_vert +vpx_convolve8_loop_vert ; always process a 4x4 block at a time vld1.u32 {d24[0]}, [r7], r1 vld1.u32 {d26[0]}, [r4], r1 @@ -266,13 +266,13 @@ vp9_convolve8_loop_vert vmov d22, d25 subs r12, r12, #4 ; h -= 4 - bgt vp9_convolve8_loop_vert + bgt vpx_convolve8_loop_vert ; outer loop add r0, r0, #4 add r2, r2, #4 subs r6, r6, #4 ; w -= 4 - bgt vp9_convolve8_loop_vert_h + bgt vpx_convolve8_loop_vert_h pop {r4-r8, pc} diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c new file mode 100644 index 000000000..dc58a332f --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_convolve_avg_neon( + const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, + int h) { + uint8_t *d; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint32x2_t d0u32, d2u32; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + + d = dst; + if (w > 32) { // avg64 + for (; h > 0; h -= 1) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + q2u8 = vld1q_u8(src + 32); + q3u8 = vld1q_u8(src + 48); + src += src_stride; + q8u8 = vld1q_u8(d); + q9u8 = vld1q_u8(d + 16); + q10u8 = vld1q_u8(d + 32); + q11u8 = vld1q_u8(d + 48); + d += dst_stride; + + q0u8 = vrhaddq_u8(q0u8, q8u8); + q1u8 = vrhaddq_u8(q1u8, q9u8); + q2u8 = vrhaddq_u8(q2u8, q10u8); + q3u8 = vrhaddq_u8(q3u8, q11u8); + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + vst1q_u8(dst + 32, q2u8); + vst1q_u8(dst + 48, q3u8); + dst += dst_stride; + } + } else if (w == 32) { // avg32 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + src += src_stride; + q2u8 = vld1q_u8(src); + q3u8 = vld1q_u8(src + 16); + src += src_stride; + q8u8 = vld1q_u8(d); + q9u8 = vld1q_u8(d + 16); + d += dst_stride; + q10u8 = vld1q_u8(d); + q11u8 = vld1q_u8(d + 16); + d += dst_stride; + + q0u8 = vrhaddq_u8(q0u8, q8u8); + q1u8 = vrhaddq_u8(q1u8, q9u8); + q2u8 = vrhaddq_u8(q2u8, q10u8); + q3u8 = vrhaddq_u8(q3u8, q11u8); + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + dst += dst_stride; + vst1q_u8(dst, q2u8); + vst1q_u8(dst + 16, q3u8); + dst += dst_stride; + } + } else if (w > 8) { // avg16 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + src += src_stride; + q1u8 = vld1q_u8(src); + src += src_stride; + q2u8 = vld1q_u8(d); + d += dst_stride; + q3u8 = vld1q_u8(d);
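/* Width dispatch: w is a power of two in {4, 8, 16, 32, 64}, so each
 * branch uses the widest NEON load that fits and unrolls one or two rows
 * per iteration. The filter arguments are unused (copy and avg serve the
 * subpel_x == 0 && subpel_y == 0 case), hence the (void) casts at the
 * top of the function. */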
+ d += dst_stride; + + q0u8 = vrhaddq_u8(q0u8, q2u8); + q1u8 = vrhaddq_u8(q1u8, q3u8); + + vst1q_u8(dst, q0u8); + dst += dst_stride; + vst1q_u8(dst, q1u8); + dst += dst_stride; + } + } else if (w == 8) { // avg8 + for (; h > 0; h -= 2) { + d0u8 = vld1_u8(src); + src += src_stride; + d1u8 = vld1_u8(src); + src += src_stride; + d2u8 = vld1_u8(d); + d += dst_stride; + d3u8 = vld1_u8(d); + d += dst_stride; + + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + q0u8 = vrhaddq_u8(q0u8, q1u8); + + vst1_u8(dst, vget_low_u8(q0u8)); + dst += dst_stride; + vst1_u8(dst, vget_high_u8(q0u8)); + dst += dst_stride; + } + } else { // avg4 + for (; h > 0; h -= 2) { + d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0); + src += src_stride; + d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1); + src += src_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0); + d += dst_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); + d += dst_stride; + + d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), + vreinterpret_u8_u32(d2u32)); + + d0u32 = vreinterpret_u32_u8(d0u8); + vst1_lane_u32((uint32_t *)dst, d0u32, 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, d0u32, 1); + dst += dst_stride; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm similarity index 98% rename from vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm rename to vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm index 7d2453021..97e6189fd 100644 --- a/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm +++ b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm @@ -8,14 +8,14 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_convolve_avg_neon| + EXPORT |vpx_convolve_avg_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -|vp9_convolve_avg_neon| PROC +|vpx_convolve_avg_neon| PROC push {r4-r6, lr} ldrd r4, r5, [sp, #32] mov r6, r2 diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c new file mode 100644 index 000000000..d8fb97a86 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_convolve_copy_neon( + const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, + int filter_x_stride, + const int16_t *filter_y, + int filter_y_stride, + int w, + int h) { + uint8x8_t d0u8, d2u8; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + + if (w > 32) { // copy64 + for (; h > 0; h--) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + q2u8 = vld1q_u8(src + 32); + q3u8 = vld1q_u8(src + 48); + src += src_stride; + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + vst1q_u8(dst + 32, q2u8); + vst1q_u8(dst + 48, q3u8); + dst += dst_stride; + } + } else if (w == 32) { // copy32 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + src += src_stride; + q2u8 = vld1q_u8(src); + q3u8 = vld1q_u8(src + 16); + src += src_stride; + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + dst += dst_stride; + vst1q_u8(dst, q2u8); + vst1q_u8(dst + 16, q3u8); + dst += dst_stride; + } + } else if (w > 8) { // copy16 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + src += src_stride; + q1u8 = vld1q_u8(src); + src += src_stride; + + vst1q_u8(dst, q0u8); + dst += dst_stride; + vst1q_u8(dst, q1u8); + dst += dst_stride; + } + } else if (w == 8) { // copy8 + for (; h > 0; h -= 2) { + d0u8 = vld1_u8(src); + src += src_stride; + d2u8 = vld1_u8(src); + src += src_stride; + + vst1_u8(dst, d0u8); + dst += dst_stride; + vst1_u8(dst, d2u8); + dst += dst_stride; + } + } else { // copy4 + for (; h > 0; h--) { + *(uint32_t *)dst = *(const uint32_t *)src; + src += src_stride; + dst += dst_stride; + } + } + return; +} diff --git a/vp9/common/arm/neon/vp9_copy_neon_asm.asm b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm similarity index 97% rename from vp9/common/arm/neon/vp9_copy_neon_asm.asm rename to vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm index a0bd04a35..89164ad48 100644 --- a/vp9/common/arm/neon/vp9_copy_neon_asm.asm +++ b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm @@ -8,14 +8,14 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_convolve_copy_neon| + EXPORT |vpx_convolve_copy_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -|vp9_convolve_copy_neon| PROC +|vpx_convolve_copy_neon| PROC push {r4-r5, lr} ldrd r4, r5, [sp, #28] diff --git a/vp9/common/arm/neon/vp9_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c similarity index 85% rename from vp9/common/arm/neon/vp9_convolve_neon.c rename to vpx_dsp/arm/vpx_convolve_neon.c index 2e28cb20e..2c0f7e9e7 100644 --- a/vp9/common/arm/neon/vp9_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" -void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -26,7 +26,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, int intermediate_height = h + 7; if (x_step_q4 != 16 || y_step_q4 != 16) { - vp9_convolve8_c(src, src_stride, + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, @@ -39,19 +39,19 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, * the temp buffer which has lots of extra room and is subsequently discarded * this is safe if somewhat less than ideal. */ - vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height); /* Step into the temp buffer 3 lines to get the actual frame data */ - vp9_convolve8_vert_neon(temp + 64 * 3, 64, + vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } -void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -60,7 +60,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, int intermediate_height = h + 7; if (x_step_q4 != 16 || y_step_q4 != 16) { - vp9_convolve8_avg_c(src, src_stride, + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, @@ -71,11 +71,11 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. */ - vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height); - vp9_convolve8_avg_vert_neon(temp + 64 * 3, + vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index dc8aca5c3..66f4d9576 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" diff --git a/vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c similarity index 98% rename from vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c rename to vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index 89364cb95..c5ad08cc9 100644 --- a/vp9/common/mips/msa/vp9_convolve8_avg_horiz_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_convolve_msa.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, @@ -687,7 +687,7 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, } } -void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -695,14 +695,14 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, int8_t cnt, filt_hor[8]; if (16 != x_step_q4) { - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; } if (((const int32_t *)filter_x)[1] == 0x800000) { - vp9_convolve_avg(src, src_stride, dst, dst_stride, + vpx_convolve_avg(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; @@ -740,7 +740,7 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], h); break; default: - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; @@ -773,7 +773,7 @@ void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, h); break; default: - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; diff --git a/vp9/common/mips/msa/vp9_convolve8_avg_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_msa.c similarity index 98% rename from vp9/common/mips/msa/vp9_convolve8_avg_msa.c rename to vpx_dsp/mips/vpx_convolve8_avg_msa.c index e9f3a9dc3..f46df6782 100644 --- a/vp9/common/mips/msa/vp9_convolve8_avg_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
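/* [Sketch, not part of the patch] What the 0x800000 test above recognizes:
 * the unit kernel {0, 0, 0, 128, 0, 0, 0, 0}. On the little-endian MSA
 * targets, taps [2] and [3] read as one int32 give (128 << 16) | 0, so the
 * convolution degenerates to a plain copy/average and is dispatched to
 * vpx_convolve_copy/vpx_convolve_avg instead: */
#include <stdint.h>

static int is_unit_kernel(const int16_t *filter) {
  /* filter[3] == 128 and filter[2] == 0, assuming little-endian layout */
  return ((const int32_t *)filter)[1] == 0x800000;
}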
*/ -#include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_convolve_msa.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, @@ -576,7 +576,7 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src, } } -void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -584,7 +584,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, int8_t cnt, filt_hor[8], filt_ver[8]; if (16 != x_step_q4 || 16 != y_step_q4) { - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; @@ -592,7 +592,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_x)[1] == 0x800000 && ((const int32_t *)filter_y)[1] == 0x800000) { - vp9_convolve_avg(src, src_stride, dst, dst_stride, + vpx_convolve_avg(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; @@ -632,14 +632,14 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } else { @@ -670,7 +670,7 @@ void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, filt_ver, h); break; default: - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; diff --git a/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c similarity index 98% rename from vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c rename to vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 85dfd30c7..4d4b3d9e3 100644 --- a/vp9/common/mips/msa/vp9_convolve8_avg_vert_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
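/* [Sketch, not part of the patch] The vpx_convolve_avg fallback taken above,
 * and the operation the _and_aver_dst_ MSA kernels fuse into their stores: a
 * rounded average of the new prediction with what is already in dst. */
#include <stddef.h>
#include <stdint.h>

static void avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                    uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);  /* round to nearest */
    src += src_stride;
    dst += dst_stride;
  }
}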
*/ -#include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_convolve_msa.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, @@ -657,7 +657,7 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, } } -void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -665,14 +665,14 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, int8_t cnt, filt_ver[8]; if (16 != y_step_q4) { - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; } if (((const int32_t *)filter_y)[1] == 0x800000) { - vp9_convolve_avg(src, src_stride, dst, dst_stride, + vpx_convolve_avg(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; @@ -710,7 +710,7 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; default: - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; @@ -744,7 +744,7 @@ void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; diff --git a/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c similarity index 98% rename from vp9/common/mips/msa/vp9_convolve8_horiz_msa.c rename to vpx_dsp/mips/vpx_convolve8_horiz_msa.c index f175bf9b6..2bb314d7f 100644 --- a/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
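/* [Sketch, not part of the patch] The q4 convention behind the
 * "16 != x_step_q4" / "16 != y_step_q4" guards above: sub-pixel positions
 * advance in 1/16-pel units, so a step of 16 means exactly one pel per output
 * pixel (unscaled prediction). The MSA paths handle only that case; scaled
 * steps fall back to the C versions. */
static void subpel_pos(int pos_q4, int *pel, int *phase) {
  *pel = pos_q4 >> 4;    /* integer pixel position */
  *phase = pos_q4 & 15;  /* one of 16 sub-pel phases, selects the kernel */
}

static int is_unscaled(int step_q4) {
  return step_q4 == 16;  /* integer position advances by exactly 1 per pixel */
}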
*/ -#include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_convolve_msa.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, @@ -647,7 +647,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, } } -void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -655,14 +655,14 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, int8_t cnt, filt_hor[8]; if (16 != x_step_q4) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; } if (((const int32_t *)filter_x)[1] == 0x800000) { - vp9_convolve_copy(src, src_stride, dst, dst_stride, + vpx_convolve_copy(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; @@ -700,7 +700,7 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], h); break; default: - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; @@ -733,7 +733,7 @@ void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, h); break; default: - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; diff --git a/vp9/common/mips/msa/vp9_convolve8_msa.c b/vpx_dsp/mips/vpx_convolve8_msa.c similarity index 98% rename from vp9/common/mips/msa/vp9_convolve8_msa.c rename to vpx_dsp/mips/vpx_convolve8_msa.c index b1279d97c..52f2028cd 100644 --- a/vp9/common/mips/msa/vp9_convolve8_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_msa.c @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_convolve_msa.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" const uint8_t mc_filt_mask_arr[16 * 3] = { /* 8 width cases */ @@ -551,7 +551,7 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, } } -void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int32_t x_step_q4, const int16_t *filter_y, int32_t y_step_q4, @@ -559,7 +559,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, int8_t cnt, filt_hor[8], filt_ver[8]; if (16 != x_step_q4 || 16 != y_step_q4) { - vp9_convolve8_c(src, src_stride, dst, dst_stride, + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; @@ -567,7 +567,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_x)[1] == 0x800000 && ((const int32_t *)filter_y)[1] == 0x800000) { - vp9_convolve_copy(src, src_stride, dst, dst_stride, + vpx_convolve_copy(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; @@ -607,14 +607,14 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], &filt_ver[3], (int32_t)h); break; default: - vp9_convolve8_c(src, src_stride, dst, dst_stride, + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vp9_convolve8_c(src, src_stride, dst, dst_stride, + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } else { @@ -645,7 +645,7 @@ void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, filt_ver, (int32_t)h); break; default: - vp9_convolve8_c(src, src_stride, dst, dst_stride, + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; diff --git a/vp9/common/mips/msa/vp9_convolve8_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_vert_msa.c similarity index 98% rename from vp9/common/mips/msa/vp9_convolve8_vert_msa.c rename to vpx_dsp/mips/vpx_convolve8_vert_msa.c index e9ec2507a..85f175760 100644 --- a/vp9/common/mips/msa/vp9_convolve8_vert_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
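/* [Sketch, not part of the patch] The ((const int32_t *)filter)[0] tests
 * above: in vp9's filter tables a bilinear kernel has the form
 * {0, 0, 0, 128 - k, k, 0, 0, 0}, so taps [0] and [1] read as one int32 are
 * zero. That selects the cheap 2-tap (_2ht_2vt_) path, which uses only taps
 * [3] and [4]; mixing one bilinear and one full 8-tap kernel falls back to
 * the C function. */
#include <stdint.h>

static int is_bilinear_kernel(const int16_t *filter) {
  return ((const int32_t *)filter)[0] == 0;  /* taps[0] == taps[1] == 0 */
}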
*/ -#include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_convolve_msa.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, @@ -650,7 +650,7 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, } } -void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -658,14 +658,14 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, int8_t cnt, filt_ver[8]; if (16 != y_step_q4) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; } if (((const int32_t *)filter_y)[1] == 0x800000) { - vp9_convolve_copy(src, src_stride, dst, dst_stride, + vpx_convolve_copy(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); return; @@ -703,7 +703,7 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; default: - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; @@ -736,7 +736,7 @@ void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); break; diff --git a/vp9/common/mips/msa/vp9_convolve_avg_msa.c b/vpx_dsp/mips/vpx_convolve_avg_msa.c similarity index 99% rename from vp9/common/mips/msa/vp9_convolve_avg_msa.c rename to vpx_dsp/mips/vpx_convolve_avg_msa.c index 7c11e4065..4c3d97803 100644 --- a/vp9/common/mips/msa/vp9_convolve_avg_msa.c +++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -186,7 +186,7 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, } } -void vp9_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int32_t filter_x_stride, const int16_t *filter_y, int32_t filter_y_stride, diff --git a/vp9/common/mips/msa/vp9_convolve_copy_msa.c b/vpx_dsp/mips/vpx_convolve_copy_msa.c similarity index 99% rename from vp9/common/mips/msa/vp9_convolve_copy_msa.c rename to vpx_dsp/mips/vpx_convolve_copy_msa.c index 39a0b24d5..ba4012281 100644 --- a/vp9/common/mips/msa/vp9_convolve_copy_msa.c +++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -196,7 +196,7 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride, copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); } -void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int32_t filter_x_stride, const int16_t *filter_y, int32_t filter_y_stride, diff --git a/vp9/common/mips/msa/vp9_convolve_msa.h b/vpx_dsp/mips/vpx_convolve_msa.h similarity index 97% rename from vp9/common/mips/msa/vp9_convolve_msa.h rename to vpx_dsp/mips/vpx_convolve_msa.h index 71c616b67..e0013983a 100644 --- a/vp9/common/mips/msa/vp9_convolve_msa.h +++ b/vpx_dsp/mips/vpx_convolve_msa.h @@ -8,11 +8,11 @@ * be 
found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ -#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ +#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ +#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ -#include "vp9/common/vp9_filter.h" #include "vpx_dsp/mips/macros_msa.h" +#include "vpx_dsp/vpx_filter.h" extern const uint8_t mc_filt_mask_arr[16 * 3]; @@ -116,4 +116,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } -#endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */ +#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ diff --git a/vp9/common/vp9_convolve.c b/vpx_dsp/vpx_convolve.c similarity index 93% rename from vp9/common/vp9_convolve.c rename to vpx_dsp/vpx_convolve.c index 90e337fd6..f06da3d34 100644 --- a/vp9/common/vp9_convolve.c +++ b/vpx_dsp/vpx_convolve.c @@ -9,13 +9,14 @@ */ #include <assert.h> +#include <string.h> #include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_convolve.h" -#include "vp9/common/vp9_filter.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, @@ -154,7 +155,7 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) { return (int)((const InterpKernel *)(intptr_t)f - base); } -void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -169,7 +170,7 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x0_q4, x_step_q4, w, h); } -void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -184,7 +185,7 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x0_q4, x_step_q4, w, h); } -void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -199,7 +200,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, y0_q4, y_step_q4, w, h); } -void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -214,7 +215,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, y0_q4, y_step_q4, w, h); } -void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -230,7 +231,7 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, filters_y, y0_q4, y_step_q4, w, h); } -void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t
*dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -240,12 +241,12 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vp9_convolve8_c(src, src_stride, temp, 64, + vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, y_step_q4, w, h); - vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); } -void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, @@ -262,7 +263,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, } } -void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, @@ -423,7 +424,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, } -void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -437,7 +438,7 @@ void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x0_q4, x_step_q4, w, h, bd); } -void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -451,7 +452,7 @@ void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x0_q4, x_step_q4, w, h, bd); } -void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -465,7 +466,7 @@ void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, y0_q4, y_step_q4, w, h, bd); } -void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -479,7 +480,7 @@ void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, y0_q4, y_step_q4, w, h, bd); } -void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -495,7 +496,7 @@ void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, filters_y, y0_q4, y_step_q4, w, h, bd); } -void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, +void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -505,13 +506,13 @@ void vp9_highbd_convolve8_avg_c(const 
uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vp9_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, + vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vp9_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, + vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, 0, NULL, 0, w, h, bd); } -void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, +void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, @@ -532,7 +533,7 @@ void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, } } -void vp9_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, +void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, diff --git a/vp9/common/vp9_convolve.h b/vpx_dsp/vpx_convolve.h similarity index 92% rename from vp9/common/vp9_convolve.h rename to vpx_dsp/vpx_convolve.h index 8b044c897..9ed3f1750 100644 --- a/vp9/common/vp9_convolve.h +++ b/vpx_dsp/vpx_convolve.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_CONVOLVE_H_ -#define VP9_COMMON_VP9_CONVOLVE_H_ +#ifndef VPX_DSP_VPX_CONVOLVE_H_ +#define VPX_DSP_VPX_CONVOLVE_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -35,4 +35,4 @@ typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, } // extern "C" #endif -#endif // VP9_COMMON_VP9_CONVOLVE_H_ +#endif // VPX_DSP_VPX_CONVOLVE_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ff2bb5236..de4578257 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -54,6 +54,54 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c endif # CONFIG_VP9 +# interpolation filters +DSP_SRCS-yes += vpx_convolve.c +DSP_SRCS-yes += vpx_convolve.h +DSP_SRCS-yes += vpx_filter.h + +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm +DSP_SRCS-$(HAVE_AVX2) += x86/vpx_subpixel_8t_intrin_avx2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm +endif +ifeq ($(CONFIG_USE_X86INC),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm +endif + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve_neon.c +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c +DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c +DSP_SRCS-yes += arm/vpx_convolve8_neon.c +DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c +DSP_SRCS-yes += 
arm/vpx_convolve_neon.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM + +# common (msa) +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h + # loop filters DSP_SRCS-yes += loopfilter.c diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 1fd7c153a..ccb818959 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -11,8 +11,6 @@ #ifndef VPX_DSP_COMMON_H_ #define VPX_DSP_COMMON_H_ -#include <stdlib.h> - #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 85b25229a..6e63271eb 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -34,6 +34,12 @@ if (vpx_config("CONFIG_USE_X86INC") eq "yes") { } } +# optimizations which depend on multiple features +$avx2_ssse3 = ''; +if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) { + $avx2_ssse3 = 'avx2'; +} + # functions that are 64 bit only. $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = ''; if ($opts{arch} eq "x86_64") { @@ -365,6 +371,62 @@ if (vpx_config("CONFIG_VP9") eq "yes") { } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9 +# +# Sub Pixel Filters +# +add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve_copy neon msa/, "$sse2_x86inc"; + +add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve_avg neon msa/, "$sse2_x86inc"; + +add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8 sse2 ssse3 neon msa/, "$avx2_ssse3"; + +add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_horiz sse2 ssse3 neon msa/, "$avx2_ssse3"; + +add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_vert sse2 ssse3 neon msa/, "$avx2_ssse3"; + +add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_avg sse2 ssse3 neon msa/; + +add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize
qw/vpx_convolve8_avg_horiz sse2 ssse3 neon msa/; + +add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon msa/; + +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + # + # Sub Pixel Filters + # + add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve_copy/; + + add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve_avg/; + + add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64"; + + add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64"; + + add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64"; + + add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64"; + + add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64"; + + add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64"; +} # CONFIG_VP9_HIGHBITDEPTH + # # Loopfilter # diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h new file mode 100644 index 000000000..2617febf3 --- /dev/null +++ b/vpx_dsp/vpx_filter.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
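/* [Sketch, not part of the patch] Roughly what the add_proto/specialize lines
 * above turn into once the RTCD generator runs: each prototype becomes a
 * function pointer, retargeted once at startup from runtime CPU flags. Names
 * and flag values here are illustrative, not the generated code. */
#include <stddef.h>
#include <stdint.h>

#define HAS_SSE2 0x01 /* illustrative flag values */
#define HAS_SSSE3 0x02
#define HAS_AVX2 0x04

typedef void (*vpx_convolve_fn_t)(const uint8_t *, ptrdiff_t, uint8_t *,
                                  ptrdiff_t, const int16_t *, int,
                                  const int16_t *, int, int, int);

static vpx_convolve_fn_t vpx_convolve8_dispatch;

static void setup_convolve8(int flags, vpx_convolve_fn_t c,
                            vpx_convolve_fn_t sse2, vpx_convolve_fn_t ssse3,
                            vpx_convolve_fn_t avx2) {
  vpx_convolve8_dispatch = c;  /* always-available C default */
  if (sse2 && (flags & HAS_SSE2)) vpx_convolve8_dispatch = sse2;
  if (ssse3 && (flags & HAS_SSSE3)) vpx_convolve8_dispatch = ssse3;
  /* the $avx2_ssse3 guard above: an avx2 entry is registered only when SSSE3
   * is also built, because it reuses SSSE3 kernels for narrow blocks */
  if (avx2 && (flags & HAS_AVX2)) vpx_convolve8_dispatch = avx2;
}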
+ */ + +#ifndef VPX_DSP_VPX_FILTER_H_ +#define VPX_DSP_VPX_FILTER_H_ + +#include "vpx/vpx_integer.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 + +#define SUBPEL_BITS 4 +#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) +#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) +#define SUBPEL_TAPS 8 + +typedef int16_t InterpKernel[SUBPEL_TAPS]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_VPX_FILTER_H_ diff --git a/vp9/common/x86/convolve.h b/vpx_dsp/x86/convolve.h similarity index 85% rename from vp9/common/x86/convolve.h rename to vpx_dsp/x86/convolve.h index de2df47e5..c0144981b 100644 --- a/vp9/common/x86/convolve.h +++ b/vpx_dsp/x86/convolve.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_X86_CONVOLVE_H_ -#define VP9_COMMON_X86_CONVOLVE_H_ +#ifndef VPX_DSP_X86_CONVOLVE_H_ +#define VPX_DSP_X86_CONVOLVE_H_ #include <assert.h> @@ -26,7 +26,7 @@ typedef void filter8_1dfunction ( ); #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ @@ -34,7 +34,7 @@ typedef void filter8_1dfunction ( if (step_q4 == 16 && filter[3] != 128) { \ if (filter[0] || filter[1] || filter[2]) { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \ src_stride, \ dst, \ dst_stride, \ @@ -45,7 +45,7 @@ typedef void filter8_1dfunction ( w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \ src_stride, \ dst, \ dst_stride, \ @@ -56,7 +56,7 @@ typedef void filter8_1dfunction ( w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \ src_stride, \ dst, \ dst_stride, \ @@ -68,7 +68,7 @@ typedef void filter8_1dfunction ( } \ } else { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + vpx_filter_block1d16_##dir##2_##avg##opt(src, \ src_stride, \ dst, \ dst_stride, \ @@ -79,7 +79,7 @@ typedef void filter8_1dfunction ( w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + vpx_filter_block1d8_##dir##2_##avg##opt(src, \ src_stride, \ dst, \ dst_stride, \ @@ -90,7 +90,7 @@ typedef void filter8_1dfunction ( w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + vpx_filter_block1d4_##dir##2_##avg##opt(src, \ src_stride, \ dst, \ dst_stride, \ @@ -103,14 +103,14 @@ typedef void filter8_1dfunction ( } \ } \ if (w) { \ - vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + vpx_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, \ w, h); \ } \ } #define FUN_CONV_2D(avg, opt) \ -void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ +void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ @@ -121,23 +121,23 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ filter_x, x_step_q4, filter_y, y_step_q4, \ w, h + 7); \ - vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ filter_x, x_step_q4, filter_y, \ y_step_q4, w, h); \ } else { \ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ filter_x, x_step_q4, filter_y, y_step_q4, \ w, h + 1); \ - vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ filter_x, x_step_q4, filter_y, \ y_step_q4, w, h); \ } \ } else { \ - vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + vpx_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ } \ } @@ -155,7 +155,7 @@ typedef void highbd_filter8_1dfunction ( ); #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ + void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ ptrdiff_t src_stride, \ uint8_t *dst8, \ ptrdiff_t dst_stride, \ @@ -169,7 +169,7 @@ typedef void highbd_filter8_1dfunction ( uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ if (filter[0] || filter[1] || filter[2]) { \ while (w >= 16) { \ - vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ src_stride, \ dst, \ dst_stride, \ @@ -181,7 +181,7 @@ typedef void highbd_filter8_1dfunction ( w -= 16; \ } \ while (w >= 8) { \ - vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ src_stride, \ dst, \ dst_stride, \ @@ -193,7 +193,7 @@ typedef void highbd_filter8_1dfunction ( w -= 8; \ } \ while (w >= 4) { \ - vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ src_stride, \ dst, \ dst_stride, \ @@ -206,7 +206,7 @@ typedef void highbd_filter8_1dfunction ( } \ } else { \ while (w >= 16) { \ - vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ src_stride, \ dst, \ dst_stride, \ @@ -218,7 +218,7 @@ typedef void highbd_filter8_1dfunction ( w -= 16; \ } \ while (w >= 8) { \ - vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ src_stride, \ dst, \ dst_stride, \ @@ -230,7 +230,7 @@ typedef void highbd_filter8_1dfunction ( w -= 8; \ } \ while (w >= 4) { \ - vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ src_stride, \ dst, \ dst_stride, \ @@ -244,14 +244,14 @@ typedef void highbd_filter8_1dfunction ( } \ } \ if (w) { \ - vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, \ w, h, bd); \ } \ } #define HIGH_FUN_CONV_2D(avg, opt) \ -void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ +void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ uint8_t *dst, 
ptrdiff_t dst_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ @@ -262,35 +262,35 @@ void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ CONVERT_TO_BYTEPTR(fdata2), 64, \ filter_x, x_step_q4, \ filter_y, y_step_q4, \ w, h + 7, bd); \ - vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ + vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ 64, dst, dst_stride, \ filter_x, x_step_q4, \ filter_y, y_step_q4, \ w, h, bd); \ } else { \ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vp9_highbd_convolve8_horiz_##opt(src, src_stride, \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, \ CONVERT_TO_BYTEPTR(fdata2), 64, \ filter_x, x_step_q4, \ filter_y, y_step_q4, \ w, h + 1, bd); \ - vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ + vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ dst, dst_stride, \ filter_x, x_step_q4, \ filter_y, y_step_q4, \ w, h, bd); \ } \ } else { \ - vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, w, \ h, bd); \ } \ } #endif // CONFIG_VP9_HIGHBITDEPTH -#endif // VP9_COMMON_X86_CONVOLVE_H_ +#endif // VPX_DSP_X86_CONVOLVE_H_ diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c similarity index 60% rename from vp9/common/x86/vp9_asm_stubs.c rename to vpx_dsp/x86/vpx_asm_stubs.c index fd55fb8c6..422b0fc42 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vpx_dsp/x86/vpx_asm_stubs.c @@ -8,53 +8,53 @@ * be found in the AUTHORS file in the root of the source tree. 
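/* [Sketch, not part of the patch] The two scratch sizes in FUN_CONV_2D /
 * HIGH_FUN_CONV_2D above: when either kernel has energy outside taps [3..4],
 * the vertical pass runs 8-tap and needs h + 7 intermediate rows (hence
 * fdata2[64 * 71]); when both are effectively 2-tap, h + 1 rows suffice
 * (fdata2[64 * 65]). */
static int fdata2_rows(int h, int eight_tap_vert) {
  return eight_tap_vert ? h + 7   /* 3 rows above + 4 below each output row */
                        : h + 1;  /* 2-tap: current row plus one below */
}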
*/ -#include "./vp9_rtcd.h" #include "./vpx_config.h" -#include "vp9/common/x86/convolve.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" #if HAVE_SSE2 -filter8_1dfunction vp9_filter_block1d16_v8_sse2; -filter8_1dfunction vp9_filter_block1d16_h8_sse2; -filter8_1dfunction vp9_filter_block1d8_v8_sse2; -filter8_1dfunction vp9_filter_block1d8_h8_sse2; -filter8_1dfunction vp9_filter_block1d4_v8_sse2; -filter8_1dfunction vp9_filter_block1d4_h8_sse2; -filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_v8_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_sse2; +filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; -filter8_1dfunction vp9_filter_block1d16_v2_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_sse2; -filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; -// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void 
vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, @@ -64,12 +64,12 @@ FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); -// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, @@ -78,33 +78,33 @@ FUN_CONV_2D(, sse2); FUN_CONV_2D(avg_ , sse2); #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; 
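/* [Sketch, not part of the patch] The pointer convention the high-bitdepth
 * wrappers above rely on, as defined in vpx_ports/mem.h: 16-bit sample
 * buffers travel through the common 8-bit prototypes as uint8_t * with the
 * address halved, and callees double it back. */
#include <stdint.h>

#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

/* So CONVERT_TO_BYTEPTR(fdata2) + 192 in HIGH_FUN_CONV_2D becomes, once the
 * callee's CONVERT_TO_SHORTPTR doubles the address again, an offset of
 * 192 uint16_t elements == 3 rows * 64 — the same 3-row step-in as the
 * 8-bit path. */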
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; -// void vp9_highbd_convolve8_horiz_sse2(const uint8_t *src, +// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, // ptrdiff_t src_stride, // uint8_t *dst, // ptrdiff_t dst_stride, @@ -113,7 +113,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -// void vp9_highbd_convolve8_vert_sse2(const uint8_t *src, +// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, // ptrdiff_t src_stride, // uint8_t *dst, // ptrdiff_t dst_stride, @@ -122,7 +122,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -// void vp9_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, // ptrdiff_t src_stride, // uint8_t *dst, // ptrdiff_t dst_stride, @@ -131,7 +131,7 @@ highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -// void vp9_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, // ptrdiff_t src_stride, // uint8_t *dst, // ptrdiff_t dst_stride, @@ -146,12 +146,12 @@ HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); -// void vp9_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h, int bd); -// void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, diff --git a/vp9/common/x86/vp9_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm similarity index 99% rename from vp9/common/x86/vp9_copy_sse2.asm rename to vpx_dsp/x86/vpx_convolve_copy_sse2.asm index b26383708..6cd620a59 100644 --- a/vp9/common/x86/vp9_copy_sse2.asm +++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -8,6 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. 
; +%define program_name vpx + %include "third_party/x86inc/x86inc.asm" SECTION .text diff --git a/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm similarity index 94% rename from vp9/common/x86/vp9_high_subpixel_8t_sse2.asm rename to vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index 29ec151ed..bfc816f23 100644 --- a/vp9/common/x86/vp9_high_subpixel_8t_sse2.asm +++ b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -197,7 +197,7 @@ movdqu [rdi + %2], xmm0 %endm -;void vp9_filter_block1d4_v8_sse2 +;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -206,8 +206,8 @@ ; unsigned int output_height, ; short *filter ;) -global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE -sym(vp9_highbd_filter_block1d4_v8_sse2): +global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE +sym(vpx_highbd_filter_block1d4_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -267,7 +267,7 @@ sym(vp9_highbd_filter_block1d4_v8_sse2): pop rbp ret -;void vp9_filter_block1d8_v8_sse2 +;void vpx_filter_block1d8_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -276,8 +276,8 @@ sym(vp9_highbd_filter_block1d4_v8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE -sym(vp9_highbd_filter_block1d8_v8_sse2): +global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE +sym(vpx_highbd_filter_block1d8_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -326,7 +326,7 @@ sym(vp9_highbd_filter_block1d8_v8_sse2): pop rbp ret -;void vp9_filter_block1d16_v8_sse2 +;void vpx_filter_block1d16_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -335,8 +335,8 @@ sym(vp9_highbd_filter_block1d8_v8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE -sym(vp9_highbd_filter_block1d16_v8_sse2): +global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE +sym(vpx_highbd_filter_block1d16_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -389,8 +389,8 @@ sym(vp9_highbd_filter_block1d16_v8_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d4_v8_avg_sse2): +global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d4_v8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -450,8 +450,8 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d8_v8_avg_sse2): +global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d8_v8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -499,8 +499,8 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d16_v8_avg_sse2): +global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d16_v8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -552,7 +552,7 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2): pop rbp ret -;void vp9_filter_block1d4_h8_sse2 +;void vpx_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -561,8 +561,8 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE -sym(vp9_highbd_filter_block1d4_h8_sse2): +global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE 
+sym(vpx_highbd_filter_block1d4_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -627,7 +627,7 @@ sym(vp9_highbd_filter_block1d4_h8_sse2): pop rbp ret -;void vp9_filter_block1d8_h8_sse2 +;void vpx_filter_block1d8_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -636,8 +636,8 @@ sym(vp9_highbd_filter_block1d4_h8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE -sym(vp9_highbd_filter_block1d8_h8_sse2): +global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE +sym(vpx_highbd_filter_block1d8_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -693,7 +693,7 @@ sym(vp9_highbd_filter_block1d8_h8_sse2): pop rbp ret -;void vp9_filter_block1d16_h8_sse2 +;void vpx_filter_block1d16_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -702,8 +702,8 @@ sym(vp9_highbd_filter_block1d8_h8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE -sym(vp9_highbd_filter_block1d16_h8_sse2): +global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE +sym(vpx_highbd_filter_block1d16_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -770,8 +770,8 @@ sym(vp9_highbd_filter_block1d16_h8_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d4_h8_avg_sse2): +global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d4_h8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -836,8 +836,8 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d8_h8_avg_sse2): +global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d8_h8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -893,8 +893,8 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d16_h8_avg_sse2): +global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d16_h8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 diff --git a/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm b/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm similarity index 89% rename from vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm rename to vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 93784121c..72f2ff71d 100644 --- a/vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm +++ b/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -171,8 +171,8 @@ %endm %endif -global sym(vp9_highbd_filter_block1d4_v2_sse2) PRIVATE -sym(vp9_highbd_filter_block1d4_v2_sse2): +global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE +sym(vpx_highbd_filter_block1d4_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -196,8 +196,8 @@ sym(vp9_highbd_filter_block1d4_v2_sse2): ret %if ARCH_X86_64 -global sym(vp9_highbd_filter_block1d8_v2_sse2) PRIVATE -sym(vp9_highbd_filter_block1d8_v2_sse2): +global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE +sym(vpx_highbd_filter_block1d8_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -222,8 +222,8 @@ sym(vp9_highbd_filter_block1d8_v2_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d16_v2_sse2) PRIVATE -sym(vp9_highbd_filter_block1d16_v2_sse2): +global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE +sym(vpx_highbd_filter_block1d16_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -251,8 +251,8 @@ 
sym(vp9_highbd_filter_block1d16_v2_sse2): ret %endif -global sym(vp9_highbd_filter_block1d4_v2_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d4_v2_avg_sse2): +global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d4_v2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -276,8 +276,8 @@ sym(vp9_highbd_filter_block1d4_v2_avg_sse2): ret %if ARCH_X86_64 -global sym(vp9_highbd_filter_block1d8_v2_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d8_v2_avg_sse2): +global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d8_v2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -302,8 +302,8 @@ sym(vp9_highbd_filter_block1d8_v2_avg_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d16_v2_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d16_v2_avg_sse2): +global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d16_v2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -331,8 +331,8 @@ sym(vp9_highbd_filter_block1d16_v2_avg_sse2): ret %endif -global sym(vp9_highbd_filter_block1d4_h2_sse2) PRIVATE -sym(vp9_highbd_filter_block1d4_h2_sse2): +global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE +sym(vpx_highbd_filter_block1d4_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -357,8 +357,8 @@ sym(vp9_highbd_filter_block1d4_h2_sse2): ret %if ARCH_X86_64 -global sym(vp9_highbd_filter_block1d8_h2_sse2) PRIVATE -sym(vp9_highbd_filter_block1d8_h2_sse2): +global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE +sym(vpx_highbd_filter_block1d8_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -383,8 +383,8 @@ sym(vp9_highbd_filter_block1d8_h2_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d16_h2_sse2) PRIVATE -sym(vp9_highbd_filter_block1d16_h2_sse2): +global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE +sym(vpx_highbd_filter_block1d16_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -412,8 +412,8 @@ sym(vp9_highbd_filter_block1d16_h2_sse2): ret %endif -global sym(vp9_highbd_filter_block1d4_h2_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d4_h2_avg_sse2): +global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d4_h2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -438,8 +438,8 @@ sym(vp9_highbd_filter_block1d4_h2_avg_sse2): ret %if ARCH_X86_64 -global sym(vp9_highbd_filter_block1d8_h2_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d8_h2_avg_sse2): +global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d8_h2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 @@ -464,8 +464,8 @@ sym(vp9_highbd_filter_block1d8_h2_avg_sse2): pop rbp ret -global sym(vp9_highbd_filter_block1d16_h2_avg_sse2) PRIVATE -sym(vp9_highbd_filter_block1d16_h2_avg_sse2): +global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vpx_highbd_filter_block1d16_h2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c similarity index 92% rename from vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c rename to vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index cee8d1e76..29ede19f7 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -11,11 +11,11 @@ // Due to a header conflict between math.h and intrinsics includes with ceil() // in certain configurations under vs9 this include needs to precede // immintrin.h. 
-#include "./vp9_rtcd.h" #include -#include "vp9/common/x86/convolve.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" #include "vpx_ports/mem.h" // filters for 16_h8 and 16_v8 @@ -60,7 +60,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ -static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, +static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, @@ -304,7 +304,7 @@ static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr, } } -static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, +static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, @@ -551,41 +551,41 @@ static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr, } #if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 +#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 +#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 #else // ARCH_X86 -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_ssse3; +#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 +#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 +#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d16_v2_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 -#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 -#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 -#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 -#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 -#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 -#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 -// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +filter8_1dfunction vpx_filter_block1d16_v2_ssse3; +filter8_1dfunction 
vpx_filter_block1d16_h2_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_ssse3; +#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 +#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 +#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 +#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 +#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 +#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 +#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, @@ -593,7 +593,7 @@ filter8_1dfunction vp9_filter_block1d4_h2_ssse3; FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); -// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c similarity index 89% rename from vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c rename to vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 5fd2857e1..01771dec9 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -11,11 +11,11 @@ // Due to a header conflict between math.h and intrinsics includes with ceil() // in certain configurations under vs9 this include needs to precede // tmmintrin.h. -#include "./vp9_rtcd.h" #include <tmmintrin.h> -#include "vp9/common/x86/convolve.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" #include "vpx_ports/mem.h" #include "vpx_ports/emmintrin_compat.h" @@ -46,11 +46,11 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { }; // These are reused by the avx2 intrinsics.
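
The hunks above show the pattern this rename preserves: every 1-D kernel, whether assembly or intrinsics, is declared through the filter8_1dfunction typedef, and block sizes without a dedicated AVX2 implementation get their _avx2 names #defined onto the SSSE3 versions. A minimal compilable sketch of that aliasing technique follows, using hypothetical my_* names rather than the real libvpx symbols:

/* Sketch of the kernel-aliasing pattern; my_* names are hypothetical. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Same shape as the filter8_1dfunction declarations in the patch. */
typedef void my_filter8_1dfunction(const uint8_t *src_ptr,
                                   ptrdiff_t src_pitch, uint8_t *output_ptr,
                                   ptrdiff_t out_pitch,
                                   uint32_t output_height,
                                   const int16_t *filter);

/* Kernel that exists in every build (stub body for the sketch). */
static void my_filter_block1d8_h8_ssse3(const uint8_t *src_ptr,
                                        ptrdiff_t src_pitch,
                                        uint8_t *output_ptr,
                                        ptrdiff_t out_pitch,
                                        uint32_t output_height,
                                        const int16_t *filter) {
  (void)src_ptr; (void)src_pitch; (void)output_ptr;
  (void)out_pitch; (void)output_height; (void)filter;
  puts("8-wide horizontal 8-tap pass (SSSE3)");
}

/* No dedicated 8-wide AVX2 kernel, so the _avx2 spelling is an alias;
 * call sites and function tables only ever use the _avx2 name. */
#define my_filter_block1d8_h8_avx2 my_filter_block1d8_h8_ssse3

int main(void) {
  my_filter8_1dfunction *const fn = my_filter_block1d8_h8_avx2;
  fn(NULL, 0, NULL, 0, 0, NULL); /* resolves to the SSSE3 body */
  return 0;
}

The alias costs nothing at run time: the preprocessor resolves it, so the AVX2 build links directly against the SSSE3 object code for those sizes.
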
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, +void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, @@ -121,7 +121,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, } } -void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, +void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, @@ -201,7 +201,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, } } -static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, +static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, @@ -318,7 +318,7 @@ static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, } } -void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, +void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, @@ -406,7 +406,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, } } -static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, +static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, @@ -522,61 +522,61 @@ static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, } #if ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; -#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 -#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 -#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 -#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 -#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 +#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 +#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 +#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 +#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 #else // ARCH_X86 -filter8_1dfunction vp9_filter_block1d16_v8_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction 
vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +filter8_1dfunction vpx_filter_block1d16_v8_ssse3; +filter8_1dfunction vpx_filter_block1d16_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_ssse3; #endif // ARCH_X86_64 -filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_v2_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_ssse3; +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; -// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, @@ -587,12 +587,12 @@ FUN_CONV_1D(avg_horiz, x_step_q4, 
filter_x, h, src, avg_, ssse3); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, ssse3); -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, diff --git a/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm similarity index 94% rename from vp9/common/x86/vp9_subpixel_8t_sse2.asm rename to vpx_dsp/x86/vpx_subpixel_8t_sse2.asm index 9dc8d0abb..08f3d6a6c 100644 --- a/vp9/common/x86/vp9_subpixel_8t_sse2.asm +++ b/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm @@ -176,7 +176,7 @@ movq [rdi + %2], xmm0 %endm -;void vp9_filter_block1d4_v8_sse2 +;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -185,8 +185,8 @@ ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d4_v8_sse2) PRIVATE -sym(vp9_filter_block1d4_v8_sse2): +global sym(vpx_filter_block1d4_v8_sse2) PRIVATE +sym(vpx_filter_block1d4_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -243,7 +243,7 @@ sym(vp9_filter_block1d4_v8_sse2): pop rbp ret -;void vp9_filter_block1d8_v8_sse2 +;void vpx_filter_block1d8_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -252,8 +252,8 @@ sym(vp9_filter_block1d4_v8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d8_v8_sse2) PRIVATE -sym(vp9_filter_block1d8_v8_sse2): +global sym(vpx_filter_block1d8_v8_sse2) PRIVATE +sym(vpx_filter_block1d8_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -302,7 +302,7 @@ sym(vp9_filter_block1d8_v8_sse2): pop rbp ret -;void vp9_filter_block1d16_v8_sse2 +;void vpx_filter_block1d16_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -311,8 +311,8 @@ sym(vp9_filter_block1d8_v8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d16_v8_sse2) PRIVATE -sym(vp9_filter_block1d16_v8_sse2): +global sym(vpx_filter_block1d16_v8_sse2) PRIVATE +sym(vpx_filter_block1d16_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -365,8 +365,8 @@ sym(vp9_filter_block1d16_v8_sse2): pop rbp ret -global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE -sym(vp9_filter_block1d4_v8_avg_sse2): +global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE +sym(vpx_filter_block1d4_v8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -423,8 +423,8 @@ sym(vp9_filter_block1d4_v8_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE -sym(vp9_filter_block1d8_v8_avg_sse2): +global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE +sym(vpx_filter_block1d8_v8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -472,8 +472,8 @@ sym(vp9_filter_block1d8_v8_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE -sym(vp9_filter_block1d16_v8_avg_sse2): +global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE +sym(vpx_filter_block1d16_v8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -525,7 +525,7 @@ sym(vp9_filter_block1d16_v8_avg_sse2): pop rbp ret -;void vp9_filter_block1d4_h8_sse2 +;void vpx_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, ; 
unsigned int src_pixels_per_line, @@ -534,8 +534,8 @@ sym(vp9_filter_block1d16_v8_avg_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d4_h8_sse2) PRIVATE -sym(vp9_filter_block1d4_h8_sse2): +global sym(vpx_filter_block1d4_h8_sse2) PRIVATE +sym(vpx_filter_block1d4_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -599,7 +599,7 @@ sym(vp9_filter_block1d4_h8_sse2): pop rbp ret -;void vp9_filter_block1d8_h8_sse2 +;void vpx_filter_block1d8_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -608,8 +608,8 @@ sym(vp9_filter_block1d4_h8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d8_h8_sse2) PRIVATE -sym(vp9_filter_block1d8_h8_sse2): +global sym(vpx_filter_block1d8_h8_sse2) PRIVATE +sym(vpx_filter_block1d8_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -674,7 +674,7 @@ sym(vp9_filter_block1d8_h8_sse2): pop rbp ret -;void vp9_filter_block1d16_h8_sse2 +;void vpx_filter_block1d16_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -683,8 +683,8 @@ sym(vp9_filter_block1d8_h8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d16_h8_sse2) PRIVATE -sym(vp9_filter_block1d16_h8_sse2): +global sym(vpx_filter_block1d16_h8_sse2) PRIVATE +sym(vpx_filter_block1d16_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -769,8 +769,8 @@ sym(vp9_filter_block1d16_h8_sse2): pop rbp ret -global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE -sym(vp9_filter_block1d4_h8_avg_sse2): +global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE +sym(vpx_filter_block1d4_h8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -834,8 +834,8 @@ sym(vp9_filter_block1d4_h8_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE -sym(vp9_filter_block1d8_h8_avg_sse2): +global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE +sym(vpx_filter_block1d8_h8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -900,8 +900,8 @@ sym(vp9_filter_block1d8_h8_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE -sym(vp9_filter_block1d16_h8_avg_sse2): +global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE +sym(vpx_filter_block1d16_h8_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm similarity index 95% rename from vp9/common/x86/vp9_subpixel_8t_ssse3.asm rename to vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm index 4a5bf1b60..68acc03ce 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -310,7 +310,7 @@ jnz .loop %endm -;void vp9_filter_block1d8_v8_ssse3 +;void vpx_filter_block1d8_v8_ssse3 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -319,8 +319,8 @@ ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE -sym(vp9_filter_block1d4_v8_ssse3): +global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE +sym(vpx_filter_block1d4_v8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -351,7 +351,7 @@ sym(vp9_filter_block1d4_v8_ssse3): pop rbp ret -;void vp9_filter_block1d8_v8_ssse3 +;void vpx_filter_block1d8_v8_ssse3 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -360,8 +360,8 @@ sym(vp9_filter_block1d4_v8_ssse3): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE -sym(vp9_filter_block1d8_v8_ssse3): +global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE +sym(vpx_filter_block1d8_v8_ssse3): 
push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -392,7 +392,7 @@ sym(vp9_filter_block1d8_v8_ssse3): pop rbp ret -;void vp9_filter_block1d16_v8_ssse3 +;void vpx_filter_block1d16_v8_ssse3 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -401,8 +401,8 @@ sym(vp9_filter_block1d8_v8_ssse3): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE -sym(vp9_filter_block1d16_v8_ssse3): +global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE +sym(vpx_filter_block1d16_v8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -436,8 +436,8 @@ sym(vp9_filter_block1d16_v8_ssse3): ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE -sym(vp9_filter_block1d4_v8_avg_ssse3): +global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE +sym(vpx_filter_block1d4_v8_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -468,8 +468,8 @@ sym(vp9_filter_block1d4_v8_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE -sym(vp9_filter_block1d8_v8_avg_ssse3): +global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE +sym(vpx_filter_block1d8_v8_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -500,8 +500,8 @@ sym(vp9_filter_block1d8_v8_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE -sym(vp9_filter_block1d16_v8_avg_ssse3): +global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE +sym(vpx_filter_block1d16_v8_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -838,7 +838,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): jnz .loop %endm -;void vp9_filter_block1d4_h8_ssse3 +;void vpx_filter_block1d4_h8_ssse3 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -847,8 +847,8 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE -sym(vp9_filter_block1d4_h8_ssse3): +global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE +sym(vpx_filter_block1d4_h8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -877,7 +877,7 @@ sym(vp9_filter_block1d4_h8_ssse3): pop rbp ret -;void vp9_filter_block1d8_h8_ssse3 +;void vpx_filter_block1d8_h8_ssse3 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -886,8 +886,8 @@ sym(vp9_filter_block1d4_h8_ssse3): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE -sym(vp9_filter_block1d8_h8_ssse3): +global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE +sym(vpx_filter_block1d8_h8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -919,7 +919,7 @@ sym(vp9_filter_block1d8_h8_ssse3): pop rbp ret -;void vp9_filter_block1d16_h8_ssse3 +;void vpx_filter_block1d16_h8_ssse3 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -928,8 +928,8 @@ sym(vp9_filter_block1d8_h8_ssse3): ; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE -sym(vp9_filter_block1d16_h8_ssse3): +global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE +sym(vpx_filter_block1d16_h8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -961,8 +961,8 @@ sym(vp9_filter_block1d16_h8_ssse3): pop rbp ret -global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE -sym(vp9_filter_block1d4_h8_avg_ssse3): +global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE +sym(vpx_filter_block1d4_h8_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -991,8 +991,8 @@ sym(vp9_filter_block1d4_h8_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE 
-sym(vp9_filter_block1d8_h8_avg_ssse3): +global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE +sym(vpx_filter_block1d8_h8_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -1024,8 +1024,8 @@ sym(vp9_filter_block1d8_h8_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE -sym(vp9_filter_block1d16_h8_avg_ssse3): +global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE +sym(vpx_filter_block1d16_h8_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 diff --git a/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm similarity index 89% rename from vp9/common/x86/vp9_subpixel_bilinear_sse2.asm rename to vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm index d94ccf2e9..a378dd040 100644 --- a/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm +++ b/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm @@ -131,8 +131,8 @@ dec rcx %endm -global sym(vp9_filter_block1d4_v2_sse2) PRIVATE -sym(vp9_filter_block1d4_v2_sse2): +global sym(vpx_filter_block1d4_v2_sse2) PRIVATE +sym(vpx_filter_block1d4_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -155,8 +155,8 @@ sym(vp9_filter_block1d4_v2_sse2): pop rbp ret -global sym(vp9_filter_block1d8_v2_sse2) PRIVATE -sym(vp9_filter_block1d8_v2_sse2): +global sym(vpx_filter_block1d8_v2_sse2) PRIVATE +sym(vpx_filter_block1d8_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -181,8 +181,8 @@ sym(vp9_filter_block1d8_v2_sse2): pop rbp ret -global sym(vp9_filter_block1d16_v2_sse2) PRIVATE -sym(vp9_filter_block1d16_v2_sse2): +global sym(vpx_filter_block1d16_v2_sse2) PRIVATE +sym(vpx_filter_block1d16_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -209,8 +209,8 @@ sym(vp9_filter_block1d16_v2_sse2): pop rbp ret -global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE -sym(vp9_filter_block1d4_v2_avg_sse2): +global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vpx_filter_block1d4_v2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -233,8 +233,8 @@ sym(vp9_filter_block1d4_v2_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE -sym(vp9_filter_block1d8_v2_avg_sse2): +global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE +sym(vpx_filter_block1d8_v2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -259,8 +259,8 @@ sym(vp9_filter_block1d8_v2_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE -sym(vp9_filter_block1d16_v2_avg_sse2): +global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vpx_filter_block1d16_v2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -287,8 +287,8 @@ sym(vp9_filter_block1d16_v2_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d4_h2_sse2) PRIVATE -sym(vp9_filter_block1d4_h2_sse2): +global sym(vpx_filter_block1d4_h2_sse2) PRIVATE +sym(vpx_filter_block1d4_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -312,8 +312,8 @@ sym(vp9_filter_block1d4_h2_sse2): pop rbp ret -global sym(vp9_filter_block1d8_h2_sse2) PRIVATE -sym(vp9_filter_block1d8_h2_sse2): +global sym(vpx_filter_block1d8_h2_sse2) PRIVATE +sym(vpx_filter_block1d8_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -339,8 +339,8 @@ sym(vp9_filter_block1d8_h2_sse2): pop rbp ret -global sym(vp9_filter_block1d16_h2_sse2) PRIVATE -sym(vp9_filter_block1d16_h2_sse2): +global sym(vpx_filter_block1d16_h2_sse2) PRIVATE +sym(vpx_filter_block1d16_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -367,8 +367,8 @@ sym(vp9_filter_block1d16_h2_sse2): pop rbp ret -global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE 
-sym(vp9_filter_block1d4_h2_avg_sse2): +global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vpx_filter_block1d4_h2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -392,8 +392,8 @@ sym(vp9_filter_block1d4_h2_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE -sym(vp9_filter_block1d8_h2_avg_sse2): +global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vpx_filter_block1d8_h2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -419,8 +419,8 @@ sym(vp9_filter_block1d8_h2_avg_sse2): pop rbp ret -global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE -sym(vp9_filter_block1d16_h2_avg_sse2): +global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vpx_filter_block1d16_h2_avg_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 diff --git a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm similarity index 88% rename from vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm rename to vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm index b5e18fe6d..3c8cfd225 100644 --- a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm +++ b/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm @@ -109,8 +109,8 @@ dec rcx %endm -global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE -sym(vp9_filter_block1d4_v2_ssse3): +global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE +sym(vpx_filter_block1d4_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -133,8 +133,8 @@ sym(vp9_filter_block1d4_v2_ssse3): pop rbp ret -global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE -sym(vp9_filter_block1d8_v2_ssse3): +global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE +sym(vpx_filter_block1d8_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -159,8 +159,8 @@ sym(vp9_filter_block1d8_v2_ssse3): pop rbp ret -global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE -sym(vp9_filter_block1d16_v2_ssse3): +global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE +sym(vpx_filter_block1d16_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -186,8 +186,8 @@ sym(vp9_filter_block1d16_v2_ssse3): pop rbp ret -global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE -sym(vp9_filter_block1d4_v2_avg_ssse3): +global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d4_v2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -210,8 +210,8 @@ sym(vp9_filter_block1d4_v2_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE -sym(vp9_filter_block1d8_v2_avg_ssse3): +global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d8_v2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -236,8 +236,8 @@ sym(vp9_filter_block1d8_v2_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE -sym(vp9_filter_block1d16_v2_avg_ssse3): +global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d16_v2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -263,8 +263,8 @@ sym(vp9_filter_block1d16_v2_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE -sym(vp9_filter_block1d4_h2_ssse3): +global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE +sym(vpx_filter_block1d4_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -288,8 +288,8 @@ sym(vp9_filter_block1d4_h2_ssse3): pop rbp ret -global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE -sym(vp9_filter_block1d8_h2_ssse3): +global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE +sym(vpx_filter_block1d8_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -315,8 +315,8 @@ sym(vp9_filter_block1d8_h2_ssse3): pop rbp ret -global 
sym(vp9_filter_block1d16_h2_ssse3) PRIVATE -sym(vp9_filter_block1d16_h2_ssse3): +global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE +sym(vpx_filter_block1d16_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -342,8 +342,8 @@ sym(vp9_filter_block1d16_h2_ssse3): pop rbp ret -global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE -sym(vp9_filter_block1d4_h2_avg_ssse3): +global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d4_h2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -367,8 +367,8 @@ sym(vp9_filter_block1d4_h2_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE -sym(vp9_filter_block1d8_h2_avg_ssse3): +global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d8_h2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -394,8 +394,8 @@ sym(vp9_filter_block1d8_h2_avg_ssse3): pop rbp ret -global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE -sym(vp9_filter_block1d16_h2_avg_ssse3): +global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d16_h2_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6
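
Throughout this patch, the per-width kernels are stitched into the public convolve entry points by the FUN_CONV_1D invocations (for example FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)), whose generated wrappers match the commented vpx_convolve8_* prototypes. Below is a rough, self-contained sketch of what such a generated horizontal wrapper does, assuming integer-pel stepping, with hypothetical my_* names, and dropping the unused vertical-filter arguments; the real macro also handles fractional x_step_q4 and the avg_ variants, and the stub copies pixels instead of applying the 8-tap filter:

/* Sketch only: my_* names are hypothetical, not libvpx symbols. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for one fixed-width 1-D kernel; a real kernel applies the
 * 8-tap filter -- this stub just copies so the sketch stays short. */
static void my_block1d_h8(const uint8_t *src, ptrdiff_t src_pitch,
                          uint8_t *dst, ptrdiff_t dst_pitch,
                          uint32_t height, const int16_t *filter,
                          int width) {
  uint32_t r;
  (void)filter;
  for (r = 0; r < height; ++r)
    memcpy(dst + r * dst_pitch, src + r * src_pitch, (size_t)width);
}

/* Shape of the wrapper a FUN_CONV_1D-style macro could emit for the
 * horizontal direction: consume the width in 16/8/4-pixel strips
 * (block widths in this codec are multiples of 4). */
static void my_convolve8_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               int w, int h) {
  assert(x_step_q4 == 16); /* sketch covers integer-pel stepping only */
  while (w > 0) {
    const int step = (w >= 16) ? 16 : (w >= 8) ? 8 : 4;
    my_block1d_h8(src, src_stride, dst, dst_stride, (uint32_t)h,
                  filter_x, step);
    src += step;
    dst += step;
    w -= step;
  }
}

int main(void) {
  uint8_t in[4 * 24] = { 1 }, out[4 * 24] = { 0 };
  my_convolve8_horiz(in, 24, out, 24, NULL, 16, 24, 4);
  return out[0] == in[0] ? 0 : 1; /* trivially exercise the dispatch */
}

Splitting the width into 16/8/4-pixel strips is what lets one wrapper serve every block size while each strip still runs the widest SIMD kernel available.
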