Commit 145a5f09dc for aom
commit 145a5f09dcb5e26bc9f40a00f7ef3559bd9c5159
Author: Diksha Singh <diksha.singh@ittiam.com>
Date: Wed Apr 15 13:23:15 2026 +0530
Add SSE4.1 optimization for interp_cubic()
This patch refactors av1_model_rd_curvfit() by consolidating two
interp_cubic() calls for computing rate and distortion.
Also, an SSE4.1 implementation of the combined function,
av1_interp_cubic_rate_dist(), is added along with its unit test.
The SSE4.1 implementation is ~1.23x faster than the C implementation.
Change-Id: I941cda972a9b5551dee009339e143933538cb88e
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 5e730f9be5..4b92eb4add 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -363,6 +363,7 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/model_rd_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index d5d63703e7..bd5934044d 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -460,6 +460,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_get_horver_correlation_full/, "const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;
+ add_proto qw/void av1_interp_cubic_rate_dist/, "const double *p1, const double *p2, double x, double * const rate_f, double * const distbysse_f";
+ specialize qw/av1_interp_cubic_rate_dist sse4_1/;
+
add_proto qw/void av1_nn_predict/, "const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
add_proto qw/void av1_nn_fast_softmax_16/, "const float *input_nodes, float *output";
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index e9f6e54f03..b5f6b3a725 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -949,6 +949,13 @@ static double interp_cubic(const double *p, double x) {
x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
}
+// Evaluates interp_cubic() at the same position x for two sets of control
+// points and stores both results: *rate_f receives the value interpolated
+// from p1 and *distbysse_f the value interpolated from p2. interp_cubic()
+// indexes its input as p[0]..p[3], so p1 and p2 must each point to at least
+// four doubles. This is the C version of av1_interp_cubic_rate_dist().
+void av1_interp_cubic_rate_dist_c(const double *p1, const double *p2, double x,
+ double *const rate_f,
+ double *const distbysse_f) {
+ *rate_f = interp_cubic(p1, x);
+ *distbysse_f = interp_cubic(p2, x);
+}
+
/*
static double interp_bicubic(const double *p, int p_stride, double x,
double y) {
@@ -1080,9 +1087,8 @@ void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
assert(xi > 0);
const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
- *rate_f = interp_cubic(prate, xo);
const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
- *distbysse_f = interp_cubic(pdist, xo);
+ av1_interp_cubic_rate_dist(prate, pdist, xo, rate_f, distbysse_f);
}
static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
diff --git a/av1/encoder/x86/model_rd_sse4.c b/av1/encoder/x86/model_rd_sse4.c
new file mode 100644
index 0000000000..48bc580624
--- /dev/null
+++ b/av1/encoder/x86/model_rd_sse4.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+// SSE4.1 version of av1_interp_cubic_rate_dist(): evaluates the cubic
+// interpolation polynomial for two sets of control points in a single pass.
+//   p1, p2      - four control points each (elements [0]..[3] are read).
+//   x           - position at which both polynomials are evaluated.
+//   rate_f      - receives the value interpolated from p1.
+//   distbysse_f - receives the value interpolated from p2.
+void av1_interp_cubic_rate_dist_sse4_1(const double *p1, const double *p2,
+                                       double x, double *const rate_f,
+                                       double *const distbysse_f) {
+  // The low lane holds the p1 curve and the high lane the p2 curve, so every
+  // packed operation below advances both interpolations at once.
+  const __m128d c0 = _mm_set_pd(p2[0], p1[0]);
+  const __m128d c1 = _mm_set_pd(p2[1], p1[1]);
+  const __m128d c2 = _mm_set_pd(p2[2], p1[2]);
+  const __m128d c3 = _mm_set_pd(p2[3], p1[3]);
+  const __m128d vx = _mm_set1_pd(x);
+
+  // Results are meant to be bit-identical to the C code, so each step below
+  // performs the same operation, with the same operand order, as the C code.
+
+  // cubic = x * (3.0 * (p[1] - p[2]) + p[3] - p[0])
+  __m128d cubic = _mm_mul_pd(_mm_set1_pd(3.0), _mm_sub_pd(c1, c2));
+  cubic = _mm_sub_pd(_mm_add_pd(cubic, c3), c0);
+  cubic = _mm_mul_pd(vx, cubic);
+
+  // quad = 2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3]
+  __m128d quad = _mm_sub_pd(_mm_mul_pd(_mm_set1_pd(2.0), c0),
+                            _mm_mul_pd(_mm_set1_pd(5.0), c1));
+  quad = _mm_add_pd(quad, _mm_mul_pd(_mm_set1_pd(4.0), c2));
+  quad = _mm_sub_pd(quad, c3);
+
+  // inner = p[2] - p[0] + x * (quad + cubic)
+  const __m128d scaled = _mm_mul_pd(vx, _mm_add_pd(quad, cubic));
+  const __m128d inner = _mm_add_pd(_mm_sub_pd(c2, c0), scaled);
+
+  // result = p[1] + (0.5 * x) * inner
+  const __m128d half_x = _mm_mul_pd(_mm_set1_pd(0.5), vx);
+  const __m128d result = _mm_add_pd(c1, _mm_mul_pd(half_x, inner));
+
+  // Low lane -> value for p1, high lane -> value for p2.
+  _mm_storel_pd(rate_f, result);
+  _mm_storeh_pd(distbysse_f, result);
+}
diff --git a/test/model_rd_test.cc b/test/model_rd_test.cc
new file mode 100644
index 0000000000..910f20abda
--- /dev/null
+++ b/test/model_rd_test.cc
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+
+#include "gtest/gtest.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+using InterpCubicRateDistFunc = void (*)(const double *p1, const double *p2,
+ double x, double *const rate_f,
+ double *const distbysse_f);
+
+using InterpCubicTestParam = std::tuple<const InterpCubicRateDistFunc>;
+
+// Value-parameterized fixture for av1_interp_cubic_rate_dist() variants. The
+// single test parameter is the function pointer exercised by CheckOutput()
+// and SpeedTest().
+class InterpCubicTest : public ::testing::TestWithParam<InterpCubicTestParam> {
+ public:
+  // Picks up the implementation under test from the test parameter.
+  void SetUp() override { target_func_ = GET_PARAM(0); }
+
+  // Maps one Rand31() draw, divided by 2^31 - 1, linearly onto the span that
+  // starts at min and has width (max - min).
+  double generate_random_double(double min, double max) {
+    const double unit = static_cast<double>(rnd_.Rand31()) / ((1U << 31) - 1);
+    return min + unit * (max - min);
+  }
+
+  void CheckOutput();
+  void SpeedTest();
+
+ protected:
+  InterpCubicRateDistFunc target_func_;  // Implementation under test.
+
+ private:
+  libaom_test::ACMRandom rnd_;  // Source of the random test inputs.
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InterpCubicTest);
+
+// Compares target_func_ against av1_interp_cubic_rate_dist_c() on random
+// inputs. Both outputs must match exactly; no tolerance is applied.
+void InterpCubicTest::CheckOutput() {
+  constexpr int kNumIters = 10000;
+  double p1[4], p2[4];
+  for (int iter = 0; iter < kNumIters; ++iter) {
+    for (int i = 0; i < 4; ++i) {
+      p1[i] = generate_random_double(0.0, 4096.0);
+      p2[i] = generate_random_double(0.0, 16.0);
+    }
+    const double x = generate_random_double(0.0, 1.0);
+
+    double rate_f_ref, rate_f_mod, distbysse_f_ref, distbysse_f_mod;
+    av1_interp_cubic_rate_dist_c(p1, p2, x, &rate_f_ref, &distbysse_f_ref);
+    // Run the implementation under test through the helper from
+    // test/register_state_check.h (already included by this file) so that
+    // register-state checking, where available, covers the SIMD code.
+    API_REGISTER_STATE_CHECK(
+        target_func_(p1, p2, x, &rate_f_mod, &distbysse_f_mod));
+
+    // ASSERT rather than EXPECT: stop at the first mismatch instead of
+    // reporting up to 2 * kNumIters failures, and say which input failed.
+    ASSERT_EQ(rate_f_ref, rate_f_mod)
+        << "Error: rate_f value mismatch (iter=" << iter << ", x=" << x
+        << ", p1={" << p1[0] << ", " << p1[1] << ", " << p1[2] << ", "
+        << p1[3] << "})";
+    ASSERT_EQ(distbysse_f_ref, distbysse_f_mod)
+        << "Error: distbysse_f value mismatch (iter=" << iter << ", x=" << x
+        << ", p2={" << p2[0] << ", " << p2[1] << ", " << p2[2] << ", "
+        << p2[3] << "})";
+  }
+}
+
+// Benchmark: times num_iter back-to-back calls of the C function and of
+// target_func_ on one fixed random input, then prints both elapsed times (as
+// returned by aom_usec_timer_elapsed()) and the ratio C time / SIMD time.
+void InterpCubicTest::SpeedTest() {
+  double p1[4], p2[4];
+  for (int k = 0; k < 4; ++k) {
+    p1[k] = generate_random_double(0.0000, 4096.0000);
+    p2[k] = generate_random_double(0.0000, 16.0000);
+  }
+  const double x = generate_random_double(0.0000, 1.0000);
+
+  const int num_iter = 100000000;
+  double rate_c, dist_c, rate_simd, dist_simd;
+
+  // Reference (C) timing.
+  aom_usec_timer c_timer;
+  aom_usec_timer_start(&c_timer);
+  for (int remaining = num_iter; remaining > 0; --remaining) {
+    av1_interp_cubic_rate_dist_c(p1, p2, x, &rate_c, &dist_c);
+  }
+  aom_usec_timer_mark(&c_timer);
+  const int elapsed_time_c =
+      static_cast<int>(aom_usec_timer_elapsed(&c_timer));
+
+  // Timing of the implementation under test.
+  aom_usec_timer simd_timer;
+  aom_usec_timer_start(&simd_timer);
+  for (int remaining = num_iter; remaining > 0; --remaining) {
+    target_func_(p1, p2, x, &rate_simd, &dist_simd);
+  }
+  aom_usec_timer_mark(&simd_timer);
+  const int elapsed_time_simd =
+      static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+
+  const double scaling =
+      static_cast<double>(elapsed_time_c) / elapsed_time_simd;
+  printf(
+      " c_time=%d \t simd_time=%d \t "
+      "Scaling=%lf \n",
+      elapsed_time_c, elapsed_time_simd, scaling);
+}
+
+// Exact-match check of the implementation under test against the C version.
+TEST_P(InterpCubicTest, CheckOutput) { CheckOutput(); }
+
+// Benchmark. The DISABLED_ prefix keeps it out of default runs; pass
+// --gtest_also_run_disabled_tests to run it.
+TEST_P(InterpCubicTest, DISABLED_Speed) { SpeedTest(); }
+
+// The SSE4.1 implementation is registered only when HAVE_SSE4_1 is set.
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, InterpCubicTest,
+ ::testing::Values(av1_interp_cubic_rate_dist_sse4_1));
+#endif // HAVE_SSE4_1
+
+} // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 6224c54e6e..84ba9182f5 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -232,6 +232,7 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/masked_variance_test.cc"
"${AOM_ROOT}/test/metadata_test.cc"
"${AOM_ROOT}/test/minmax_test.cc"
+ "${AOM_ROOT}/test/model_rd_test.cc"
"${AOM_ROOT}/test/motion_vector_test.cc"
"${AOM_ROOT}/test/mv_cost_test.cc"
"${AOM_ROOT}/test/obmc_sad_test.cc"