Commit b12f167ca2 for aom

commit b12f167ca277634753badd3d05befd4dadd76937
Author: Diksha Singh <diksha.singh@ittiam.com>
Date:   Thu May 14 11:33:43 2026 +0530

    Refactor av1_interp_cubic_rate_dist()

    The function av1_interp_cubic_rate_dist() is refactored to
    store its results directly into a 2-element output array.
    This enables the use of a single 128-bit store in the
    corresponding SIMD function.

    Change-Id: If04b39dcde5a877cefa6d77655e8b6a231c87c8f

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 3b04073aa9..989b00d053 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -460,7 +460,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void av1_get_horver_correlation_full/, "const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;

-  add_proto qw/void av1_interp_cubic_rate_dist/, "const double *p1, const double *p2, double x, double * const rate_f, double * const distbysse_f";
+  add_proto qw/void av1_interp_cubic_rate_dist/, "const double *p1, const double *p2, double x, double rate_dist_f[2]";
   specialize qw/av1_interp_cubic_rate_dist sse2/;

   add_proto qw/void av1_nn_predict/, "const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index 08b81dadfd..0e519bc54e 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -134,12 +134,11 @@ static inline void model_rd_with_curvfit(const AV1_COMP *const cpi,
   const double sse_norm = (double)sse / num_samples;
   const double qstepsqr = (double)qstep * qstep;
   const double xqr = log2(sse_norm / qstepsqr);
-  double rate_f, dist_by_sse_norm_f;
-  av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
-                       &dist_by_sse_norm_f);
+  double rate_dist_f[2];
+  av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, rate_dist_f);

-  const double dist_f = dist_by_sse_norm_f * sse_norm;
-  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  const double dist_f = rate_dist_f[1] * sse_norm;
+  int rate_i = (int)(AOMMAX(0.0, rate_dist_f[0] * num_samples) + 0.5);
   int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);

   // Check if skip is better
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index b5f6b3a725..9a6e89bf01 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -950,10 +950,9 @@ static double interp_cubic(const double *p, double x) {
 }

 void av1_interp_cubic_rate_dist_c(const double *p1, const double *p2, double x,
-                                  double *const rate_f,
-                                  double *const distbysse_f) {
-  *rate_f = interp_cubic(p1, x);
-  *distbysse_f = interp_cubic(p2, x);
+                                  double rate_dist_f[2]) {
+  rate_dist_f[0] = interp_cubic(p1, x);
+  rate_dist_f[1] = interp_cubic(p2, x);
 }

 /*
@@ -1069,7 +1068,7 @@ static const double interp_dgrid_curv[3][65] = {
 };

 void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
-                          double *rate_f, double *distbysse_f) {
+                          double rate_dist_f[2]) {
   const double x_start = -15.5;
   const double x_end = 16.5;
   const double x_step = 0.5;
@@ -1088,7 +1087,7 @@ void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,

   const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
   const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
-  av1_interp_cubic_rate_dist(prate, pdist, xo, rate_f, distbysse_f);
+  av1_interp_cubic_rate_dist(prate, pdist, xo, rate_dist_f);
 }

 static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index b97149c96c..c85cc9a8a5 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -258,8 +258,23 @@ void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit,
 void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
                                   unsigned int qstep, int *rate, int64_t *dist);

+/*!\brief Estimate rate and distortion for a block.
+ *
+ * \param[in]    bsize           Block size
+ * \param[in]    sse_norm        Normalized SSE
+ * \param[in]    xqr             The log2 ratio of normalized SSE to the
+ *                               squared quantization step size, computed
+ *                               as: log2(sse_norm / qstep^2)
+ * \param[out]   rate_dist_f     Two-element array receiving the results:
+ *                               rate_dist_f[0] stores the estimated rate,
+ *                               rate_dist_f[1] stores the estimated
+ *                               distortion divided by normalized SSE
+ *
+ * \remark Nothing is returned. Results are saved in rate_dist_f[0]
+ * and rate_dist_f[1].
+ */
 void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
-                          double *rate_f, double *distbysse_f);
+                          double rate_dist_f[2]);

 int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
                             InterpFilter interp_filter, int dual_filter);
diff --git a/av1/encoder/x86/model_rd_sse2.c b/av1/encoder/x86/model_rd_sse2.c
index 9986666de5..1d6e9460ad 100644
--- a/av1/encoder/x86/model_rd_sse2.c
+++ b/av1/encoder/x86/model_rd_sse2.c
@@ -14,8 +14,7 @@
 #include "config/av1_rtcd.h"

 void av1_interp_cubic_rate_dist_sse2(const double *p1, const double *p2,
-                                     double x, double *const rate_f,
-                                     double *const distbysse_f) {
+                                     double x, double rate_dist_f[2]) {
   const __m128d half = _mm_set1_pd(0.5);
   const __m128d two = _mm_set1_pd(2.0);
   const __m128d three = _mm_set1_pd(3.0);
@@ -57,6 +56,5 @@ void av1_interp_cubic_rate_dist_sse2(const double *p1, const double *p2,
   __m128d reg_res_4 = _mm_mul_pd(_mm_mul_pd(half, reg_x), reg_res_3);
   reg_res_4 = _mm_add_pd(reg_p1, reg_res_4);

-  _mm_storel_pd(rate_f, reg_res_4);
-  _mm_storeh_pd(distbysse_f, reg_res_4);
+  _mm_storeu_pd(rate_dist_f, reg_res_4);
 }
diff --git a/test/model_rd_test.cc b/test/model_rd_test.cc
index 40a88d049a..7c114106a0 100644
--- a/test/model_rd_test.cc
+++ b/test/model_rd_test.cc
@@ -22,8 +22,7 @@
 namespace {

 using InterpCubicRateDistFunc = void (*)(const double *p1, const double *p2,
-                                         double x, double *const rate_f,
-                                         double *const distbysse_f);
+                                         double x, double rate_dist_f[2]);

 class InterpCubicTest
     : public ::testing::TestWithParam<InterpCubicRateDistFunc> {
@@ -58,8 +57,7 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InterpCubicTest);
 #endif  // AOM_ARCH_X86 || AOM_ARCH_X86_64

 void InterpCubicTest::CheckOutput() {
-  double p1[4], p2[4];
-  double rate_f_ref, rate_f_mod, distbysse_f_ref, distbysse_f_mod;
+  double p1[4], p2[4], out_ref[2], out_mod[2];
   constexpr int kNumIters = 10000;
   for (int iter = 0; iter < kNumIters; ++iter) {
     for (int i = 0; i < 4; ++i) {
@@ -69,20 +67,18 @@ void InterpCubicTest::CheckOutput() {
     const double x = GenerateRandomDouble(0.0000, 1.0000);

     FLOATING_POINT_SET_PRECISION
-    av1_interp_cubic_rate_dist_c(p1, p2, x, &rate_f_ref, &distbysse_f_ref);
+    av1_interp_cubic_rate_dist_c(p1, p2, x, out_ref);
     FLOATING_POINT_RESTORE_PRECISION

-    API_REGISTER_STATE_CHECK(
-        target_func_(p1, p2, x, &rate_f_mod, &distbysse_f_mod));
-    EXPECT_EQ(rate_f_ref, rate_f_mod) << "Error: rate_f value mismatch";
-    EXPECT_EQ(distbysse_f_ref, distbysse_f_mod)
-        << "Error: distbysse_f value mismatch";
+    API_REGISTER_STATE_CHECK(target_func_(p1, p2, x, out_mod));
+
+    EXPECT_EQ(out_ref[0], out_mod[0]) << "Error: rate_f value mismatch";
+    EXPECT_EQ(out_ref[1], out_mod[1]) << "Error: distbysse_f value mismatch";
   }
 }

 void InterpCubicTest::SpeedTest() {
-  double p1[4], p2[4];
-  double rate_f_ref, rate_f_mod, distbysse_f_ref, distbysse_f_mod;
+  double p1[4], p2[4], out_ref[2], out_mod[2];

   for (int i = 0; i < 4; ++i) {
     p1[i] = GenerateRandomDouble(0.0000, 4096.0000);
@@ -96,7 +92,7 @@ void InterpCubicTest::SpeedTest() {
   FLOATING_POINT_SET_PRECISION
   aom_usec_timer_start(&ref_timer);
   for (int iter = 0; iter < kNumIters; ++iter) {
-    av1_interp_cubic_rate_dist_c(p1, p2, x, &rate_f_ref, &distbysse_f_ref);
+    av1_interp_cubic_rate_dist_c(p1, p2, x, out_ref);
   }
   aom_usec_timer_mark(&ref_timer);
   FLOATING_POINT_RESTORE_PRECISION
@@ -105,8 +101,7 @@ void InterpCubicTest::SpeedTest() {

   aom_usec_timer_start(&test_timer);
   for (int iter = 0; iter < kNumIters; ++iter) {
-    API_REGISTER_STATE_CHECK(
-        target_func_(p1, p2, x, &rate_f_mod, &distbysse_f_mod));
+    API_REGISTER_STATE_CHECK(target_func_(p1, p2, x, out_mod));
   }
   aom_usec_timer_mark(&test_timer);
   const int elapsed_time_simd =