Commit b12f167ca2 for aom
commit b12f167ca277634753badd3d05befd4dadd76937
Author: Diksha Singh <diksha.singh@ittiam.com>
Date: Thu May 14 11:33:43 2026 +0530
Refactor av1_interp_cubic_rate_dist()
The function av1_interp_cubic_rate_dist() is refactored to
store the results directly into a 2-element output array.
This enables the use of a single 128-bit store in the
corresponding SIMD function.
Change-Id: If04b39dcde5a877cefa6d77655e8b6a231c87c8f
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 3b04073aa9..989b00d053 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -460,7 +460,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_get_horver_correlation_full/, "const int16_t *diff, int stride, int w, int h, float *hcorr, float *vcorr";
specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;
- add_proto qw/void av1_interp_cubic_rate_dist/, "const double *p1, const double *p2, double x, double * const rate_f, double * const distbysse_f";
+ add_proto qw/void av1_interp_cubic_rate_dist/, "const double *p1, const double *p2, double x, double rate_dist_f[2]";
specialize qw/av1_interp_cubic_rate_dist sse2/;
add_proto qw/void av1_nn_predict/, "const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index 08b81dadfd..0e519bc54e 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -134,12 +134,11 @@ static inline void model_rd_with_curvfit(const AV1_COMP *const cpi,
const double sse_norm = (double)sse / num_samples;
const double qstepsqr = (double)qstep * qstep;
const double xqr = log2(sse_norm / qstepsqr);
- double rate_f, dist_by_sse_norm_f;
- av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
- &dist_by_sse_norm_f);
+ double rate_dist_f[2];
+ av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, rate_dist_f);
- const double dist_f = dist_by_sse_norm_f * sse_norm;
- int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ const double dist_f = rate_dist_f[1] * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_dist_f[0] * num_samples) + 0.5);
int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
// Check if skip is better
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index b5f6b3a725..9a6e89bf01 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -950,10 +950,9 @@ static double interp_cubic(const double *p, double x) {
}
void av1_interp_cubic_rate_dist_c(const double *p1, const double *p2, double x,
- double *const rate_f,
- double *const distbysse_f) {
- *rate_f = interp_cubic(p1, x);
- *distbysse_f = interp_cubic(p2, x);
+ double rate_dist_f[2]) {
+ rate_dist_f[0] = interp_cubic(p1, x);
+ rate_dist_f[1] = interp_cubic(p2, x);
}
/*
@@ -1069,7 +1068,7 @@ static const double interp_dgrid_curv[3][65] = {
};
void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
- double *rate_f, double *distbysse_f) {
+ double rate_dist_f[2]) {
const double x_start = -15.5;
const double x_end = 16.5;
const double x_step = 0.5;
@@ -1088,7 +1087,7 @@ void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
- av1_interp_cubic_rate_dist(prate, pdist, xo, rate_f, distbysse_f);
+ av1_interp_cubic_rate_dist(prate, pdist, xo, rate_dist_f);
}
static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index b97149c96c..c85cc9a8a5 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -258,8 +258,23 @@ void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit,
void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
unsigned int qstep, int *rate, int64_t *dist);
+/*!\brief Estimate rate and distortion for a block.
+ *
+ * \param[in] bsize Block size
+ * \param[in] sse_norm Normalized SSE
+ * \param[in] xqr The log2 ratio of normalized SSE to the
+ * squared quantization step size, computed
+ * as: log2(sse_norm / qstep^2)
+ * \param[out] rate_dist_f Two-element output array:
+ * rate_dist_f[0] stores the estimated rate
+ * rate_dist_f[1] stores the estimated
+ * distortion divided by normalized SSE
+ *
+ * \remark Nothing is returned. Results are saved in rate_dist_f[0]
+ * and rate_dist_f[1].
+ */
void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
- double *rate_f, double *distbysse_f);
+ double rate_dist_f[2]);
int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
InterpFilter interp_filter, int dual_filter);
diff --git a/av1/encoder/x86/model_rd_sse2.c b/av1/encoder/x86/model_rd_sse2.c
index 9986666de5..1d6e9460ad 100644
--- a/av1/encoder/x86/model_rd_sse2.c
+++ b/av1/encoder/x86/model_rd_sse2.c
@@ -14,8 +14,7 @@
#include "config/av1_rtcd.h"
void av1_interp_cubic_rate_dist_sse2(const double *p1, const double *p2,
- double x, double *const rate_f,
- double *const distbysse_f) {
+ double x, double rate_dist_f[2]) {
const __m128d half = _mm_set1_pd(0.5);
const __m128d two = _mm_set1_pd(2.0);
const __m128d three = _mm_set1_pd(3.0);
@@ -57,6 +56,5 @@ void av1_interp_cubic_rate_dist_sse2(const double *p1, const double *p2,
__m128d reg_res_4 = _mm_mul_pd(_mm_mul_pd(half, reg_x), reg_res_3);
reg_res_4 = _mm_add_pd(reg_p1, reg_res_4);
- _mm_storel_pd(rate_f, reg_res_4);
- _mm_storeh_pd(distbysse_f, reg_res_4);
+ _mm_storeu_pd(rate_dist_f, reg_res_4);
}
diff --git a/test/model_rd_test.cc b/test/model_rd_test.cc
index 40a88d049a..7c114106a0 100644
--- a/test/model_rd_test.cc
+++ b/test/model_rd_test.cc
@@ -22,8 +22,7 @@
namespace {
using InterpCubicRateDistFunc = void (*)(const double *p1, const double *p2,
- double x, double *const rate_f,
- double *const distbysse_f);
+ double x, double rate_dist_f[2]);
class InterpCubicTest
: public ::testing::TestWithParam<InterpCubicRateDistFunc> {
@@ -58,8 +57,7 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InterpCubicTest);
#endif // AOM_ARCH_X86 || AOM_ARCH_X86_64
void InterpCubicTest::CheckOutput() {
- double p1[4], p2[4];
- double rate_f_ref, rate_f_mod, distbysse_f_ref, distbysse_f_mod;
+ double p1[4], p2[4], out_ref[2], out_mod[2];
constexpr int kNumIters = 10000;
for (int iter = 0; iter < kNumIters; ++iter) {
for (int i = 0; i < 4; ++i) {
@@ -69,20 +67,18 @@ void InterpCubicTest::CheckOutput() {
const double x = GenerateRandomDouble(0.0000, 1.0000);
FLOATING_POINT_SET_PRECISION
- av1_interp_cubic_rate_dist_c(p1, p2, x, &rate_f_ref, &distbysse_f_ref);
+ av1_interp_cubic_rate_dist_c(p1, p2, x, out_ref);
FLOATING_POINT_RESTORE_PRECISION
- API_REGISTER_STATE_CHECK(
- target_func_(p1, p2, x, &rate_f_mod, &distbysse_f_mod));
- EXPECT_EQ(rate_f_ref, rate_f_mod) << "Error: rate_f value mismatch";
- EXPECT_EQ(distbysse_f_ref, distbysse_f_mod)
- << "Error: distbysse_f value mismatch";
+ API_REGISTER_STATE_CHECK(target_func_(p1, p2, x, out_mod));
+
+ EXPECT_EQ(out_ref[0], out_mod[0]) << "Error: rate_f value mismatch";
+ EXPECT_EQ(out_ref[1], out_mod[1]) << "Error: distbysse_f value mismatch";
}
}
void InterpCubicTest::SpeedTest() {
- double p1[4], p2[4];
- double rate_f_ref, rate_f_mod, distbysse_f_ref, distbysse_f_mod;
+ double p1[4], p2[4], out_ref[2], out_mod[2];
for (int i = 0; i < 4; ++i) {
p1[i] = GenerateRandomDouble(0.0000, 4096.0000);
@@ -96,7 +92,7 @@ void InterpCubicTest::SpeedTest() {
FLOATING_POINT_SET_PRECISION
aom_usec_timer_start(&ref_timer);
for (int iter = 0; iter < kNumIters; ++iter) {
- av1_interp_cubic_rate_dist_c(p1, p2, x, &rate_f_ref, &distbysse_f_ref);
+ av1_interp_cubic_rate_dist_c(p1, p2, x, out_ref);
}
aom_usec_timer_mark(&ref_timer);
FLOATING_POINT_RESTORE_PRECISION
@@ -105,8 +101,7 @@ void InterpCubicTest::SpeedTest() {
aom_usec_timer_start(&test_timer);
for (int iter = 0; iter < kNumIters; ++iter) {
- API_REGISTER_STATE_CHECK(
- target_func_(p1, p2, x, &rate_f_mod, &distbysse_f_mod));
+ API_REGISTER_STATE_CHECK(target_func_(p1, p2, x, out_mod));
}
aom_usec_timer_mark(&test_timer);
const int elapsed_time_simd =