Commit f55fa5ee4e for aom

commit f55fa5ee4eebad77d9271198436f27123340ed52
Author: Jerome Jiang <jianj@google.com>
Date:   Thu Jun 11 14:08:33 2026 -0400

    Optimize update_coeff_* functions in txb_rdopt.c

    This change is bit-exact.

    Avoid calling get_coeff_dist() with a dequantized coefficient of 0
    (the call that computes dist0) when qmatrix == NULL.

    This is achieved by rewriting the RD cost comparison using relative
    offset distortions compared to dist0 (dist_diff_0 = dist - dist0
    and dist_diff_low_0 = dist_low - dist0).

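    RDCOST adds its distortion argument scaled by 128, so subtracting
    dist0 from both distortions removes the same dist0 * 128 term from
    both sides of the inequality and leaves the comparison unchanged:
      rd_low < rd
      <=> RDCOST(rdmult, rate_low, dist_low) < RDCOST(rdmult, rate, dist)
      <=> RDCOST(rdmult, rate_low, dist_diff_low_0) <
          RDCOST(rdmult, rate, dist_diff_0)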

    The comparison therefore yields exactly the same decision while
    still using the standard RDCOST macro. When qmatrix is NULL the
    offset distortions are computed directly, e.g.
      dist_diff_0 = dqc * (dqc - 2 * tqc) * (1 << (2 * shift))
    so dist0 itself is never computed on that path.

    Speed up:

    speed 1: 0.096%
    speed 2: 0.118%
    speed 3: 0.115%
    speed 4: 0.025%
    speed 5: 0.022%

    Change-Id: I360628cfcde67709255bd53a37396cca252ac069

diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c
index fb0ef12860..14c903eba1 100644
--- a/av1/encoder/txb_rdopt.c
+++ b/av1/encoder/txb_rdopt.c
@@ -34,41 +34,53 @@ static inline void update_coeff_general(
     const tran_low_t abs_qc = abs(qc);
     const tran_low_t tqc = tcoeff[ci];
     const tran_low_t dqc = dqcoeff[ci];
-    const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
-    const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
     const int rate =
         get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
                                dc_sign_ctx, txb_costs, bhl, tx_class, levels);
-    const int64_t rd = RDCOST(rdmult, rate, dist);

-    tran_low_t qc_low, dqc_low;
-    tran_low_t abs_qc_low;
-    int64_t dist_low, rd_low;
+    tran_low_t qc_low = 0, dqc_low = 0;
+    tran_low_t abs_qc_low = 0;
     int rate_low;
     if (abs_qc == 1) {
-      abs_qc_low = qc_low = dqc_low = 0;
-      dist_low = dist0;
       rate_low = txb_costs->base_cost[coeff_ctx][0];
     } else {
       const int dqv = get_dqv(dequant, scan[si], iqmatrix);
       get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
       abs_qc_low = abs_qc - 1;
-      dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci);
       rate_low =
           get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
                                  dc_sign_ctx, txb_costs, bhl, tx_class, levels);
     }

-    rd_low = RDCOST(rdmult, rate_low, dist_low);
+    int64_t dist_diff_0, dist_diff_low_0;
+    if (qmatrix == NULL) {
+      const int64_t tqc2 = (int64_t)tqc * 2;
+      dist_diff_0 = ((int64_t)dqc * (dqc - tqc2)) * (1 << (2 * shift));
+      dist_diff_low_0 =
+          (abs_qc == 1)
+              ? 0
+              : (((int64_t)dqc_low * (dqc_low - tqc2)) * (1 << (2 * shift)));
+    } else {
+      const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+      dist_diff_0 = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
+      dist_diff_low_0 =
+          (abs_qc == 1)
+              ? 0
+              : (get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0);
+    }
+
+    const int64_t rd = RDCOST(rdmult, rate, dist_diff_0);
+    const int64_t rd_low = RDCOST(rdmult, rate_low, dist_diff_low_0);
+
     if (rd_low < rd) {
       qcoeff[ci] = qc_low;
       dqcoeff[ci] = dqc_low;
       levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
       *accu_rate += rate_low;
-      *accu_dist += dist_low - dist0;
+      *accu_dist += dist_diff_low_0;
     } else {
       *accu_rate += rate;
-      *accu_dist += dist - dist0;
+      *accu_dist += dist_diff_0;
     }
   }
 }
@@ -103,24 +115,30 @@ static AOM_FORCE_INLINE void update_coeff_simple(
         return;
       }

-      const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
-      const int64_t rd = RDCOST(rdmult, rate, dist);
-
-      const int64_t dist_low =
-          get_coeff_dist(abs_tqc, /*abs_dqc_low*/ 0, shift, qmatrix, ci);
-      const int rate_low = rate - base_cost[5];
-      const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
-
       const int allow_lower_qc = sharpness ? 0 : 1;
-
-      if (rd_low < rd && allow_lower_qc) {
-        qcoeff[ci] = 0;
-        dqcoeff[ci] = 0;
-        levels[get_padded_idx(ci, bhl)] = 0;
-        *accu_rate += rate_low;
-      } else {
-        *accu_rate += rate;
+      if (allow_lower_qc) {
+        int64_t dist_diff_0;
+        if (qmatrix == NULL) {
+          dist_diff_0 =
+              ((int64_t)abs_dqc * (abs_dqc - ((int64_t)abs_tqc * 2))) *
+              (1 << (2 * shift));
+        } else {
+          const int64_t dist0 = get_coeff_dist(abs_tqc, 0, shift, qmatrix, ci);
+          dist_diff_0 =
+              get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci) - dist0;
+        }
+        const int rate_low = rate - base_cost[5];
+        const int64_t rd = RDCOST(rdmult, rate, dist_diff_0);
+        const int64_t rd_low = RDCOST(rdmult, rate_low, 0);
+        if (rd_low < rd) {
+          qcoeff[ci] = 0;
+          dqcoeff[ci] = 0;
+          levels[get_padded_idx(ci, bhl)] = 0;
+          *accu_rate += rate_low;
+          return;
+        }
       }
+      *accu_rate += rate;
     } else {
       int rate_low = 0;
       const int rate = get_two_coeff_cost_simple(
@@ -130,27 +148,37 @@ static AOM_FORCE_INLINE void update_coeff_simple(
         return;
       }

-      const int dqv = get_dqv(dequant, scan[si], iqmatrix);
-      const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
-      const int64_t rd = RDCOST(rdmult, rate, dist);
-
-      const tran_low_t abs_qc_low = abs_qc - 1;
-      const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
-      const int64_t dist_low =
-          get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci);
-      const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
-
       const int allow_lower_qc = sharpness ? (abs_qc > 1) : 1;
-
-      if (rd_low < rd && allow_lower_qc) {
-        const int sign = (qc < 0) ? 1 : 0;
-        qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
-        dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
-        levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
-        *accu_rate += rate_low;
-      } else {
-        *accu_rate += rate;
+      if (allow_lower_qc) {
+        const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+        const tran_low_t abs_qc_low = abs_qc - 1;
+        const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+        int64_t dist_diff_0, dist_diff_low_0;
+        if (qmatrix == NULL) {
+          const int64_t abs_tqc2 = (int64_t)abs_tqc << 1;
+          dist_diff_0 =
+              ((int64_t)abs_dqc * (abs_dqc - abs_tqc2)) * (1 << (2 * shift));
+          dist_diff_low_0 = ((int64_t)abs_dqc_low * (abs_dqc_low - abs_tqc2)) *
+                            (1 << (2 * shift));
+        } else {
+          const int64_t dist0 = get_coeff_dist(abs_tqc, 0, shift, qmatrix, ci);
+          dist_diff_0 =
+              get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci) - dist0;
+          dist_diff_low_0 =
+              get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci) - dist0;
+        }
+        const int64_t rd = RDCOST(rdmult, rate, dist_diff_0);
+        const int64_t rd_low = RDCOST(rdmult, rate_low, dist_diff_low_0);
+        if (rd_low < rd) {
+          const int sign = (qc < 0) ? 1 : 0;
+          qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+          dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
+          levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+          *accu_rate += rate_low;
+          return;
+        }
       }
+      *accu_rate += rate;
     }
   }
 }
@@ -177,34 +205,45 @@ static AOM_FORCE_INLINE void update_coeff_eob(
     const tran_low_t tqc = tcoeff[ci];
     const tran_low_t dqc = dqcoeff[ci];
     const int sign = (qc < 0) ? 1 : 0;
-    const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
-    int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
-    int rate =
-        get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
-                               txb_costs, bhl, tx_class, levels);
-    int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);

-    tran_low_t qc_low, dqc_low;
-    tran_low_t abs_qc_low;
-    int64_t dist_low, rd_low;
+    tran_low_t qc_low = 0, dqc_low = 0;
+    tran_low_t abs_qc_low = 0;
     int rate_low;

     if (abs_qc == 1) {
-      abs_qc_low = 0;
-      dqc_low = qc_low = 0;
-      dist_low = 0;
       rate_low = txb_costs->base_cost[coeff_ctx][0];
-      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
     } else {
       get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
       abs_qc_low = abs_qc - 1;
-      dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0;
       rate_low =
           get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
                                  dc_sign_ctx, txb_costs, bhl, tx_class, levels);
-      rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
     }

+    int64_t dist, dist_low;
+    if (qmatrix == NULL) {
+      const int64_t tqc2 = (int64_t)tqc * 2;
+      dist = ((int64_t)dqc * (dqc - tqc2)) * (1 << (2 * shift));
+      dist_low =
+          (abs_qc == 1)
+              ? 0
+              : (((int64_t)dqc_low * (dqc_low - tqc2)) * (1 << (2 * shift)));
+    } else {
+      const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+      dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
+      dist_low =
+          (abs_qc == 1)
+              ? 0
+              : (get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0);
+    }
+
+    int rate =
+        get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+                               txb_costs, bhl, tx_class, levels);
+    int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
+    int64_t rd_low =
+        RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+
     int lower_level_new_eob = 0;
     const int new_eob = si + 1;
     const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bhl, width, si);