Commit f55fa5ee4e for aom
commit f55fa5ee4eebad77d9271198436f27123340ed52
Author: Jerome Jiang <jianj@google.com>
Date: Thu Jun 11 14:08:33 2026 -0400
Optimize update_coeff_* functions in txb_rdopt.c
This change is bit-exact.
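Avoid calling get_coeff_dist() with a dequantized value of 0 (the call
that computes dist0) when qmatrix == NULL.
This is achieved by rewriting the RD cost comparison in terms of
distortions measured relative to dist0 (dist_diff_0 = dist - dist0
and dist_diff_low_0 = dist_low - dist0). When qmatrix is NULL these
differences are computed directly, e.g.
  dist_diff_0 = dqc * (dqc - 2 * tqc) * (1 << (2 * shift)),
so dist0 itself is never needed.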
Because RDCOST adds dist * 128 to the rate cost, the same dist0 * 128
term appears on both sides of the inequality and cancels out:
      rd_low < rd
  <=> RDCOST(rdmult, rate_low, dist_low) < RDCOST(rdmult, rate, dist)
  <=> RDCOST(rdmult, rate_low, dist_diff_low_0) <
        RDCOST(rdmult, rate, dist_diff_0)
The comparison therefore gives exactly the same decision, still using
the standard RDCOST macro, while skipping the computation of dist0
entirely when qmatrix is NULL. The amount accumulated into *accu_dist
was already dist - dist0 (or dist_low - dist0), so it is unchanged too.
Speed up:
speed 1: 0.096%
speed 2: 0.118%
speed 3: 0.115%
speed 4: 0.025%
speed 5: 0.022%
Change-Id: I360628cfcde67709255bd53a37396cca252ac069
diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c
index fb0ef12860..14c903eba1 100644
--- a/av1/encoder/txb_rdopt.c
+++ b/av1/encoder/txb_rdopt.c
@@ -34,41 +34,53 @@ static inline void update_coeff_general(
const tran_low_t abs_qc = abs(qc);
const tran_low_t tqc = tcoeff[ci];
const tran_low_t dqc = dqcoeff[ci];
- const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
- const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
const int rate =
get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
dc_sign_ctx, txb_costs, bhl, tx_class, levels);
- const int64_t rd = RDCOST(rdmult, rate, dist);
- tran_low_t qc_low, dqc_low;
- tran_low_t abs_qc_low;
- int64_t dist_low, rd_low;
+ tran_low_t qc_low = 0, dqc_low = 0;
+ tran_low_t abs_qc_low = 0;
int rate_low;
if (abs_qc == 1) {
- abs_qc_low = qc_low = dqc_low = 0;
- dist_low = dist0;
rate_low = txb_costs->base_cost[coeff_ctx][0];
} else {
const int dqv = get_dqv(dequant, scan[si], iqmatrix);
get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
abs_qc_low = abs_qc - 1;
- dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci);
rate_low =
get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
dc_sign_ctx, txb_costs, bhl, tx_class, levels);
}
- rd_low = RDCOST(rdmult, rate_low, dist_low);
+ int64_t dist_diff_0, dist_diff_low_0;
+ if (qmatrix == NULL) {
+ const int64_t tqc2 = (int64_t)tqc * 2;
+ dist_diff_0 = ((int64_t)dqc * (dqc - tqc2)) * (1 << (2 * shift));
+ dist_diff_low_0 =
+ (abs_qc == 1)
+ ? 0
+ : (((int64_t)dqc_low * (dqc_low - tqc2)) * (1 << (2 * shift)));
+ } else {
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ dist_diff_0 = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
+ dist_diff_low_0 =
+ (abs_qc == 1)
+ ? 0
+ : (get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0);
+ }
+
+ const int64_t rd = RDCOST(rdmult, rate, dist_diff_0);
+ const int64_t rd_low = RDCOST(rdmult, rate_low, dist_diff_low_0);
+
if (rd_low < rd) {
qcoeff[ci] = qc_low;
dqcoeff[ci] = dqc_low;
levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
*accu_rate += rate_low;
- *accu_dist += dist_low - dist0;
+ *accu_dist += dist_diff_low_0;
} else {
*accu_rate += rate;
- *accu_dist += dist - dist0;
+ *accu_dist += dist_diff_0;
}
}
}
@@ -103,24 +115,30 @@ static AOM_FORCE_INLINE void update_coeff_simple(
return;
}
- const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
- const int64_t rd = RDCOST(rdmult, rate, dist);
-
- const int64_t dist_low =
- get_coeff_dist(abs_tqc, /*abs_dqc_low*/ 0, shift, qmatrix, ci);
- const int rate_low = rate - base_cost[5];
- const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
-
const int allow_lower_qc = sharpness ? 0 : 1;
-
- if (rd_low < rd && allow_lower_qc) {
- qcoeff[ci] = 0;
- dqcoeff[ci] = 0;
- levels[get_padded_idx(ci, bhl)] = 0;
- *accu_rate += rate_low;
- } else {
- *accu_rate += rate;
+ if (allow_lower_qc) {
+ int64_t dist_diff_0;
+ if (qmatrix == NULL) {
+ dist_diff_0 =
+ ((int64_t)abs_dqc * (abs_dqc - ((int64_t)abs_tqc * 2))) *
+ (1 << (2 * shift));
+ } else {
+ const int64_t dist0 = get_coeff_dist(abs_tqc, 0, shift, qmatrix, ci);
+ dist_diff_0 =
+ get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci) - dist0;
+ }
+ const int rate_low = rate - base_cost[5];
+ const int64_t rd = RDCOST(rdmult, rate, dist_diff_0);
+ const int64_t rd_low = RDCOST(rdmult, rate_low, 0);
+ if (rd_low < rd) {
+ qcoeff[ci] = 0;
+ dqcoeff[ci] = 0;
+ levels[get_padded_idx(ci, bhl)] = 0;
+ *accu_rate += rate_low;
+ return;
+ }
}
+ *accu_rate += rate;
} else {
int rate_low = 0;
const int rate = get_two_coeff_cost_simple(
@@ -130,27 +148,37 @@ static AOM_FORCE_INLINE void update_coeff_simple(
return;
}
- const int dqv = get_dqv(dequant, scan[si], iqmatrix);
- const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
- const int64_t rd = RDCOST(rdmult, rate, dist);
-
- const tran_low_t abs_qc_low = abs_qc - 1;
- const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
- const int64_t dist_low =
- get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci);
- const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
-
const int allow_lower_qc = sharpness ? (abs_qc > 1) : 1;
-
- if (rd_low < rd && allow_lower_qc) {
- const int sign = (qc < 0) ? 1 : 0;
- qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
- dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
- levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
- *accu_rate += rate_low;
- } else {
- *accu_rate += rate;
+ if (allow_lower_qc) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ const tran_low_t abs_qc_low = abs_qc - 1;
+ const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+ int64_t dist_diff_0, dist_diff_low_0;
+ if (qmatrix == NULL) {
+ const int64_t abs_tqc2 = (int64_t)abs_tqc << 1;
+ dist_diff_0 =
+ ((int64_t)abs_dqc * (abs_dqc - abs_tqc2)) * (1 << (2 * shift));
+ dist_diff_low_0 = ((int64_t)abs_dqc_low * (abs_dqc_low - abs_tqc2)) *
+ (1 << (2 * shift));
+ } else {
+ const int64_t dist0 = get_coeff_dist(abs_tqc, 0, shift, qmatrix, ci);
+ dist_diff_0 =
+ get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci) - dist0;
+ dist_diff_low_0 =
+ get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci) - dist0;
+ }
+ const int64_t rd = RDCOST(rdmult, rate, dist_diff_0);
+ const int64_t rd_low = RDCOST(rdmult, rate_low, dist_diff_low_0);
+ if (rd_low < rd) {
+ const int sign = (qc < 0) ? 1 : 0;
+ qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+ dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ return;
+ }
}
+ *accu_rate += rate;
}
}
}
@@ -177,34 +205,45 @@ static AOM_FORCE_INLINE void update_coeff_eob(
const tran_low_t tqc = tcoeff[ci];
const tran_low_t dqc = dqcoeff[ci];
const int sign = (qc < 0) ? 1 : 0;
- const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
- int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
- int rate =
- get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
- txb_costs, bhl, tx_class, levels);
- int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
- tran_low_t qc_low, dqc_low;
- tran_low_t abs_qc_low;
- int64_t dist_low, rd_low;
+ tran_low_t qc_low = 0, dqc_low = 0;
+ tran_low_t abs_qc_low = 0;
int rate_low;
if (abs_qc == 1) {
- abs_qc_low = 0;
- dqc_low = qc_low = 0;
- dist_low = 0;
rate_low = txb_costs->base_cost[coeff_ctx][0];
- rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
} else {
get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
abs_qc_low = abs_qc - 1;
- dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0;
rate_low =
get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
dc_sign_ctx, txb_costs, bhl, tx_class, levels);
- rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
}
+ int64_t dist, dist_low;
+ if (qmatrix == NULL) {
+ const int64_t tqc2 = (int64_t)tqc * 2;
+ dist = ((int64_t)dqc * (dqc - tqc2)) * (1 << (2 * shift));
+ dist_low =
+ (abs_qc == 1)
+ ? 0
+ : (((int64_t)dqc_low * (dqc_low - tqc2)) * (1 << (2 * shift)));
+ } else {
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
+ dist_low =
+ (abs_qc == 1)
+ ? 0
+ : (get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0);
+ }
+
+ int rate =
+ get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+ txb_costs, bhl, tx_class, levels);
+ int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
+ int64_t rd_low =
+ RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+
int lower_level_new_eob = 0;
const int new_eob = si + 1;
const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bhl, width, si);