Commit 1c1c4abc43 for aom

commit 1c1c4abc43937d87732eef1ebb8e57e4e44eded6
Author: Apurve Kumar Pandey <apurve.pandey@ittiam.com>
Date:   Thu May 21 14:00:43 2026 +0530

    Fill residual pixels outside the actual frame border

    - Residual pixels of a transform block that lie outside the
    actual frame boundary are overwritten: with zero for the
    identity transform, and with the average of the block's
    in-frame residual pixels for other transforms.
    - Distortion and SSE calculations are limited to the actual
    (visible) width and height.

    Encoder performance results for various resolutions are as
    follows:

                    Instruction Count               BD-Rate Loss(%)
    cpu Resolution    Reduction(%)    avg.psnr  ovr.psnr   ssim     vmaf   vmaf_neg
     1    LOWRES2       -0.199        -0.3875   -0.3869  -0.4015  -0.3944  -0.3830
     1    MIDRES2       -0.889        -0.1784   -0.1773  -0.1935  -0.2374  -0.1984
     1    HDRES2        -0.268        -0.0686   -0.0788  -0.1383  -0.1734  -0.0232

    STATS_CHANGED

    Change-Id: Ia424bc2bcabecbb8fd975c8923f570e7df0a5608

diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index df736a1846..f37a8259cc 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -72,6 +72,9 @@

 #define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))

+#define DIVIDE_AND_ROUND_SIGNED(n, d) \
+  ((((n) < 0) ^ ((d) < 0)) ? (((n) - (d) / 2) / (d)) : (((n) + (d) / 2) / (d)))
+
 #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c
index 50e0ba25d9..ec8ec2680d 100644
--- a/av1/encoder/allintra_vis.c
+++ b/av1/encoder/allintra_vis.c
@@ -349,8 +349,10 @@ void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
           xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
           block_size, block_size, tx_size, mode, 0, 0, FILTER_INTRA_MODES,
           mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
-      av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
-                         mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+      av1_subtract_block(x, block_size, block_size, src_diff, block_size,
+                         mb_buffer, buf_stride, dst_buffer, dst_buffer_stride,
+                         PLANE_TYPE_Y, bsize, 0, 0, DCT_DCT,
+                         cpi->do_border_pad);
       av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
       int intra_cost = aom_satd(coeff, coeff_count);
       if (intra_cost < best_intra_cost) {
@@ -363,8 +365,9 @@ void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
         xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
         block_size, block_size, tx_size, best_mode, 0, 0, FILTER_INTRA_MODES,
         mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
-    av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
-                       mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+    av1_subtract_block(x, block_size, block_size, src_diff, block_size,
+                       mb_buffer, buf_stride, dst_buffer, dst_buffer_stride,
+                       PLANE_TYPE_Y, bsize, 0, 0, DCT_DCT, cpi->do_border_pad);
     av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);

     const struct macroblock_plane *const p = &x->plane[0];
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index e811fcf740..ffda806a08 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1438,6 +1438,12 @@ typedef struct macroblock {
    */
   RD_STATS *rdcost;
 #endif  // CONFIG_PARTITION_SEARCH_ORDER
+
+  /*! \brief Distance from bottom edge of the frame in pixels. */
+  int pix_to_bottom_edge;
+
+  /*! \brief Distance from right edge of the frame in pixels. */
+  int pix_to_right_edge;
 } MACROBLOCK;
 #undef SINGLE_REF_MODES

diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index be175a9989..23a912e205 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -488,7 +488,6 @@ static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
                                    RD_STATS *rd_stats) {
   MACROBLOCKD *const xd = &x->e_mbd;
   if (ref_best_rd < 0) return INT64_MAX;
-  av1_subtract_plane(x, bs, 0);
   const int64_t rd = av1_estimate_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs,
                                            max_txsize_rect_lookup[bs]);
   if (rd != INT64_MAX) {
@@ -1072,7 +1071,7 @@ static inline int prune_mode_by_skip_rd(const AV1_COMP *const cpi,
                              TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
   // Check if the mode is good enough based on skip rd
   if (txfm_rd_gate_level) {
-    int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
+    int64_t sse_y = compute_sse_plane(cpi, x, xd, PLANE_TYPE_Y, bsize);
     int64_t skip_rd = RDCOST(x->rdmult, mode_rate, (sse_y << 4));
     eval_txfm =
         check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, txfm_rd_gate_level, 1);
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index bf5c697bde..7cd3a0273c 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -35,27 +35,152 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"

-void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+// Compute the average value of the wxh block.
+static inline int16_t avg_wxh_block_c(int16_t *diff, ptrdiff_t diff_stride,
+                                      int w, int h) {
+  int32_t sum = 0;
+  for (int row = 0; row < h; ++row) {
+    for (int col = 0; col < w; ++col) {
+      sum += *(diff + row * diff_stride + col);
+    }
+  }
+  return (w * h > 0) ? (int16_t)(DIVIDE_AND_ROUND_SIGNED(sum, w * h)) : 0;
+}
+
+// Compute the per-row averages of the wxh block into out[row].
+static inline void avg_wxh_block_horiz_c(int16_t *diff, ptrdiff_t diff_stride,
+                                         int w, int h, int16_t *out) {
+  for (int row = 0; row < h; ++row) {
+    int32_t sum = 0;
+    for (int col = 0; col < w; ++col) {
+      sum += *(diff + row * diff_stride + col);
+    }
+    out[row] = (w > 0) ? (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, w) : 0;
+  }
+}
+
+// Compute the per-column averages of the wxh block into out[col].
+static inline void avg_wxh_block_vert_c(int16_t *diff, ptrdiff_t diff_stride,
+                                        int w, int h, int16_t *out) {
+  for (int col = 0; col < w; ++col) {
+    int32_t sum = 0;
+    for (int row = 0; row < h; ++row) {
+      sum += *(diff + row * diff_stride + col);
+    }
+    out[col] = (h > 0) ? (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, h) : 0;
+  }
+}
+
+static inline void *aom_memset_int16(void *dest, int16_t val, size_t length) {
+  size_t i;
+  int16_t *dest16 = (int16_t *)dest;
+  for (i = 0; i < length; i++) *dest16++ = val;
+  return dest;
+}
+
+// Fill the outside-frame part's residues with values derived from the in-frame
+// part's residues.
+static inline void fill_residue_outside_frame(
+    int16_t *diff, ptrdiff_t diff_stride, int tx_cols, int tx_rows,
+    int visible_tx_cols, int visible_tx_rows, TX_TYPE tx_type) {
+  const int complete_block_outside =
+      (visible_tx_cols == 0 || visible_tx_rows == 0);
+
+  if (tx_type <= IDTX) {
+    int16_t avg = 0;
+    if (tx_type != IDTX && !complete_block_outside)
+      avg =
+          avg_wxh_block_c(diff, diff_stride, visible_tx_cols, visible_tx_rows);
+
+    // Fill out-of-frame parts with avg (0 for IDTX or fully outside blocks).
+    const int right_pixels = tx_cols - visible_tx_cols;
+    for (int i = 0; i < tx_rows; ++i) {
+      aom_memset_int16(diff + i * diff_stride + visible_tx_cols, avg,
+                       right_pixels);
+    }
+
+    for (int i = visible_tx_rows; i < tx_rows; ++i) {
+      aom_memset_int16(diff + i * diff_stride, avg, visible_tx_cols);
+    }
+  } else if (htx_tab[tx_type] == IDTX_1D) {
+    if (visible_tx_rows < tx_rows) {
+      int16_t out[64] = { 0 };
+      if (!complete_block_outside)
+        avg_wxh_block_vert_c(diff, diff_stride, visible_tx_cols,
+                             visible_tx_rows, out);
+
+      for (int j = 0; j < visible_tx_cols; j++) {
+        for (int i = visible_tx_rows; i < tx_rows; ++i) {
+          *(diff + i * diff_stride + j) = out[j];
+        }
+      }
+    }
+
+    const int right_pixels = tx_cols - visible_tx_cols;
+    if (right_pixels) {
+      for (int i = 0; i < tx_rows; ++i) {
+        memset(diff + i * diff_stride + visible_tx_cols, 0,
+               right_pixels * sizeof(*diff));
+      }
+    }
+  } else {
+    assert(vtx_tab[tx_type] == IDTX_1D);
+
+    const int right_pixels = tx_cols - visible_tx_cols;
+    if (right_pixels) {
+      int16_t out[64] = { 0 };
+      if (!complete_block_outside)
+        avg_wxh_block_horiz_c(diff, diff_stride, visible_tx_cols,
+                              visible_tx_rows, out);
+
+      for (int i = 0; i < visible_tx_rows; ++i) {
+        aom_memset_int16(diff + i * diff_stride + visible_tx_cols, out[i],
+                         right_pixels);
+      }
+    }
+
+    for (int i = visible_tx_rows; i < tx_rows; ++i) {
+      memset(diff + i * diff_stride, 0, tx_cols * sizeof(*diff));
+    }
+  }
+}
+
+void av1_subtract_block(const MACROBLOCK *x, int rows, int cols, int16_t *diff,
                         ptrdiff_t diff_stride, const uint8_t *src8,
                         ptrdiff_t src_stride, const uint8_t *pred8,
-                        ptrdiff_t pred_stride) {
+                        ptrdiff_t pred_stride, int plane,
+                        BLOCK_SIZE plane_bsize, int blk_col, int blk_row,
+                        TX_TYPE tx_type, bool do_border_pad) {
   assert(rows >= 4 && cols >= 4);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  BitDepthInfo bd_info = get_bit_depth_info(xd);
 #if CONFIG_AV1_HIGHBITDEPTH
   if (bd_info.use_highbitdepth_buf) {
     aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
                               pred8, pred_stride);
-    return;
+  } else {
+    aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+                       pred_stride);
   }
-#endif
+#else
   (void)bd_info;
   aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
                      pred_stride);
+#endif
+  if (!do_border_pad) return;
+
+  int visible_cols, visible_rows;
+  const int is_border_block =
+      get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, cols,
+                             rows, &visible_cols, &visible_rows, true);
+  if (is_border_block)
+    fill_residue_outside_frame(diff, diff_stride, cols, rows, visible_cols,
+                               visible_rows, tx_type);
 }

 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
-                      int blk_col, int blk_row, TX_SIZE tx_size) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+                      int blk_col, int blk_row, TX_SIZE tx_size,
+                      TX_TYPE tx_type, bool do_border_pad) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
   const int diff_stride = block_size_wide[plane_bsize];
@@ -67,21 +192,22 @@ void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
   uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
   int16_t *src_diff =
       &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
-  av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride,
-                     src, src_stride, dst, dst_stride);
+  av1_subtract_block(x, tx1d_height, tx1d_width, src_diff, diff_stride, src,
+                     src_stride, dst, dst_stride, plane, plane_bsize, blk_col,
+                     blk_row, tx_type, do_border_pad);
 }

-void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane,
+                        bool do_border_pad) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
   assert(plane_bsize < BLOCK_SIZES_ALL);
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const BitDepthInfo bd_info = get_bit_depth_info(xd);

-  av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf,
-                     p->src.stride, pd->dst.buf, pd->dst.stride);
+  av1_subtract_block(x, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride, plane, plane_bsize, 0, 0,
+                     DCT_DCT, do_border_pad);
 }

 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
@@ -401,6 +527,9 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
   if (!mbmi->skip_mode) {
     tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size,
                               cm->features.reduced_tx_set_used);
+
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size, tx_type,
+                     cpi->do_border_pad);
     TxfmParam txfm_param;
     QUANT_PARAM quant_param;
     const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run);
@@ -628,7 +757,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,

 void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) {
   encode_block_pass1_args args = { cpi, x };
-  av1_subtract_plane(x, bsize, 0);
+  av1_subtract_plane(x, bsize, PLANE_TYPE_Y, cpi->do_border_pad);
   av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
                                          encode_block_pass1, &args);
 }
@@ -672,7 +801,6 @@ void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     const int step =
         tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
     av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]);
-    av1_subtract_plane(x, plane_bsize, plane);
     arg.ta = ctx.ta[plane];
     arg.tl = ctx.tl[plane];
     const BLOCK_SIZE max_unit_bsize =
@@ -723,12 +851,13 @@ static void encode_block_intra(int plane, int block, int blk_row, int blk_col,
     *eob = 0;
     p->txb_entropy_ctx[block] = 0;
   } else {
-    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
-
     const ENTROPY_CONTEXT *a = &args->ta[blk_col];
     const ENTROPY_CONTEXT *l = &args->tl[blk_row];
     tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
                               cm->features.reduced_tx_set_used);
+    TX_TYPE primary_tx_type = is_stat_generation_stage(cpi) ? DCT_DCT : tx_type;
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
+                     primary_tx_type, cpi->do_border_pad);
     TxfmParam txfm_param;
     QUANT_PARAM quant_param;
     const int use_trellis =
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index b35265cc2e..822032a903 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -128,15 +128,19 @@ void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
                             TX_SIZE tx_size, TX_TYPE tx_type,
                             int dropout_num_before, int dropout_num_after);

-void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+void av1_subtract_block(const MACROBLOCK *x, int rows, int cols, int16_t *diff,
                         ptrdiff_t diff_stride, const uint8_t *src8,
                         ptrdiff_t src_stride, const uint8_t *pred8,
-                        ptrdiff_t pred_stride);
+                        ptrdiff_t pred_stride, int plane,
+                        BLOCK_SIZE plane_bsize, int blk_col, int blk_row,
+                        TX_TYPE tx_type, bool do_border_pad);

 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
-                      int blk_col, int blk_row, TX_SIZE tx_size);
+                      int blk_col, int blk_row, TX_SIZE tx_size,
+                      TX_TYPE tx_type, bool do_border_pad);

-void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane);
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane,
+                        bool do_border_pad);

 static inline void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
                                        TX_SIZE tx_size, ENTROPY_CONTEXT *a,
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 79b92f8bc8..3842190096 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4553,6 +4553,16 @@ int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, size_t dest_size,
   cm->show_existing_frame = frame_params->show_existing_frame;
   cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show;

+  if (cpi->oxcf.mode == GOOD &&
+      cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE &&
+      cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+      !cpi->common.seg.enabled && !cpi->roi.enabled && !cpi->oxcf.sb_qp_sweep &&
+      !cpi->use_ducky_encode && cpi->oxcf.algo_cfg.sharpness != 3) {
+    cpi->do_border_pad = true;
+  } else {
+    cpi->do_border_pad = false;
+  }
+
   memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
          REF_FRAMES * sizeof(*cm->remapped_ref_idx));

diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index bf47b76170..9505d0af53 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -3737,6 +3737,11 @@ typedef struct AV1_COMP {
    * Store TPL stats before propagation
    */
   AomTplGopStats extrc_tpl_gop_stats;
+
+  /*!
+   * If true, fills residual pixels outside the actual frame border.
+   */
+  bool do_border_pad;
 } AV1_COMP;

 /*!
@@ -4282,6 +4287,18 @@ static inline int get_mi_ext_idx(const int mi_row, const int mi_col,
   return mi_ext_row * mbmi_ext_stride + mi_ext_col;
 }

+static inline void set_pixels_to_frame_edge(MACROBLOCK *x, int bw, int bh,
+                                            int mi_col, int mi_row, int mi_cols,
+                                            int mi_rows, int frame_width,
+                                            int frame_height,
+                                            bool do_border_pad) {
+  int total_frame_width = do_border_pad ? frame_width : (mi_cols * 4);
+  int total_frame_height = do_border_pad ? frame_height : (mi_rows * 4);
+
+  x->pix_to_bottom_edge = total_frame_height - ((mi_row + bh) << MI_SIZE_LOG2);
+  x->pix_to_right_edge = total_frame_width - ((mi_col + bw) << MI_SIZE_LOG2);
+}
+
 // Lighter version of set_offsets that only sets the mode info
 // pointers.
 static inline void set_mode_info_offsets(
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 605e9687ad..30d3522c33 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -398,6 +398,7 @@ static inline int calc_wavelet_energy(const AV1EncoderConfig *oxcf) {
 typedef struct intra_pred_block_pass1_args {
   const SequenceHeader *seq_params;
   MACROBLOCK *x;
+  bool do_border_pad;
 } intra_pred_block_pass1_args;

 static inline void copy_rect(uint8_t *dst, int dstride, const uint8_t *src,
@@ -437,11 +438,13 @@ static void first_pass_intra_pred_and_calc_diff(int plane, int block,
       pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src,
       src_stride, dst, dst_stride, blk_col, blk_row, plane);

-  av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+  av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size, DCT_DCT,
+                   args->do_border_pad);
 }

 static void first_pass_predict_intra_block_for_luma_plane(
-    const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) {
+    const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize,
+    bool do_border_pad) {
   assert(bsize < BLOCK_SIZES_ALL);
   const MACROBLOCKD *const xd = &x->e_mbd;
   const int plane = AOM_PLANE_Y;
@@ -455,7 +458,7 @@ static void first_pass_predict_intra_block_for_luma_plane(
   const int src_stride = p->src.stride;
   const uint8_t *src = p->src.buf;

-  intra_pred_block_pass1_args args = { seq_params, x };
+  intra_pred_block_pass1_args args = { seq_params, x, do_border_pad };
   av1_foreach_transformed_block_in_plane(
       xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args);

@@ -518,6 +521,12 @@ static int firstpass_intra_prediction(
   set_mi_row_col(xd, tile, unit_row * unit_scale, mi_size_high[bsize],
                  unit_col * unit_scale, mi_size_wide[bsize], mi_params->mi_rows,
                  mi_params->mi_cols);
+
+  set_pixels_to_frame_edge(x, mi_size_wide[bsize], mi_size_high[bsize],
+                           unit_col * unit_scale, unit_row * unit_scale,
+                           mi_params->mi_cols, mi_params->mi_rows, cm->width,
+                           cm->height, cpi->do_border_pad);
+
   set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
   xd->mi[0]->segment_id = 0;
   xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
@@ -526,7 +535,8 @@ static int firstpass_intra_prediction(
   xd->mi[0]->skip_txfm = 0;

   if (cpi->sf.fp_sf.disable_recon)
-    first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize);
+    first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize,
+                                                  cpi->do_border_pad);
   else
     av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
   int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 4cefbab651..26e47c9add 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -619,8 +619,8 @@ static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
   mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha;
   int64_t cfl_cost;
   if (fast_mode) {
-    cfl_cost =
-        intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0);
+    cfl_cost = intra_model_rd(cm, x, plane, plane_bsize, tx_size,
+                              /*use_hadamard=*/0, cpi->do_border_pad);
   } else {
     av1_init_rd_stats(rd_stats);
     av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize,
@@ -1330,7 +1330,8 @@ int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
   }
   const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
   const int64_t this_model_rd =
-      intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+      intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1,
+                     cpi->do_border_pad);

   const int model_rd_index_for_pruning =
       get_model_rd_index_for_pruning(x, intra_sf);
@@ -1600,7 +1601,8 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,

     const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
     const int64_t this_model_rd =
-        intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+        intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1,
+                       cpi->do_border_pad);

     const int model_rd_index_for_pruning =
         get_model_rd_index_for_pruning(x, intra_sf);
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
index 720aec2a14..3947437aeb 100644
--- a/av1/encoder/intra_mode_search_utils.h
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -621,7 +621,8 @@ static inline int intra_mode_info_cost_uv(const AV1_COMP *cpi,
 // going through the whole txfm/quantize/itxfm process.
 static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x,
                               int plane, BLOCK_SIZE plane_bsize,
-                              TX_SIZE tx_size, int use_hadamard) {
+                              TX_SIZE tx_size, int use_hadamard,
+                              bool do_border_pad) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const BitDepthInfo bd_info = get_bit_depth_info(xd);
   int row, col;
@@ -644,9 +645,10 @@ static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x,
       // used in this for loop, therefore we don't need to properly add offset
       // to the buffers.
       av1_subtract_block(
-          bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize],
+          x, txbh, txbw, p->src_diff, block_size_wide[plane_bsize],
           p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride,
-          pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride);
+          pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride,
+          plane, plane_bsize, col, row, DCT_DCT, do_border_pad);
       av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff,
                      block_size_wide[plane_bsize], p->coeff);
       satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]);
@@ -672,8 +674,8 @@ static inline int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
   const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
   const int plane = 0;
   const AV1_COMMON *cm = &cpi->common;
-  const int64_t this_model_rd =
-      intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1);
+  const int64_t this_model_rd = intra_model_rd(
+      cm, x, plane, bsize, tx_size, /*use_hadamard=*/1, cpi->do_border_pad);
   if (*best_model_rd != INT64_MAX &&
       this_model_rd > *best_model_rd + (*best_model_rd >> 2)) {
     return 1;
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index 0e519bc54e..90d8429347 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -67,17 +67,62 @@ static int64_t calculate_sse(MACROBLOCKD *const xd,
   return sse;
 }

-static inline int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd,
-                                        int plane, const BLOCK_SIZE bsize) {
+static unsigned pixel_dist_visible_only(
+    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+    const int src_stride, const uint8_t *dst, const int dst_stride,
+    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+    int visible_cols) {
+  if (visible_rows == 0 || visible_cols == 0) return 0;
+
+  unsigned sse;
+  if (txb_rows == visible_rows && txb_cols == visible_cols) {
+    cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+    return sse;
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  const MACROBLOCKD *xd = &x->e_mbd;
+  if (is_cur_buf_hbd(xd)) {
+    uint64_t sse64;
+    if (!(visible_rows % 4) && !(visible_cols % 4)) {
+      sse64 = aom_highbd_sse(src, src_stride, dst, dst_stride, visible_cols,
+                             visible_rows);
+    } else {
+      sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+                                      visible_cols, visible_rows);
+    }
+    return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
+  }
+#else
+  (void)x;
+#endif
+  if (!(visible_rows % 4) && !(visible_cols % 4)) {
+    sse = (unsigned)aom_sse(src, src_stride, dst, dst_stride, visible_cols,
+                            visible_rows);
+  } else {
+    sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
+                           visible_rows);
+  }
+  return sse;
+}
+
+static inline int64_t compute_sse_plane(const AV1_COMP *cpi, MACROBLOCK *x,
+                                        MACROBLOCKD *xd, int plane,
+                                        const BLOCK_SIZE bsize) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
   int bw, bh;
   const struct macroblock_plane *const p = &x->plane[plane];
-  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
-                     &bh);
+  const int block_width = block_size_wide[plane_bsize];
+  const int block_height = block_size_high[plane_bsize];

-  int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+  get_visible_dimensions(x, plane, plane_bsize, 0, 0, block_width, block_height,
+                         &bw, &bh, true);
+
+  int64_t sse = pixel_dist_visible_only(
+      cpi, x, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+      plane_bsize, block_height, block_width, bh, bw);

   return sse;
 }
@@ -231,10 +276,16 @@ static inline void model_rd_for_sb_with_curvfit(
     int rate;
     int bw, bh;
     const struct macroblock_plane *const p = &x->plane[plane];
-    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
-                       &bw, &bh);
+    const int block_width = block_size_wide[plane_bsize];
+    const int block_height = block_size_high[plane_bsize];
+
+    get_visible_dimensions(x, plane, plane_bsize, 0, 0, block_width,
+                           block_height, &bw, &bh, true);
+
+    sse = pixel_dist_visible_only(cpi, x, p->src.buf, p->src.stride,
+                                  pd->dst.buf, pd->dst.stride, plane_bsize,
+                                  block_height, block_width, bh, bw);

-    sse = calculate_sse(xd, p, pd, bw, bh);
     model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
                           &dist);

diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index 4c6d4f8b4f..575ac98714 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -443,7 +443,6 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                 mbmi->mv[0].as_mv = best_mv->as_mv;
                 av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
                                               bsize, 0, 0);
-                av1_subtract_plane(x, bsize, 0);
                 RD_STATS this_rd_stats;
                 av1_init_rd_stats(&this_rd_stats);
                 av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize,
@@ -459,7 +458,6 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                 mbmi->mv[0].as_mv = this_best_mv;
                 av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
                                               bsize, 0, 0);
-                av1_subtract_plane(x, bsize, 0);
                 RD_STATS tmp_rd_stats;
                 av1_init_rd_stats(&tmp_rd_stats);
                 av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize,
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 09838846b8..f162ec154d 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -692,6 +692,10 @@ void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
   set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
                  cm->mi_params.mi_rows, cm->mi_params.mi_cols);

+  set_pixels_to_frame_edge(x, mi_width, mi_height, mi_col, mi_row,
+                           cm->mi_params.mi_cols, cm->mi_params.mi_rows,
+                           cm->width, cm->height, cpi->do_border_pad);
+
   // Set up source buffers.
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);

diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index ec3d06713e..706e08e316 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -879,9 +879,16 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x,
     const BLOCK_SIZE bs =
         get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     unsigned int sse;
+    int bw, bh;
+    const int block_width = block_size_wide[bs];
+    const int block_height = block_size_high[bs];

-    cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
-                            pd->dst.stride, &sse);
+    get_visible_dimensions(x, plane, bs, 0, 0, block_width, block_height, &bw,
+                           &bh, cpi->do_border_pad);
+
+    sse = pixel_dist_visible_only(cpi, x, p->src.buf, p->src.stride,
+                                  pd->dst.buf, pd->dst.stride, bs, block_height,
+                                  block_width, bh, bw);
     total_sse += sse;
     if (!plane && sse_y) *sse_y = sse;
   }
@@ -1964,7 +1971,7 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
     const BLOCK_SIZE plane_bsize =
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);

-    av1_subtract_plane(x, plane_bsize, plane);
+    av1_subtract_plane(x, plane_bsize, plane, cpi->do_border_pad);

     int64_t sse =
         av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL);
@@ -3941,7 +3948,7 @@ static inline void refine_winner_mode_tx(
         if (mbmi->motion_mode == OBMC_CAUSAL)
           av1_build_obmc_inter_predictors_sb(cm, xd);

-        av1_subtract_plane(x, bsize, 0);
+        av1_subtract_plane(x, bsize, PLANE_TYPE_Y, cpi->do_border_pad);
         if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
             !xd->lossless[mbmi->segment_id]) {
           av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 5063d34bb7..1474be981a 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -339,6 +339,47 @@ static inline void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
   if (width) *width = txb_width;
 }

+static inline int get_visible_dimensions(const MACROBLOCK *x, int plane,
+                                         BLOCK_SIZE plane_bsize, int blk_col,
+                                         int blk_row, int cols, int rows,
+                                         int *visible_cols, int *visible_rows,
+                                         bool use_crop_dim) {
+  if ((x->pix_to_bottom_edge >= 0 && x->pix_to_right_edge >= 0) ||
+      !use_crop_dim) {  // Fast path: neither edge negative, or crop off.
+    if (visible_cols != NULL && visible_rows != NULL) {  // Needs both ptrs.
+      *visible_rows = rows;
+      *visible_cols = cols;
+    }
+    return 0;  // 0: the whole cols x rows area counts as visible.
+  }
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  int valid_cols, valid_rows;
+  // Rows: clip to the bottom edge in this plane's subsampled units.
+  if (x->pix_to_bottom_edge >= 0) {
+    valid_rows = rows;
+  } else {  // Edge < 0: relies on arithmetic >> of a negative value below.
+    const int block_height = block_size_high[plane_bsize];
+    const int block_rows =
+        (x->pix_to_bottom_edge >> pd->subsampling_y) + block_height;
+    valid_rows = clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, rows);
+  }
+  // Columns: same clipping; blk_col is scaled by 1 << MI_SIZE_LOG2.
+  if (x->pix_to_right_edge >= 0) {
+    valid_cols = cols;
+  } else {  // Same arithmetic-shift assumption as for the rows.
+    const int block_width = block_size_wide[plane_bsize];
+    const int block_cols =
+        (x->pix_to_right_edge >> pd->subsampling_x) + block_width;
+    valid_cols = clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, cols);
+  }
+  if (visible_cols != NULL && visible_rows != NULL) {  // Needs both ptrs.
+    *visible_cols = valid_cols;
+    *visible_rows = valid_rows;
+  }
+  return (valid_cols < cols || valid_rows < rows);  // 1 iff clipped.
+}
+
 static inline int bsize_to_num_blk(BLOCK_SIZE bsize) {
   int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2);
   return num_blk;
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index c1691f9151..a9efb79872 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -212,6 +212,24 @@ void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi,
   tpl_data->prev_gop_arf_disp_order = -1;
 }

+static inline void tpl_subtract_block(BitDepthInfo bd_info, int rows, int cols,
+                                      int16_t *diff, ptrdiff_t diff_stride,
+                                      const uint8_t *src8, ptrdiff_t src_stride,
+                                      const uint8_t *pred8,
+                                      ptrdiff_t pred_stride) {
+  assert(rows >= 4 && cols >= 4);  // Debug-build check: at least 4x4.
+#if CONFIG_AV1_HIGHBITDEPTH  // High-bit-depth path is optional at build time.
+  if (bd_info.use_highbitdepth_buf) {  // Dispatch on the buffer flag.
+    aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+                              pred8, pred_stride);
+    return;  // Skip aom_subtract_block() below.
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  (void)bd_info;  // Unused when the high-bit-depth path is compiled out.
+  aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+                     pred_stride);
+}
+
 static inline int32_t tpl_get_satd_cost(BitDepthInfo bd_info, int16_t *src_diff,
                                         int diff_stride, const uint8_t *src,
                                         int src_stride, const uint8_t *dst,
@@ -219,7 +237,7 @@ static inline int32_t tpl_get_satd_cost(BitDepthInfo bd_info, int16_t *src_diff,
                                         int bw, int bh, TX_SIZE tx_size) {
   const int pix_num = bw * bh;

-  av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+  tpl_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
                      dst, dst_stride);
   av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
   return aom_satd(coeff, pix_num);
@@ -247,7 +265,7 @@ static inline void txfm_quant_rdcost(
   const MACROBLOCKD *xd = &x->e_mbd;
   const BitDepthInfo bd_info = get_bit_depth_info(xd);
   uint16_t eob;
-  av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+  tpl_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
                      dst, dst_stride);
   av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);

diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index b23ed42073..48ced2426d 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -127,9 +127,10 @@ int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
                             const BLOCK_SIZE tx_bsize,
                             unsigned int *block_mse_q8) {
   int visible_rows, visible_cols;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
-                     NULL, &visible_cols, &visible_rows);
+  const int txb_cols = block_size_wide[tx_bsize];
+  const int txb_rows = block_size_high[tx_bsize];
+  get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, txb_cols,
+                         txb_rows, &visible_cols, &visible_rows, true);
   const int diff_stride = block_size_wide[plane_bsize];
   const int16_t *diff = x->plane[plane].src_diff;

@@ -141,7 +142,7 @@ int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
       *block_mse_q8 =
           (unsigned int)((256 * sse) / (visible_cols * visible_rows));
     else
-      *block_mse_q8 = UINT_MAX;
+      *block_mse_q8 = 0;
   }
   return sse;
 }
@@ -153,9 +154,10 @@ static inline int64_t pixel_diff_stats(
     const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
     unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) {
   int visible_rows, visible_cols;
-  const MACROBLOCKD *xd = &x->e_mbd;
-  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
-                     NULL, &visible_cols, &visible_rows);
+  const int txb_cols = block_size_wide[tx_bsize];
+  const int txb_rows = block_size_high[tx_bsize];
+  get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, txb_cols,
+                         txb_rows, &visible_cols, &visible_rows, true);
   const int diff_stride = block_size_wide[plane_bsize];
   const int16_t *diff = x->plane[plane].src_diff;

@@ -172,7 +174,9 @@ static inline int64_t pixel_diff_stats(
     *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse));
     *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum));
   } else {
-    *block_mse_q8 = UINT_MAX;
+    *block_mse_q8 = 0;
+    *block_var = 0;
+    *per_px_mean = 0;
   }
   return sse;
 }
@@ -966,33 +970,6 @@ static inline void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   }
 }

-static unsigned pixel_dist_visible_only(
-    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
-    const int src_stride, const uint8_t *dst, const int dst_stride,
-    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
-    int visible_cols) {
-  unsigned sse;
-
-  if (txb_rows == visible_rows && txb_cols == visible_cols) {
-    cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
-    return sse;
-  }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-  const MACROBLOCKD *xd = &x->e_mbd;
-  if (is_cur_buf_hbd(xd)) {
-    uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
-                                             visible_cols, visible_rows);
-    return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
-  }
-#else
-  (void)x;
-#endif
-  sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
-                         visible_rows);
-  return sse;
-}
-
 // Compute the pixel domain distortion from src and dst on all visible 4x4s in
 // the
 // transform block.
@@ -1003,10 +980,11 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
                            const BLOCK_SIZE plane_bsize,
                            const BLOCK_SIZE tx_bsize) {
   int txb_rows, txb_cols, visible_rows, visible_cols;
-  const MACROBLOCKD *xd = &x->e_mbd;
+  txb_cols = block_size_wide[tx_bsize];
+  txb_rows = block_size_high[tx_bsize];

-  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
-                     &txb_cols, &txb_rows, &visible_cols, &visible_rows);
+  get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, txb_cols,
+                         txb_rows, &visible_cols, &visible_rows, true);
   assert(visible_rows > 0);
   assert(visible_cols > 0);

@@ -2118,6 +2096,16 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
   const bool predict_dc_block =
       txfm_params->predict_dc_level >= 1 && txw != 64 && txh != 64;
   int64_t per_px_mean = INT64_MAX;
+
+  int is_border_block = 0;
+  if (cpi->do_border_pad) {
+    is_border_block = get_visible_dimensions(
+        x, plane, plane_bsize, blk_col, blk_row, txw, txh, NULL, NULL, true);
+    if (is_border_block)
+      av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
+                       best_tx_type, cpi->do_border_pad);
+  }
+
   if (predict_dc_block) {
     predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row,
                           blk_col, best_rd_stats, &block_sse, &block_mse_q8,
@@ -2209,6 +2197,10 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);

+    if (is_border_block)
+      av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
+                       tx_type, cpi->do_border_pad);
+
     if (!dc_only_blk)
       av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
     else
@@ -2381,6 +2373,10 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
     best_rd_stats->sse = block_sse;
   }

+  if (is_border_block)
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
+                     best_tx_type, cpi->do_border_pad);
+
   // Intra mode needs decoded pixels such that the next transform block
   // can use them for prediction.
   recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
@@ -3083,7 +3079,8 @@ static inline void block_rd_txfm(int plane, int block, int blk_row, int blk_col,

   if (!is_inter) {
     av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
-    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size, DCT_DCT,
+                     false);
 #if !CONFIG_REALTIME_ONLY
     const TxfmSearchParams *const txfm_params = &x->txfm_search_params;
     if (txfm_params->enable_nn_prune_intra_tx_depths) {
@@ -3194,6 +3191,8 @@ int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
       QUANT_PARAM quant_param;
       av1_setup_xform(&cpi->common, x, tx_size, DCT_DCT, &txfm_param);
       av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, 0, &quant_param);
+      av1_subtract_txb(x, PLANE_TYPE_Y, bs, blk_col, blk_row, tx_size, DCT_DCT,
+                       cpi->do_border_pad);

       av1_xform(x, 0, i, blk_row, blk_col, bs, &txfm_param);
       av1_quant(x, 0, i, &txfm_param, &quant_param);
@@ -3709,7 +3708,7 @@ int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,

   if (is_inter) {
     for (int plane = 1; plane < MAX_MB_PLANE; ++plane)
-      av1_subtract_plane(x, plane_bsize, plane);
+      av1_subtract_plane(x, plane_bsize, plane, cpi->do_border_pad);
   }

   const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
@@ -3820,7 +3819,7 @@ int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   rd_stats->rate = mode_rate;

   // cost and distortion
-  av1_subtract_plane(x, bsize, 0);
+  av1_subtract_plane(x, bsize, PLANE_TYPE_Y, cpi->do_border_pad);
   if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
       !xd->lossless[mbmi->segment_id]) {
     av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);