Commit f1e7f4e023 for aom

commit f1e7f4e023d8dd0a60ee8bb26ebdb73f91d0a20e
Author: Deepa K G <deepa.kg@ittiam.com>
Date:   Fri Jun 26 17:55:36 2026 +0530

    Avoid SSE calculation for blocks outside frame boundary

    When a frame dimension is not a multiple of 8, the
    function `aom_sum_squares_2d_i16()` can be called with
    width = 0 or height = 0. However, the SSE2 and ARM
    optimizations of `aom_sum_squares_2d_i16()` require
    that the width and height be non-zero. Thus, in this
    patch `aom_sum_squares_2d_i16()` is called only for
    non-zero values of width and height. The call to
    `aom_sum_sse_2d_i16()` in pixel_diff_stats() is
    guarded in the same way.

    Also, the following suggestions from
    https://aomedia-review.googlesource.com/c/aom/+/213801
    are incorporated:
    - Setting of 'visible_cols' and 'visible_rows' in
      get_visible_dimensions() is corrected for odd frame
      dimensions.
    - Code documentation is improved.

    The flag 'do_border_pad' is not yet enabled.

    Bug: 527078408, 527242002

    Change-Id: Ie5b79b5c7ce0ac94c5609b2f2b28fc82920a3c35

diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index f37a8259cc..a7e0585a2c 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -72,8 +72,9 @@

 #define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))

+/* This macro requires d > 0. */
 #define DIVIDE_AND_ROUND_SIGNED(n, d) \
-  ((((n) < 0) ^ ((d) < 0)) ? (((n) - (d) / 2) / (d)) : (((n) + (d) / 2) / (d)))
+  (((n) < 0) ? (((n) - (d) / 2) / (d)) : (((n) + (d) / 2) / (d)))

 #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index ffda806a08..9e49fc8fa6 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1439,10 +1439,20 @@ typedef struct macroblock {
   RD_STATS *rdcost;
 #endif  // CONFIG_PARTITION_SEARCH_ORDER

-  /*! \brief Distance from bottom edge of the frame in pixels. */
+  /*! \brief Signed vertical distance, in pixels, from the bottom edge of the
+   *  current prediction block to the bottom edge of the frame.
+   *
+   *  The value may be negative when the prediction block extends beyond the
+   *  bottom edge of the frame.
+   */
   int pix_to_bottom_edge;

-  /*! \brief Distance from right edge of the frame in pixels. */
+  /*! \brief Signed horizontal distance, in pixels, from the right edge of the
+   *  current prediction block to the right edge of the frame.
+   *
+   *  The value may be negative when the prediction block extends beyond the
+   *  right edge of the frame.
+   */
   int pix_to_right_edge;
 } MACROBLOCK;
 #undef SINGLE_REF_MODES
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 7cd3a0273c..a6d255dcd4 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -38,46 +38,42 @@
 // Compute the average value of the wxh block.
 static inline int16_t avg_wxh_block_c(int16_t *diff, ptrdiff_t diff_stride,
                                       int w, int h) {
+  assert(w > 0 && h > 0);
   int32_t sum = 0;
   for (int row = 0; row < h; ++row) {
     for (int col = 0; col < w; ++col) {
-      sum += *(diff + row * diff_stride + col);
+      sum += diff[row * diff_stride + col];
     }
   }
-  return (w * h > 0) ? (int16_t)(DIVIDE_AND_ROUND_SIGNED(sum, w * h)) : 0;
+  return (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, w * h);
 }

 // Compute the row average value of the wxh block.
 static inline void avg_wxh_block_horiz_c(int16_t *diff, ptrdiff_t diff_stride,
                                          int w, int h, int16_t *out) {
+  assert(w > 0 && h > 0);
   for (int row = 0; row < h; ++row) {
     int32_t sum = 0;
     for (int col = 0; col < w; ++col) {
-      sum += *(diff + row * diff_stride + col);
+      sum += diff[row * diff_stride + col];
     }
-    out[row] = (w > 0) ? (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, w) : 0;
+    out[row] = (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, w);
   }
 }

 // Compute the column average value of the wxh block.
 static inline void avg_wxh_block_vert_c(int16_t *diff, ptrdiff_t diff_stride,
                                         int w, int h, int16_t *out) {
+  assert(w > 0 && h > 0);
   for (int col = 0; col < w; ++col) {
     int32_t sum = 0;
     for (int row = 0; row < h; ++row) {
-      sum += *(diff + row * diff_stride + col);
+      sum += diff[row * diff_stride + col];
     }
-    out[col] = (h > 0) ? (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, h) : 0;
+    out[col] = (int16_t)DIVIDE_AND_ROUND_SIGNED(sum, h);
   }
 }

-static inline void *aom_memset_int16(void *dest, int16_t val, size_t length) {
-  size_t i;
-  int16_t *dest16 = (int16_t *)dest;
-  for (i = 0; i < length; i++) *dest16++ = val;
-  return dest;
-}
-
 // Fill the outside-frame part's residues with values derived from the in-frame
 // part's residues.
 static inline void fill_residue_outside_frame(
@@ -95,12 +91,11 @@ static inline void fill_residue_outside_frame(
     // Fill the remaining parts of the block with the average value
     const int right_pixels = tx_cols - visible_tx_cols;
     for (int i = 0; i < tx_rows; ++i) {
-      aom_memset_int16(diff + i * diff_stride + visible_tx_cols, avg,
-                       right_pixels);
+      aom_memset16(diff + i * diff_stride + visible_tx_cols, avg, right_pixels);
     }

     for (int i = visible_tx_rows; i < tx_rows; ++i) {
-      aom_memset_int16(diff + i * diff_stride, avg, visible_tx_cols);
+      aom_memset16(diff + i * diff_stride, avg, visible_tx_cols);
     }
   } else if (htx_tab[tx_type] == IDTX_1D) {
     if (visible_tx_rows < tx_rows) {
@@ -134,8 +129,8 @@ static inline void fill_residue_outside_frame(
                               visible_tx_rows, out);

       for (int i = 0; i < visible_tx_rows; ++i) {
-        aom_memset_int16(diff + i * diff_stride + visible_tx_cols, out[i],
-                         right_pixels);
+        aom_memset16(diff + i * diff_stride + visible_tx_cols, out[i],
+                     right_pixels);
       }
     }

@@ -170,9 +165,9 @@ void av1_subtract_block(const MACROBLOCK *x, int rows, int cols, int16_t *diff,
   if (!do_border_pad) return;

   int visible_cols, visible_rows;
-  const int is_border_block =
-      get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, cols,
-                             rows, &visible_cols, &visible_rows, true);
+  const int is_border_block = get_visible_dimensions(
+      x, plane, plane_bsize, blk_col, blk_row, cols, rows,
+      /*clip_dims=*/true, &visible_cols, &visible_rows);
   if (is_border_block)
     fill_residue_outside_frame(diff, diff_stride, cols, rows, visible_cols,
                                visible_rows, tx_type);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 9505d0af53..9aa8ca0f4a 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -4287,16 +4287,26 @@ static inline int get_mi_ext_idx(const int mi_row, const int mi_col,
   return mi_ext_row * mbmi_ext_stride + mi_ext_col;
 }

+// Computes the signed distances from the bottom and right edges of the current
+// prediction block to the corresponding edges of the frame.
 static inline void set_pixels_to_frame_edge(MACROBLOCK *x, int bw, int bh,
                                             int mi_col, int mi_row, int mi_cols,
                                             int mi_rows, int frame_width,
                                             int frame_height,
                                             bool do_border_pad) {
-  int total_frame_width = do_border_pad ? frame_width : (mi_cols * 4);
-  int total_frame_height = do_border_pad ? frame_height : (mi_rows * 4);
-
-  x->pix_to_bottom_edge = total_frame_height - ((mi_row + bh) << MI_SIZE_LOG2);
-  x->pix_to_right_edge = total_frame_width - ((mi_col + bw) << MI_SIZE_LOG2);
+  // For do_border_pad = 1, compute distances using the actual frame
+  // dimensions.
+  // For do_border_pad = 0, compute distances using the frame dimensions
+  // aligned to a multiple of 8 pixels to match the dimensions represented
+  // by mi_cols and mi_rows, which are rounded up to multiples of 8 pixels.
+  int boundary_frame_width =
+      do_border_pad ? frame_width : (mi_cols << MI_SIZE_LOG2);
+  int boundary_frame_height =
+      do_border_pad ? frame_height : (mi_rows << MI_SIZE_LOG2);
+
+  x->pix_to_bottom_edge =
+      boundary_frame_height - ((mi_row + bh) << MI_SIZE_LOG2);
+  x->pix_to_right_edge = boundary_frame_width - ((mi_col + bw) << MI_SIZE_LOG2);
 }

 // Lighter version of set_offsets that only sets the mode info
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index 90d8429347..d792b8ea0d 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -118,7 +118,7 @@ static inline int64_t compute_sse_plane(const AV1_COMP *cpi, MACROBLOCK *x,
   const int block_height = block_size_high[plane_bsize];

   get_visible_dimensions(x, plane, plane_bsize, 0, 0, block_width, block_height,
-                         &bw, &bh, true);
+                         /*clip_dims=*/true, &bw, &bh);

   int64_t sse = pixel_dist_visible_only(
       cpi, x, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
@@ -280,7 +280,7 @@ static inline void model_rd_for_sb_with_curvfit(
     const int block_height = block_size_high[plane_bsize];

     get_visible_dimensions(x, plane, plane_bsize, 0, 0, block_width,
-                           block_height, &bw, &bh, true);
+                           block_height, /*clip_dims=*/true, &bw, &bh);

     sse = pixel_dist_visible_only(cpi, x, p->src.buf, p->src.stride,
                                   pd->dst.buf, pd->dst.stride, plane_bsize,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 97174d75da..a11d58e4d0 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -883,8 +883,8 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x,
     const int block_width = block_size_wide[bs];
     const int block_height = block_size_high[bs];

-    get_visible_dimensions(x, plane, bs, 0, 0, block_width, block_height, &bw,
-                           &bh, cpi->do_border_pad);
+    get_visible_dimensions(x, plane, bs, 0, 0, block_width, block_height,
+                           cpi->do_border_pad, &bw, &bh);

     sse = pixel_dist_visible_only(cpi, x, p->src.buf, p->src.stride,
                                   pd->dst.buf, pd->dst.stride, bs, block_height,
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 9e93fa246b..1c50cb4b3d 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -339,13 +339,31 @@ static inline void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
   if (width) *width = txb_width;
 }

+/*!
+ * \brief Computes the effective dimensions of a block.
+ *
+ * \param[in]    x             Pointer to structure holding the data for the
+                               current encoding macroblock
+ * \param[in]    plane         The index of the current plane
+ * \param[in]    plane_bsize   Block size for the current plane
+ * \param[in]    blk_col       Column offset of the transform block, in MI units
+ * \param[in]    blk_row       Row offset of the transform block, in MI units
+ * \param[in]    cols          Transform block width, in pixels
+ * \param[in]    rows          Transform block height, in pixels
+ * \param[in]    clip_dims     If false, returns the original block dimensions
+ *                             If true, clips the block dimensions so they lie
+ *                             within the valid frame extent
+ * \param[out]   visible_cols  Pointer to the effective block width, in pixels
+ * \param[out]   visible_rows  Pointer to the effective block height, in pixels
+ *
+ * \return 1 if the block dimensions were clipped; otherwise 0.
+ */
 static inline int get_visible_dimensions(const MACROBLOCK *x, int plane,
                                          BLOCK_SIZE plane_bsize, int blk_col,
                                          int blk_row, int cols, int rows,
-                                         int *visible_cols, int *visible_rows,
-                                         bool use_crop_dim) {
-  if ((x->pix_to_bottom_edge >= 0 && x->pix_to_right_edge >= 0) ||
-      !use_crop_dim) {
+                                         bool clip_dims, int *visible_cols,
+                                         int *visible_rows) {
+  if ((x->pix_to_bottom_edge >= 0 && x->pix_to_right_edge >= 0) || !clip_dims) {
     if (visible_cols != NULL && visible_rows != NULL) {
       *visible_rows = rows;
       *visible_cols = cols;
@@ -361,7 +379,8 @@ static inline int get_visible_dimensions(const MACROBLOCK *x, int plane,
   } else {
     const int block_height = block_size_high[plane_bsize];
     const int block_rows =
-        (x->pix_to_bottom_edge >> pd->subsampling_y) + block_height;
+        ROUND_POWER_OF_TWO_SIGNED(x->pix_to_bottom_edge, pd->subsampling_y) +
+        block_height;
     valid_rows = clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, rows);
   }

@@ -370,7 +389,8 @@ static inline int get_visible_dimensions(const MACROBLOCK *x, int plane,
   } else {
     const int block_width = block_size_wide[plane_bsize];
     const int block_cols =
-        (x->pix_to_right_edge >> pd->subsampling_x) + block_width;
+        ROUND_POWER_OF_TWO_SIGNED(x->pix_to_right_edge, pd->subsampling_x) +
+        block_width;
     valid_cols = clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, cols);
   }
   if (visible_cols != NULL && visible_rows != NULL) {
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index df78888ea0..2fe835b891 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -130,19 +130,22 @@ int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
   const int txb_cols = block_size_wide[tx_bsize];
   const int txb_rows = block_size_high[tx_bsize];
   get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, txb_cols,
-                         txb_rows, &visible_cols, &visible_rows, true);
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *diff = x->plane[plane].src_diff;
+                         txb_rows, /*clip_dims=*/true, &visible_cols,
+                         &visible_rows);

-  diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
-  uint64_t sse =
-      aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
-  if (block_mse_q8 != NULL) {
-    if (visible_cols > 0 && visible_rows > 0)
+  uint64_t sse = 0;
+  if (visible_cols > 0 && visible_rows > 0) {
+    const int diff_stride = block_size_wide[plane_bsize];
+    const int16_t *diff = x->plane[plane].src_diff;
+
+    diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+    sse = aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
+    if (block_mse_q8 != NULL) {
       *block_mse_q8 =
           (unsigned int)((256 * sse) / (visible_cols * visible_rows));
-    else
-      *block_mse_q8 = 0;
+    }
+  } else {
+    if (block_mse_q8 != NULL) *block_mse_q8 = 0;
   }
   return sse;
 }
@@ -157,15 +160,18 @@ static inline int64_t pixel_diff_stats(
   const int txb_cols = block_size_wide[tx_bsize];
   const int txb_rows = block_size_high[tx_bsize];
   get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, txb_cols,
-                         txb_rows, &visible_cols, &visible_rows, true);
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *diff = x->plane[plane].src_diff;
+                         txb_rows, /*clip_dims=*/true, &visible_cols,
+                         &visible_rows);

-  diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
   uint64_t sse = 0;
-  int sum = 0;
-  sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
   if (visible_cols > 0 && visible_rows > 0) {
+    const int diff_stride = block_size_wide[plane_bsize];
+    const int16_t *diff = x->plane[plane].src_diff;
+
+    diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+    int sum = 0;
+    sse =
+        aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
     double norm_factor = 1.0 / (visible_cols * visible_rows);
     int sign_sum = sum > 0 ? 1 : -1;
     // Conversion to transform domain
@@ -984,7 +990,8 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
   txb_rows = block_size_high[tx_bsize];

   get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, txb_cols,
-                         txb_rows, &visible_cols, &visible_rows, true);
+                         txb_rows, /*clip_dims=*/true, &visible_cols,
+                         &visible_rows);
   assert(visible_rows > 0);
   assert(visible_cols > 0);

@@ -2099,8 +2106,9 @@ static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,

   int is_border_block = 0;
   if (cpi->do_border_pad) {
-    is_border_block = get_visible_dimensions(
-        x, plane, plane_bsize, blk_col, blk_row, txw, txh, NULL, NULL, true);
+    is_border_block =
+        get_visible_dimensions(x, plane, plane_bsize, blk_col, blk_row, txw,
+                               txh, /*clip_dims=*/true, NULL, NULL);
     if (is_border_block)
       av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size,
                        best_tx_type, cpi->do_border_pad);