Commit d7f8c2fbaf for aom
commit d7f8c2fbafa212880c987fc2be1f51d326144560
Author: Gerda Zsejke More <gerdazsejke.more@arm.com>
Date: Mon Oct 20 09:11:40 2025 +0200
Move av1_warp_affine_common impl from warp_plane_neon.h to warp_plane_neon.c
The av1_warp_affine_common implementation is only used by
av1_warp_affine_neon, so move it into that function.
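Delete warp_affine_horizontal as well, as it is no longer a common
function. Its early-exit handling for ix4 <= -7 and ix4 >= width + 6
stays in the header as warp_affine_special_case, which the Neon,
Neon I8MM and SVE horizontal passes now call.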
Delete horizontal_filter_4x1_f1_beta0 and
horizontal_filter_8x1_f1_beta0 in warp_plane_neon_i8mm.c and
warp_plane_sve.c, folding their bodies into horizontal_filter_4x1_f1
and horizontal_filter_8x1_f1, as the header file no longer requires
them.
Change-Id: I200f6fa15e6babacff976e079938e318529425ae
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c
index 497273bc65..f4c1377211 100644
--- a/av1/common/arm/warp_plane_neon.c
+++ b/av1/common/arm/warp_plane_neon.c
@@ -272,13 +272,106 @@ static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src,
*res_high = horizontal_add_4d_s32x4(m4567_pairs);
}
+static AOM_FORCE_INLINE void warp_affine_horizontal_neon(
+ const uint8_t *ref, int width, int height, int stride, int p_width,
+ int p_height, int16_t alpha, int16_t beta, const int64_t x4,
+ const int64_t y4, const int i, int16x8_t tmp[]) {
+ const int height_limit = AOMMIN(8, p_height - i) + 7;
+
+ int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+ int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ if (warp_affine_special_case(ref, ix4, iy4, width, height, stride,
+ height_limit, tmp)) {
+ return;
+ }
+
+ static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 };
+ const uint8x16_t indx = vld1q_u8(kIotaArr);
+
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ if (p_width == 4) {
+ if (beta == 0) {
+ if (alpha == 0) {
+ int16x8_t f_s16 =
+ vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16);
+ } else {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha);
+ }
+ } else {
+ if (alpha == 0) {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1,
+ (sx4 + beta * (k - 3)));
+ } else {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)),
+ alpha);
+ }
+ }
+ } else {
+ if (beta == 0) {
+ if (alpha == 0) {
+ int16x8_t f_s16 =
+ vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16);
+ } else {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha);
+ }
+ } else {
+ if (alpha == 0) {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1,
+ (sx4 + beta * (k - 3)));
+ } else {
+ APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)),
+ alpha);
+ }
+ }
+ }
+}
+
void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
int height, int stride, uint8_t *pred, int p_col,
int p_row, int p_width, int p_height, int p_stride,
int subsampling_x, int subsampling_y,
ConvolveParams *conv_params, int16_t alpha,
int16_t beta, int16_t gamma, int16_t delta) {
- av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row,
- p_width, p_height, p_stride, subsampling_x,
- subsampling_y, conv_params, alpha, beta, gamma, delta);
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const int is_compound = conv_params->is_compound;
+ uint16_t *const dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int do_average = conv_params->do_average;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+ assert(IMPLIES(is_compound, dst != NULL));
+ assert(IMPLIES(do_average, is_compound));
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int64_t dst_x =
+ (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+ const int64_t dst_y =
+ (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+
+ const int64_t x4 = dst_x >> subsampling_x;
+ const int64_t y4 = dst_y >> subsampling_y;
+
+ int16x8_t tmp[15];
+ warp_affine_horizontal_neon(ref, width, height, stride, p_width, p_height,
+ alpha, beta, x4, y4, i, tmp);
+ warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst,
+ dst_stride, do_average, use_dist_wtd_comp_avg, gamma,
+ delta, y4, i, j, tmp, w0, w1);
+ }
+ }
}
diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h
index 2909df7b7f..6c50c41f45 100644
--- a/av1/common/arm/warp_plane_neon.h
+++ b/av1/common/arm/warp_plane_neon.h
@@ -24,24 +24,6 @@
#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in,
- int sx, int alpha);
-
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in,
- int sx, int alpha);
-
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
- int sx);
-
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
- int sx);
-
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16);
-
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16);
-
static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
int32x4_t *res, int sy);
@@ -95,21 +77,12 @@ static AOM_FORCE_INLINE int clamp_iy(int iy, int height) {
return clamp(iy, 0, height - 1);
}
-static AOM_FORCE_INLINE void warp_affine_horizontal(
- const uint8_t *ref, int width, int height, int stride, int p_width,
- int p_height, int16_t alpha, int16_t beta, const int64_t x4,
- const int64_t y4, const int i, int16x8_t tmp[]) {
+static inline bool warp_affine_special_case(const uint8_t *ref, int32_t ix4,
+ int32_t iy4, int width, int height,
+ int stride, const int height_limit,
+ int16x8_t tmp[]) {
const int bd = 8;
const int reduce_bits_horiz = ROUND0_BITS;
- const int height_limit = AOMMIN(8, p_height - i) + 7;
-
- int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
- int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
-
- int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
- sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
if (ix4 <= -7) {
for (int k = 0; k < height_limit; ++k) {
@@ -119,7 +92,7 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
tmp[k] = vdupq_n_s16(dup_val);
}
- return;
+ return true;
} else if (ix4 >= width + 6) {
for (int k = 0; k < height_limit; ++k) {
int iy = clamp_iy(iy4 + k - 7, height);
@@ -128,15 +101,11 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
(1 << (FILTER_BITS - reduce_bits_horiz));
tmp[k] = vdupq_n_s16(dup_val);
}
- return;
+ return true;
}
- static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15 };
- const uint8x16_t indx = vld1q_u8(kIotaArr);
-
- const int out_of_boundary_left = -(ix4 - 6);
- const int out_of_boundary_right = (ix4 + 8) - width;
+ return false;
+}
#define APPLY_HORIZONTAL_SHIFT(fn, ...) \
do { \
@@ -172,45 +141,6 @@ static AOM_FORCE_INLINE void warp_affine_horizontal(
} \
} while (0)
- if (p_width == 4) {
- if (beta == 0) {
- if (alpha == 0) {
- int16x8_t f_s16 =
- vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
- APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16);
- } else {
- APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha);
- }
- } else {
- if (alpha == 0) {
- APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1,
- (sx4 + beta * (k - 3)));
- } else {
- APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)),
- alpha);
- }
- }
- } else {
- if (beta == 0) {
- if (alpha == 0) {
- int16x8_t f_s16 =
- vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]);
- APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16);
- } else {
- APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha);
- }
- } else {
- if (alpha == 0) {
- APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1,
- (sx4 + beta * (k - 3)));
- } else {
- APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)),
- alpha);
- }
- }
- }
-}
-
static AOM_FORCE_INLINE void warp_affine_vertical(
uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound,
uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg,
@@ -339,43 +269,4 @@ static AOM_FORCE_INLINE void warp_affine_vertical(
}
}
-static AOM_FORCE_INLINE void av1_warp_affine_common(
- const int32_t *mat, const uint8_t *ref, int width, int height, int stride,
- uint8_t *pred, int p_col, int p_row, int p_width, int p_height,
- int p_stride, int subsampling_x, int subsampling_y,
- ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma,
- int16_t delta) {
- const int w0 = conv_params->fwd_offset;
- const int w1 = conv_params->bck_offset;
- const int is_compound = conv_params->is_compound;
- uint16_t *const dst = conv_params->dst;
- const int dst_stride = conv_params->dst_stride;
- const int do_average = conv_params->do_average;
- const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
-
- assert(IMPLIES(is_compound, dst != NULL));
- assert(IMPLIES(do_average, is_compound));
-
- for (int i = 0; i < p_height; i += 8) {
- for (int j = 0; j < p_width; j += 8) {
- const int32_t src_x = (p_col + j + 4) << subsampling_x;
- const int32_t src_y = (p_row + i + 4) << subsampling_y;
- const int64_t dst_x =
- (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
- const int64_t dst_y =
- (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
-
- const int64_t x4 = dst_x >> subsampling_x;
- const int64_t y4 = dst_y >> subsampling_y;
-
- int16x8_t tmp[15];
- warp_affine_horizontal(ref, width, height, stride, p_width, p_height,
- alpha, beta, x4, y4, i, tmp);
- warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst,
- dst_stride, do_average, use_dist_wtd_comp_avg, gamma,
- delta, y4, i, j, tmp, w0, w1);
- }
- }
-}
-
#endif // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_
diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c
index 2d02974527..44689e96b2 100644
--- a/av1/common/arm/warp_plane_neon_i8mm.c
+++ b/av1/common/arm/warp_plane_neon_i8mm.c
@@ -143,10 +143,11 @@ static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1_8tap_beta0(
return vreinterpretq_s16_u16(res);
}
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
+static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
+ int sx) {
const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+ int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
@@ -166,12 +167,6 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
return vreinterpretq_s16_u16(res);
}
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
- int sx) {
- int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
- return horizontal_filter_4x1_f1_beta0(in, f_s16);
-}
-
static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_6tap_beta0(
const uint8x16_t in, const int8x16_t filter, const uint8x16x2_t perm_tbl) {
const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
@@ -219,10 +214,11 @@ static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_8tap_beta0(
return vreinterpretq_s16_u16(res);
}
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
+static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
+ int sx) {
const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+ int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
@@ -248,12 +244,6 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
return vreinterpretq_s16_u16(res);
}
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
- int sx) {
- int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
- return horizontal_filter_8x1_f1_beta0(in, f_s16);
-}
-
static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
int32x4_t *res, int sy) {
int16x4_t s0 = vget_low_s16(src[0]);
@@ -387,8 +377,6 @@ static AOM_FORCE_INLINE void warp_affine_horizontal_neon_i8mm(
const uint8_t *ref, int width, int height, int stride, int p_width,
int p_height, int16_t alpha, int16_t beta, const int64_t x4,
const int64_t y4, const int i, int16x8_t tmp[]) {
- const int bd = 8;
- const int reduce_bits_horiz = ROUND0_BITS;
const int height_limit = AOMMIN(8, p_height - i) + 7;
int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
@@ -399,23 +387,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal_neon_i8mm(
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
- if (ix4 <= -7) {
- for (int k = 0; k < height_limit; ++k) {
- int iy = clamp_iy(iy4 + k - 7, height);
- int16_t dup_val =
- (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
- tmp[k] = vdupq_n_s16(dup_val);
- }
- return;
- } else if (ix4 >= width + 6) {
- for (int k = 0; k < height_limit; ++k) {
- int iy = clamp_iy(iy4 + k - 7, height);
- int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride + (width - 1)] *
- (1 << (FILTER_BITS - reduce_bits_horiz));
- tmp[k] = vdupq_n_s16(dup_val);
- }
+ if (warp_affine_special_case(ref, ix4, iy4, width, height, stride,
+ height_limit, tmp)) {
return;
}
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c
index 455e29d124..885ffe80da 100644
--- a/av1/common/arm/warp_plane_sve.c
+++ b/av1/common/arm/warp_plane_sve.c
@@ -146,10 +146,11 @@ static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1_8tap_beta0(
return vreinterpretq_s16_u16(res);
}
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
+static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
+ int sx) {
const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+ int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
@@ -169,12 +170,6 @@ horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
return vreinterpretq_s16_u16(res);
}
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in,
- int sx) {
- int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
- return horizontal_filter_4x1_f1_beta0(in, f_s16);
-}
-
static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_6tap_beta0(
const uint8x16_t in, const int8x16_t filter, const uint8x16x2_t perm_tbl) {
const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
@@ -222,10 +217,11 @@ static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_8tap_beta0(
return vreinterpretq_s16_u16(res);
}
-static AOM_FORCE_INLINE int16x8_t
-horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
+static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
+ int sx) {
const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1));
+ int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16));
uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]);
@@ -251,12 +247,6 @@ horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) {
return vreinterpretq_s16_u16(res);
}
-static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in,
- int sx) {
- int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]);
- return horizontal_filter_8x1_f1_beta0(in, f_s16);
-}
-
static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src,
int32x4_t *res, int sy) {
int16x4_t s0 = vget_low_s16(src[0]);
@@ -381,8 +371,6 @@ static AOM_FORCE_INLINE void warp_affine_horizontal_sve(
const uint8_t *ref, int width, int height, int stride, int p_width,
int p_height, int16_t alpha, int16_t beta, const int64_t x4,
const int64_t y4, const int i, int16x8_t tmp[]) {
- const int bd = 8;
- const int reduce_bits_horiz = ROUND0_BITS;
const int height_limit = AOMMIN(8, p_height - i) + 7;
int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
@@ -393,23 +381,8 @@ static AOM_FORCE_INLINE void warp_affine_horizontal_sve(
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
- if (ix4 <= -7) {
- for (int k = 0; k < height_limit; ++k) {
- int iy = clamp_iy(iy4 + k - 7, height);
- int16_t dup_val =
- (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
- tmp[k] = vdupq_n_s16(dup_val);
- }
- return;
- } else if (ix4 >= width + 6) {
- for (int k = 0; k < height_limit; ++k) {
- int iy = clamp_iy(iy4 + k - 7, height);
- int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
- ref[iy * stride + (width - 1)] *
- (1 << (FILTER_BITS - reduce_bits_horiz));
- tmp[k] = vdupq_n_s16(dup_val);
- }
+ if (warp_affine_special_case(ref, ix4, iy4, width, height, stride,
+ height_limit, tmp)) {
return;
}