Commit 3d57983ce3 for aom

commit 3d57983ce3968415c20ec5b4d66c8b3a1b10d82d
Author: Jeremy Dorfman <jdorfman@google.com>
Date:   Tue May 12 10:41:03 2026 -0400

    [convolve] Optimize av1_convolve_2d_sr_avx2

    Previously, horizontal and vertical convolutions were interleaved
    strip-by-strip. The horizontal convolution's inner loop traversed
    vertical row indices, leading to non-local accesses.

    This change improves spatial and temporal memory locality by switching
    the loop order:
    - Swaps column-strip and row processing, making the column-strip (j) the
      inner loop and the row (i) the outer loop during horizontal filtering
    - Enables contiguous, cache-line-aligned sequential memory reads across
      rows, which is more efficient on modern CPUs
    - Resizes the intermediate stack buffer from a reusable single-strip
      buffer (~2.2 KB) to a full block-sized buffer im_block_buf (~35.8 KB)
      to support decoupled horizontal filtering across all strips

    No functional changes. This is slightly slower for 8x32 (up to about 8%;
    see the table below) and ranges from neutral to much better for most
    other block sizes:

    BlockWidth/BlockHeight/HorizontalFilter/VerticalFilter
    (the last column is the change in CPU time; negative means the new code is faster)
                                    Baseline CPU   New CPU        Improvement
    4/4/0/0                         14.42n ±  1%   14.57n ±  1%   +1.06% (p=0.001 n=10)
    4/4/0/1                         14.50n ±  1%   14.59n ±  0%        ~ (p=0.052 n=10)
    4/4/0/2                         14.40n ±  0%   14.58n ±  1%   +1.24% (p=0.000 n=10)
    4/4/0/3                         14.41n ±  1%   14.57n ±  1%   +1.13% (p=0.001 n=10)
    4/4/1/0                         14.47n ±  2%   14.60n ±  0%        ~ (p=0.063 n=10)
    4/4/1/1                         14.39n ±  1%   14.58n ±  0%   +1.27% (p=0.015 n=10)
    4/4/1/2                         14.50n ±  1%   14.57n ±  0%   +0.54% (p=0.011 n=10)
    4/4/1/3                         14.44n ±  1%   14.59n ±  0%   +1.04% (p=0.023 n=10)
    4/4/2/0                         14.40n ±  1%   14.58n ±  0%   +1.21% (p=0.015 n=10)
    4/4/2/1                         14.46n ±  0%   14.56n ±  0%   +0.65% (p=0.004 n=10)
    4/4/2/2                         14.45n ±  2%   14.53n ±  0%        ~ (p=0.085 n=10)
    4/4/2/3                         14.40n ±  1%   14.58n ±  0%   +1.21% (p=0.001 n=10)
    4/4/3/0                         14.46n ±  1%   14.57n ±  0%        ~ (p=0.165 n=10)
    4/4/3/1                         14.49n ±  3%   14.58n ±  1%        ~ (p=0.353 n=10)
    4/4/3/2                         14.39n ±  1%   14.58n ±  1%   +1.36% (p=0.001 n=10)
    4/4/3/3                         14.41n ±  1%   14.57n ±  1%   +1.10% (p=0.009 n=10)
    4/4/4/4                         42.91n ±  1%   41.77n ±  1%   -2.64% (p=0.000 n=10)
    4/8/0/0                         17.15n ±  1%   17.08n ±  4%        ~ (p=0.247 n=10)
    4/8/0/1                         17.13n ±  1%   17.18n ±  2%        ~ (p=0.315 n=10)
    4/8/0/2                         17.20n ±  1%   17.02n ±  3%        ~ (p=0.240 n=10)
    4/8/0/3                         17.27n ±  1%   17.03n ±  3%        ~ (p=0.143 n=10)
    4/8/1/0                         17.27n ±  1%   17.10n ±  3%        ~ (p=0.143 n=10)
    4/8/1/1                         17.13n ±  0%   17.11n ±  3%        ~ (p=0.796 n=10)
    4/8/1/2                         17.13n ±  2%   17.13n ±  2%        ~ (p=0.529 n=10)
    4/8/1/3                         17.20n ±  1%   17.05n ±  3%        ~ (p=0.160 n=10)
    4/8/2/0                         17.22n ±  1%   17.12n ±  3%        ~ (p=0.436 n=10)
    4/8/2/1                         17.18n ±  1%   17.17n ±  2%        ~ (p=0.987 n=10)
    4/8/2/2                         17.36n ±  1%   17.16n ±  4%        ~ (p=0.424 n=10)
    4/8/2/3                         17.21n ±  2%   17.15n ±  2%        ~ (p=0.353 n=10)
    4/8/3/0                         17.18n ±  1%   17.09n ±  3%        ~ (p=0.404 n=10)
    4/8/3/1                         17.23n ±  2%   17.25n ±  2%        ~ (p=0.724 n=10)
    4/8/3/2                         17.15n ±  1%   17.18n ±  2%        ~ (p=0.631 n=10)
    4/8/3/3                         17.21n ±  2%   17.08n ±  3%        ~ (p=0.183 n=10)
    4/8/4/4                         56.68n ±  1%   55.07n ±  1%   -2.83% (p=0.000 n=10)
    8/4/0/0                         20.72n ±  2%   13.98n ±  1%  -32.53% (p=0.000 n=10)
    8/4/0/1                         20.73n ±  1%   13.98n ±  1%  -32.57% (p=0.000 n=10)
    8/4/0/2                         20.83n ±  1%   13.95n ±  1%  -33.05% (p=0.000 n=10)
    8/4/0/3                         20.76n ±  3%   13.94n ±  0%  -32.82% (p=0.000 n=10)
    8/4/1/0                         20.64n ±  4%   13.95n ±  1%  -32.40% (p=0.000 n=10)
    8/4/1/1                         20.96n ±  2%   13.99n ±  0%  -33.28% (p=0.000 n=10)
    8/4/1/2                         20.93n ±  2%   13.96n ±  0%  -33.28% (p=0.000 n=10)
    8/4/1/3                         20.58n ±  2%   13.96n ±  0%  -32.17% (p=0.000 n=10)
    8/4/2/0                         20.72n ±  2%   13.94n ±  1%  -32.72% (p=0.000 n=10)
    8/4/2/1                         20.86n ±  3%   13.96n ±  1%  -33.07% (p=0.000 n=10)
    8/4/2/2                         20.70n ±  1%   13.95n ±  1%  -32.58% (p=0.000 n=10)
    8/4/2/3                         20.59n ±  2%   13.96n ±  1%  -32.19% (p=0.000 n=10)
    8/4/3/0                         20.58n ±  2%   13.95n ±  1%  -32.18% (p=0.000 n=10)
    8/4/3/1                         20.62n ±  1%   13.95n ±  0%  -32.33% (p=0.000 n=10)
    8/4/3/2                         20.76n ±  1%   14.00n ±  1%  -32.57% (p=0.000 n=10)
    8/4/3/3                         20.83n ±  4%   13.96n ±  1%  -32.97% (p=0.000 n=10)
    8/4/4/4                         58.96n ±  1%   59.39n ±  2%   +0.73% (p=0.015 n=10)
    8/8/0/0                         21.55n ±  1%   18.17n ±  0%  -15.68% (p=0.000 n=10)
    8/8/0/1                         21.51n ±  0%   18.16n ±  0%  -15.58% (p=0.000 n=10)
    8/8/0/2                         21.56n ±  1%   18.16n ±  0%  -15.79% (p=0.000 n=10)
    8/8/0/3                         21.58n ±  1%   18.17n ±  0%  -15.81% (p=0.000 n=10)
    8/8/1/0                         21.56n ±  1%   18.16n ±  0%  -15.79% (p=0.000 n=10)
    8/8/1/1                         21.62n ±  1%   18.18n ±  1%  -15.88% (p=0.000 n=10)
    8/8/1/2                         21.59n ±  1%   18.16n ±  0%  -15.89% (p=0.000 n=10)
    8/8/1/3                         21.50n ±  1%   18.16n ±  0%  -15.54% (p=0.000 n=10)
    8/8/2/0                         21.57n ±  0%   18.14n ±  0%  -15.89% (p=0.000 n=10)
    8/8/2/1                         21.52n ±  1%   18.15n ±  1%  -15.64% (p=0.000 n=10)
    8/8/2/2                         21.58n ±  1%   18.16n ±  0%  -15.84% (p=0.000 n=10)
    8/8/2/3                         21.57n ±  0%   18.16n ±  0%  -15.82% (p=0.000 n=10)
    8/8/3/0                         21.61n ±  1%   18.17n ±  0%  -15.90% (p=0.000 n=10)
    8/8/3/1                         21.52n ±  0%   18.16n ±  0%  -15.63% (p=0.000 n=10)
    8/8/3/2                         21.60n ±  1%   18.16n ±  0%  -15.95% (p=0.000 n=10)
    8/8/3/3                         21.52n ±  1%   18.16n ±  0%  -15.63% (p=0.000 n=10)
    8/8/4/4                         76.97n ±  0%   78.07n ±  1%   +1.42% (p=0.000 n=10)
    8/16/0/0                        26.12n ±  0%   25.93n ±  1%   -0.72% (p=0.002 n=10)
    8/16/0/1                        26.19n ±  1%   25.96n ±  1%   -0.89% (p=0.020 n=10)
    8/16/0/2                        26.11n ±  0%   25.97n ±  1%   -0.53% (p=0.007 n=10)
    8/16/0/3                        26.13n ±  0%   25.98n ±  1%   -0.56% (p=0.023 n=10)
    8/16/1/0                        26.16n ±  1%   26.00n ±  0%   -0.63% (p=0.004 n=10)
    8/16/1/1                        26.09n ±  0%   25.94n ±  1%   -0.57% (p=0.001 n=10)
    8/16/1/2                        26.10n ±  0%   25.97n ±  1%   -0.50% (p=0.009 n=10)
    8/16/1/3                        26.08n ±  1%   25.99n ±  0%   -0.34% (p=0.045 n=10)
    8/16/2/0                        26.16n ±  1%   25.98n ±  1%   -0.68% (p=0.000 n=10)
    8/16/2/1                        26.07n ±  1%   25.99n ±  0%   -0.33% (p=0.011 n=10)
    8/16/2/2                        26.10n ±  1%   25.95n ±  0%   -0.59% (p=0.001 n=10)
    8/16/2/3                        26.12n ±  0%   25.97n ±  1%   -0.57% (p=0.043 n=10)
    8/16/3/0                        26.19n ±  1%   25.94n ±  1%   -0.94% (p=0.002 n=10)
    8/16/3/1                        26.12n ±  1%   25.95n ±  0%   -0.65% (p=0.000 n=10)
    8/16/3/2                        26.13n ±  1%   25.97n ±  1%   -0.59% (p=0.004 n=10)
    8/16/3/3                        26.14n ±  0%   25.96n ±  1%   -0.66% (p=0.001 n=10)
    8/16/4/4                        112.0n ±  0%   113.5n ±  1%   +1.39% (p=0.000 n=10)
    16/8/0/0                        37.37n ±  1%   20.04n ±  2%  -46.39% (p=0.000 n=10)
    16/8/0/1                        37.19n ±  1%   20.03n ±  0%  -46.16% (p=0.000 n=10)
    16/8/0/2                        37.41n ±  1%   20.06n ±  0%  -46.38% (p=0.000 n=10)
    16/8/0/3                        37.37n ±  1%   20.04n ±  1%  -46.38% (p=0.000 n=10)
    16/8/1/0                        37.22n ±  1%   20.02n ±  0%  -46.22% (p=0.000 n=10)
    16/8/1/1                        37.40n ±  2%   20.01n ±  1%  -46.49% (p=0.000 n=10)
    16/8/1/2                        37.53n ±  1%   20.00n ±  0%  -46.71% (p=0.000 n=10)
    16/8/1/3                        37.33n ±  1%   20.02n ±  1%  -46.37% (p=0.000 n=10)
    16/8/2/0                        37.14n ±  1%   20.02n ±  1%  -46.10% (p=0.000 n=10)
    16/8/2/1                        37.54n ±  1%   20.04n ±  1%  -46.62% (p=0.000 n=10)
    16/8/2/2                        37.19n ±  2%   20.03n ±  1%  -46.16% (p=0.000 n=10)
    16/8/2/3                        37.75n ±  2%   20.05n ±  1%  -46.89% (p=0.000 n=10)
    16/8/3/0                        37.47n ±  1%   20.04n ±  1%  -46.51% (p=0.000 n=10)
    16/8/3/1                        37.40n ±  1%   20.03n ±  0%  -46.45% (p=0.000 n=10)
    16/8/3/2                        37.42n ±  1%   19.96n ±  0%  -46.67% (p=0.000 n=10)
    16/8/3/3                        37.42n ±  2%   20.03n ±  1%  -46.47% (p=0.000 n=10)
    16/8/4/4                        142.9n ±  1%   141.9n ±  1%   -0.70% (p=0.035 n=10)
    16/16/0/0                       45.08n ±  0%   30.92n ±  1%  -31.41% (p=0.000 n=10)
    16/16/0/1                       45.07n ±  1%   30.99n ±  1%  -31.25% (p=0.000 n=10)
    16/16/0/2                       45.16n ±  1%   30.80n ±  1%  -31.79% (p=0.000 n=10)
    16/16/0/3                       45.20n ±  0%   30.80n ±  2%  -31.86% (p=0.000 n=10)
    16/16/1/0                       44.99n ±  0%   30.75n ±  2%  -31.65% (p=0.000 n=10)
    16/16/1/1                       45.27n ±  1%   30.86n ±  1%  -31.82% (p=0.000 n=10)
    16/16/1/2                       45.14n ±  0%   30.89n ±  1%  -31.56% (p=0.000 n=10)
    16/16/1/3                       45.01n ±  1%   30.91n ±  1%  -31.33% (p=0.000 n=10)
    16/16/2/0                       45.07n ±  0%   30.83n ±  1%  -31.58% (p=0.000 n=10)
    16/16/2/1                       45.05n ±  2%   30.83n ±  2%  -31.57% (p=0.000 n=10)
    16/16/2/2                       45.08n ±  0%   30.78n ±  1%  -31.71% (p=0.000 n=10)
    16/16/2/3                       45.13n ±  1%   30.83n ±  1%  -31.69% (p=0.000 n=10)
    16/16/3/0                       45.06n ±  1%   30.92n ±  0%  -31.37% (p=0.000 n=10)
    16/16/3/1                       45.14n ±  1%   30.88n ±  2%  -31.58% (p=0.000 n=10)
    16/16/3/2                       45.13n ±  1%   30.80n ±  1%  -31.75% (p=0.000 n=10)
    16/16/3/3                       45.25n ±  1%   30.96n ±  1%  -31.57% (p=0.000 n=10)
    16/16/4/4                       212.7n ±  0%   211.9n ±  0%   -0.37% (p=0.035 n=10)
    16/32/0/0                       66.68n ±  0%   54.86n ±  6%  -17.74% (p=0.000 n=10)
    16/32/0/1                       66.65n ±  0%   55.89n ±  7%  -16.13% (p=0.000 n=10)
    16/32/0/2                       66.68n ±  0%   58.62n ± 10%  -12.10% (p=0.000 n=10)
    16/32/0/3                       66.69n ±  0%   53.86n ± 11%  -19.23% (p=0.000 n=10)
    16/32/1/0                       66.69n ±  1%   56.56n ±  6%  -15.19% (p=0.000 n=10)
    16/32/1/1                       66.70n ±  0%   53.82n ±  8%  -19.30% (p=0.000 n=10)
    16/32/1/2                       66.71n ±  1%   53.06n ± 12%  -20.46% (p=0.000 n=10)
    16/32/1/3                       66.65n ±  0%   57.09n ±  8%  -14.34% (p=0.000 n=10)
    16/32/2/0                       66.69n ±  1%   55.87n ±  6%  -16.22% (p=0.000 n=10)
    16/32/2/1                       66.75n ±  0%   53.30n ± 11%  -20.15% (p=0.000 n=10)
    16/32/2/2                       66.76n ±  1%   53.76n ± 11%  -19.48% (p=0.000 n=10)
    16/32/2/3                       66.76n ±  0%   57.96n ±  9%  -13.19% (p=0.000 n=10)
    16/32/3/0                       66.68n ±  0%   53.73n ± 11%  -19.42% (p=0.000 n=10)
    16/32/3/1                       66.72n ±  0%   54.78n ±  9%  -17.91% (p=0.000 n=10)
    16/32/3/2                       66.68n ±  0%   57.23n ±  7%  -14.17% (p=0.000 n=10)
    16/32/3/3                       66.77n ±  0%   53.68n ± 10%  -19.61% (p=0.000 n=10)
    16/32/4/4                       354.3n ±  0%   356.9n ±  1%   +0.74% (p=0.000 n=10)
    32/16/0/0                       83.68n ±  1%   40.10n ±  1%  -52.08% (p=0.000 n=10)
    32/16/0/1                       83.56n ±  1%   39.93n ±  1%  -52.22% (p=0.000 n=10)
    32/16/0/2                       83.68n ±  1%   40.04n ±  1%  -52.16% (p=0.000 n=10)
    32/16/0/3                       84.08n ±  1%   40.08n ±  0%  -52.33% (p=0.000 n=10)
    32/16/1/0                       83.58n ±  1%   40.09n ±  1%  -52.04% (p=0.000 n=10)
    32/16/1/1                       83.84n ±  1%   40.08n ±  1%  -52.20% (p=0.000 n=10)
    32/16/1/2                       83.56n ±  1%   40.02n ±  1%  -52.11% (p=0.000 n=10)
    32/16/1/3                       83.55n ±  1%   39.98n ±  1%  -52.15% (p=0.000 n=10)
    32/16/2/0                       83.55n ±  1%   40.08n ±  1%  -52.02% (p=0.000 n=10)
    32/16/2/1                       83.84n ±  1%   40.00n ±  0%  -52.29% (p=0.000 n=10)
    32/16/2/2                       83.75n ±  1%   39.80n ±  4%  -52.48% (p=0.000 n=10)
    32/16/2/3                       83.71n ±  1%   39.98n ±  1%  -52.24% (p=0.000 n=10)
    32/16/3/0                       83.63n ±  1%   40.08n ±  1%  -52.08% (p=0.000 n=10)
    32/16/3/1                       83.83n ±  1%   40.10n ±  1%  -52.17% (p=0.000 n=10)
    32/16/3/2                       83.64n ±  1%   39.89n ±  1%  -52.31% (p=0.000 n=10)
    32/16/3/3                       83.77n ±  0%   40.07n ±  1%  -52.17% (p=0.000 n=10)
    32/16/4/4                       419.2n ±  1%   415.3n ±  1%   -0.94% (p=0.000 n=10)
    32/32/0/0                      123.33n ±  1%   76.38n ±  1%  -38.07% (p=0.000 n=10)
    32/32/0/1                      123.21n ±  0%   76.17n ±  2%  -38.18% (p=0.000 n=10)
    32/32/0/2                      123.32n ±  1%   76.63n ±  2%  -37.86% (p=0.000 n=10)
    32/32/0/3                      123.31n ±  0%   76.73n ±  1%  -37.77% (p=0.000 n=10)
    32/32/1/0                      123.14n ±  1%   76.39n ±  2%  -37.96% (p=0.000 n=10)
    32/32/1/1                      123.21n ±  0%   76.71n ±  1%  -37.74% (p=0.000 n=10)
    32/32/1/2                      123.00n ±  1%   76.97n ±  1%  -37.42% (p=0.000 n=10)
    32/32/1/3                      123.26n ±  0%   76.79n ±  1%  -37.70% (p=0.000 n=10)
    32/32/2/0                      123.32n ±  0%   76.83n ±  3%  -37.70% (p=0.000 n=10)
    32/32/2/1                      123.15n ±  1%   76.64n ±  3%  -37.77% (p=0.000 n=10)
    32/32/2/2                      122.94n ±  1%   76.95n ±  3%  -37.41% (p=0.000 n=10)
    32/32/2/3                      123.11n ±  0%   76.72n ±  1%  -37.68% (p=0.000 n=10)
    32/32/3/0                      123.27n ±  0%   76.78n ±  1%  -37.72% (p=0.000 n=10)
    32/32/3/1                      123.11n ±  1%   76.50n ±  1%  -37.86% (p=0.000 n=10)
    32/32/3/2                      123.29n ±  1%   76.99n ±  1%  -37.55% (p=0.000 n=10)
    32/32/3/3                      123.13n ±  1%   76.99n ±  1%  -37.47% (p=0.000 n=10)
    32/32/4/4                       701.4n ±  0%   694.4n ±  0%   -1.00% (p=0.000 n=10)
    32/64/0/0                       231.5n ±  1%   140.4n ±  1%  -39.35% (p=0.000 n=10)
    32/64/0/1                       231.5n ±  1%   140.6n ±  1%  -39.28% (p=0.000 n=10)
    32/64/0/2                       231.8n ±  1%   140.0n ±  1%  -39.58% (p=0.000 n=10)
    32/64/0/3                       231.2n ±  1%   139.3n ±  1%  -39.76% (p=0.000 n=10)
    32/64/1/0                       231.8n ±  0%   140.0n ±  2%  -39.61% (p=0.000 n=10)
    32/64/1/1                       232.3n ±  1%   140.4n ±  2%  -39.57% (p=0.000 n=10)
    32/64/1/2                       231.6n ±  0%   139.8n ±  1%  -39.64% (p=0.000 n=10)
    32/64/1/3                       231.6n ±  1%   140.5n ±  2%  -39.33% (p=0.000 n=10)
    32/64/2/0                       231.8n ±  1%   140.3n ±  2%  -39.50% (p=0.000 n=10)
    32/64/2/1                       231.4n ±  1%   138.7n ±  1%  -40.06% (p=0.000 n=10)
    32/64/2/2                       231.8n ±  1%   140.4n ±  1%  -39.44% (p=0.000 n=10)
    32/64/2/3                       231.9n ±  1%   138.7n ±  2%  -40.17% (p=0.000 n=10)
    32/64/3/0                       231.5n ±  0%   140.6n ±  1%  -39.28% (p=0.000 n=10)
    32/64/3/1                       231.9n ±  1%   139.7n ±  1%  -39.75% (p=0.000 n=10)
    32/64/3/2                       232.0n ±  0%   139.9n ±  2%  -39.70% (p=0.000 n=10)
    32/64/3/3                       231.6n ±  0%   139.9n ±  2%  -39.62% (p=0.000 n=10)
    32/64/4/4                       1.265µ ±  0%   1.261µ ±  0%        ~ (p=0.089 n=10)
    64/32/0/0                      242.16n ±  1%   61.13n ± 10%  -74.76% (p=0.000 n=10)
    64/32/0/1                      242.21n ±  1%   55.68n ± 10%  -77.01% (p=0.000 n=10)
    64/32/0/2                      241.93n ±  1%   58.46n ±  6%  -75.84% (p=0.000 n=10)
    64/32/0/3                      242.18n ±  1%   61.14n ± 10%  -74.75% (p=0.000 n=10)
    64/32/1/0                      241.88n ±  0%   61.12n ± 10%  -74.73% (p=0.000 n=10)
    64/32/1/1                      242.44n ±  1%   55.30n ± 11%  -77.19% (p=0.000 n=10)
    64/32/1/2                      241.08n ±  1%   58.99n ±  6%  -75.53% (p=0.000 n=10)
    64/32/1/3                      241.39n ±  0%   61.12n ± 10%  -74.68% (p=0.000 n=10)
    64/32/2/0                      241.47n ±  1%   55.66n ± 10%  -76.95% (p=0.000 n=10)
    64/32/2/1                      241.82n ±  1%   55.69n ± 10%  -76.97% (p=0.000 n=10)
    64/32/2/2                      241.91n ±  1%   61.12n ± 10%  -74.74% (p=0.000 n=10)
    64/32/2/3                      241.16n ±  1%   61.13n ±  9%  -74.65% (p=0.000 n=10)
    64/32/3/0                      241.75n ±  1%   61.12n ± 10%  -74.72% (p=0.000 n=10)
    64/32/3/1                      242.04n ±  1%   55.24n ± 11%  -77.18% (p=0.000 n=10)
    64/32/3/2                      241.78n ±  1%   61.13n ± 10%  -74.72% (p=0.000 n=10)
    64/32/3/3                      241.61n ±  0%   61.13n ±  9%  -74.70% (p=0.000 n=10)
    64/32/4/4                       1.386µ ±  1%   1.373µ ±  1%   -0.91% (p=0.004 n=10)
    64/64/0/0                       444.8n ±  0%   106.1n ±  0%  -76.15% (p=0.000 n=10)
    64/64/0/1                       444.0n ±  1%   106.1n ±  0%  -76.10% (p=0.000 n=10)
    64/64/0/2                       444.8n ±  2%   106.1n ±  0%  -76.16% (p=0.000 n=10)
    64/64/0/3                       443.4n ±  0%   106.1n ±  0%  -76.07% (p=0.000 n=10)
    64/64/1/0                       443.2n ±  0%   106.2n ±  0%  -76.04% (p=0.000 n=10)
    64/64/1/1                       443.6n ±  0%   106.1n ±  0%  -76.09% (p=0.000 n=10)
    64/64/1/2                       444.3n ±  1%   106.1n ±  0%  -76.13% (p=0.000 n=10)
    64/64/1/3                       444.4n ±  0%   106.1n ±  1%  -76.13% (p=0.000 n=10)
    64/64/2/0                       443.6n ±  0%   106.1n ±  0%  -76.07% (p=0.000 n=10)
    64/64/2/1                       444.0n ±  0%   106.1n ±  0%  -76.11% (p=0.000 n=10)
    64/64/2/2                       444.1n ±  0%   106.1n ±  0%  -76.12% (p=0.000 n=10)
    64/64/2/3                       443.7n ±  1%   106.1n ±  0%  -76.08% (p=0.000 n=10)
    64/64/3/0                       443.8n ±  0%   106.1n ±  0%  -76.08% (p=0.000 n=10)
    64/64/3/1                       444.3n ±  0%   106.1n ±  0%  -76.12% (p=0.000 n=10)
    64/64/3/2                       444.0n ±  1%   106.1n ±  0%  -76.11% (p=0.000 n=10)
    64/64/3/3                       444.3n ±  1%   106.1n ±  0%  -76.11% (p=0.000 n=10)
    64/64/4/4                       2.506µ ±  0%   2.495µ ±  1%   -0.43% (p=0.015 n=10)
    64/128/0/0                      880.2n ±  1%   197.5n ±  1%  -77.56% (p=0.000 n=10)
    64/128/0/1                      882.8n ±  1%   197.5n ±  1%  -77.63% (p=0.000 n=10)
    64/128/0/2                      881.9n ±  1%   197.4n ±  1%  -77.62% (p=0.000 n=10)
    64/128/0/3                      882.6n ±  1%   197.5n ±  6%  -77.62% (p=0.000 n=10)
    64/128/1/0                      881.2n ±  1%   197.4n ±  1%  -77.60% (p=0.000 n=10)
    64/128/1/1                      880.2n ±  1%   197.3n ±  1%  -77.58% (p=0.000 n=10)
    64/128/1/2                      879.7n ±  1%   197.6n ±  1%  -77.54% (p=0.000 n=10)
    64/128/1/3                      880.1n ±  1%   197.4n ±  1%  -77.57% (p=0.000 n=10)
    64/128/2/0                      882.2n ±  1%   197.5n ±  1%  -77.61% (p=0.000 n=10)
    64/128/2/1                      879.3n ±  1%   197.5n ±  1%  -77.54% (p=0.000 n=10)
    64/128/2/2                      880.6n ±  1%   197.3n ±  2%  -77.59% (p=0.000 n=10)
    64/128/2/3                      878.8n ±  1%   197.5n ±  1%  -77.53% (p=0.000 n=10)
    64/128/3/0                      881.5n ±  1%   197.3n ±  2%  -77.61% (p=0.000 n=10)
    64/128/3/1                      880.1n ±  1%   197.5n ±  1%  -77.56% (p=0.000 n=10)
    64/128/3/2                      881.2n ±  1%   197.4n ±  3%  -77.60% (p=0.000 n=10)
    64/128/3/3                      880.6n ±  1%   197.4n ±  1%  -77.59% (p=0.000 n=10)
    64/128/4/4                      4.833µ ±  0%   4.823µ ±  0%        ~ (p=0.247 n=10)
    128/64/0/0                      869.4n ±  0%   160.4n ±  1%  -81.55% (p=0.000 n=10)
    128/64/0/1                      868.3n ±  1%   160.4n ±  0%  -81.53% (p=0.000 n=10)
    128/64/0/2                      867.7n ±  0%   160.4n ±  0%  -81.52% (p=0.000 n=10)
    128/64/0/3                      868.6n ±  1%   160.4n ±  0%  -81.54% (p=0.000 n=10)
    128/64/1/0                      868.6n ±  0%   160.4n ±  1%  -81.53% (p=0.000 n=10)
    128/64/1/1                      867.8n ±  1%   160.5n ±  0%  -81.51% (p=0.000 n=10)
    128/64/1/2                      867.6n ±  1%   160.4n ±  0%  -81.51% (p=0.000 n=10)
    128/64/1/3                      867.4n ±  0%   160.4n ±  0%  -81.51% (p=0.000 n=10)
    128/64/2/0                      867.0n ±  1%   160.5n ±  0%  -81.49% (p=0.000 n=10)
    128/64/2/1                      867.1n ±  1%   160.4n ±  0%  -81.50% (p=0.000 n=10)
    128/64/2/2                      867.9n ±  0%   160.5n ±  0%  -81.51% (p=0.000 n=10)
    128/64/2/3                      867.9n ±  0%   160.4n ±  0%  -81.52% (p=0.000 n=10)
    128/64/3/0                      868.1n ±  0%   160.3n ±  0%  -81.53% (p=0.000 n=10)
    128/64/3/1                      867.6n ±  0%   160.4n ±  0%  -81.52% (p=0.000 n=10)
    128/64/3/2                      867.3n ±  0%   160.4n ±  0%  -81.51% (p=0.000 n=10)
    128/64/3/3                      869.0n ±  0%   160.4n ±  0%  -81.54% (p=0.000 n=10)
    128/64/4/4                      5.000µ ±  0%   5.015µ ±  1%   +0.30% (p=0.001 n=10)
    128/128/0/0                    1880.9n ±  1%   307.6n ±  1%  -83.65% (p=0.000 n=10)
    128/128/0/1                    1889.2n ±  1%   307.7n ±  0%  -83.71% (p=0.000 n=10)
    128/128/0/2                    1878.7n ±  1%   307.7n ±  0%  -83.62% (p=0.000 n=10)
    128/128/0/3                    1882.9n ±  1%   308.1n ±  0%  -83.64% (p=0.000 n=10)
    128/128/1/0                    1882.6n ±  0%   307.7n ±  0%  -83.65% (p=0.000 n=10)
    128/128/1/1                    1883.8n ±  1%   307.7n ±  0%  -83.67% (p=0.000 n=10)
    128/128/1/2                    1879.8n ±  1%   307.8n ±  0%  -83.63% (p=0.000 n=10)
    128/128/1/3                    1883.5n ±  1%   308.0n ±  0%  -83.65% (p=0.000 n=10)
    128/128/2/0                    1883.1n ±  1%   307.8n ±  0%  -83.65% (p=0.000 n=10)
    128/128/2/1                    1880.2n ±  0%   307.8n ±  0%  -83.63% (p=0.000 n=10)
    128/128/2/2                    1882.4n ±  1%   307.7n ±  0%  -83.65% (p=0.000 n=10)
    128/128/2/3                    1883.3n ±  1%   307.8n ±  0%  -83.65% (p=0.000 n=10)
    128/128/3/0                    1882.5n ±  1%   307.7n ±  0%  -83.65% (p=0.000 n=10)
    128/128/3/1                    1882.0n ±  1%   307.9n ±  0%  -83.64% (p=0.000 n=10)
    128/128/3/2                    1879.9n ±  1%   307.8n ±  1%  -83.63% (p=0.000 n=10)
    128/128/3/3                    1883.9n ±  1%   307.8n ±  0%  -83.66% (p=0.000 n=10)
    128/128/4/4                     9.923µ ±  0%   9.616µ ±  1%   -3.09% (p=0.000 n=10)
    4/16/0/0                        23.58n ±  1%   23.05n ±  1%   -2.26% (p=0.000 n=10)
    4/16/0/1                        23.59n ±  0%   22.97n ±  0%   -2.63% (p=0.000 n=10)
    4/16/0/2                        23.59n ±  0%   23.08n ±  1%   -2.17% (p=0.000 n=10)
    4/16/0/3                        23.64n ±  0%   22.97n ±  2%   -2.85% (p=0.000 n=10)
    4/16/1/0                        23.58n ±  0%   22.97n ±  0%   -2.61% (p=0.000 n=10)
    4/16/1/1                        23.60n ±  0%   23.00n ±  0%   -2.53% (p=0.000 n=10)
    4/16/1/2                        23.61n ±  0%   23.06n ±  0%   -2.35% (p=0.000 n=10)
    4/16/1/3                        23.61n ±  0%   23.00n ±  0%   -2.60% (p=0.000 n=10)
    4/16/2/0                        23.63n ±  1%   22.96n ±  0%   -2.85% (p=0.000 n=10)
    4/16/2/1                        23.58n ±  1%   23.02n ±  1%   -2.38% (p=0.000 n=10)
    4/16/2/2                        23.59n ±  0%   23.04n ±  1%   -2.32% (p=0.000 n=10)
    4/16/2/3                        23.59n ±  0%   23.00n ±  1%   -2.50% (p=0.000 n=10)
    4/16/3/0                        23.64n ±  0%   23.04n ±  2%   -2.57% (p=0.002 n=10)
    4/16/3/1                        23.59n ±  0%   23.06n ±  1%   -2.23% (p=0.000 n=10)
    4/16/3/2                        23.57n ±  1%   23.00n ±  1%   -2.40% (p=0.000 n=10)
    4/16/3/3                        23.62n ±  1%   22.98n ±  1%   -2.73% (p=0.000 n=10)
    4/16/4/4                        82.87n ±  0%   80.25n ±  1%   -3.16% (p=0.000 n=10)
    16/4/0/0                        33.00n ±  1%   15.07n ±  0%  -54.32% (p=0.000 n=10)
    16/4/0/1                        32.98n ±  1%   15.05n ±  0%  -54.37% (p=0.000 n=10)
    16/4/0/2                        33.13n ±  3%   15.06n ±  1%  -54.53% (p=0.000 n=10)
    16/4/0/3                        32.96n ±  4%   15.07n ±  1%  -54.27% (p=0.000 n=10)
    16/4/1/0                        32.98n ±  1%   15.08n ±  1%  -54.26% (p=0.000 n=10)
    16/4/1/1                        32.97n ±  2%   15.06n ±  0%  -54.33% (p=0.000 n=10)
    16/4/1/2                        33.16n ±  1%   15.06n ±  0%  -54.58% (p=0.000 n=10)
    16/4/1/3                        33.13n ±  1%   15.08n ±  1%  -54.49% (p=0.000 n=10)
    16/4/2/0                        33.02n ±  1%   15.07n ±  0%  -54.35% (p=0.000 n=10)
    16/4/2/1                        33.03n ±  2%   15.06n ±  1%  -54.40% (p=0.000 n=10)
    16/4/2/2                        33.17n ±  1%   15.07n ±  1%  -54.57% (p=0.000 n=10)
    16/4/2/3                        33.21n ±  3%   15.06n ±  0%  -54.64% (p=0.000 n=10)
    16/4/3/0                        33.14n ±  3%   15.07n ±  0%  -54.52% (p=0.000 n=10)
    16/4/3/1                        33.11n ±  3%   15.07n ±  1%  -54.47% (p=0.000 n=10)
    16/4/3/2                        33.07n ±  3%   15.06n ±  0%  -54.46% (p=0.000 n=10)
    16/4/3/3                        33.12n ±  2%   15.10n ±  0%  -54.41% (p=0.000 n=10)
    16/4/4/4                        108.0n ±  1%   105.9n ±  0%   -1.97% (p=0.000 n=10)
    8/32/0/0                        38.86n ±  0%   41.71n ±  0%   +7.34% (p=0.000 n=10)
    8/32/0/1                        38.70n ±  1%   41.75n ±  0%   +7.89% (p=0.000 n=10)
    8/32/0/2                        38.82n ±  1%   41.70n ±  0%   +7.42% (p=0.000 n=10)
    8/32/0/3                        38.70n ±  1%   41.74n ±  0%   +7.84% (p=0.000 n=10)
    8/32/1/0                        38.78n ±  1%   41.76n ±  1%   +7.70% (p=0.000 n=10)
    8/32/1/1                        38.74n ±  1%   41.78n ±  1%   +7.85% (p=0.000 n=10)
    8/32/1/2                        38.69n ±  1%   41.84n ±  1%   +8.14% (p=0.000 n=10)
    8/32/1/3                        38.70n ±  0%   41.76n ±  1%   +7.90% (p=0.000 n=10)
    8/32/2/0                        38.69n ±  0%   41.74n ±  1%   +7.88% (p=0.000 n=10)
    8/32/2/1                        38.77n ±  0%   41.81n ±  1%   +7.83% (p=0.000 n=10)
    8/32/2/2                        38.73n ±  0%   41.69n ±  1%   +7.65% (p=0.000 n=10)
    8/32/2/3                        38.79n ±  1%   41.76n ±  0%   +7.68% (p=0.000 n=10)
    8/32/3/0                        38.71n ±  1%   41.82n ±  1%   +8.04% (p=0.000 n=10)
    8/32/3/1                        38.77n ±  0%   41.93n ±  1%   +8.16% (p=0.000 n=10)
    8/32/3/2                        38.68n ±  1%   41.76n ±  0%   +7.96% (p=0.000 n=10)
    8/32/3/3                        38.72n ±  1%   41.76n ±  0%   +7.84% (p=0.000 n=10)
    8/32/4/4                        181.9n ±  1%   183.9n ±  1%   +1.11% (p=0.000 n=10)
    32/8/0/0                        67.06n ±  0%   24.48n ±  1%  -63.50% (p=0.000 n=10)
    32/8/0/1                        67.29n ±  1%   24.49n ±  0%  -63.61% (p=0.000 n=10)
    32/8/0/2                        67.43n ±  2%   24.50n ±  1%  -63.67% (p=0.000 n=10)
    32/8/0/3                        67.71n ±  2%   24.48n ±  1%  -63.84% (p=0.000 n=10)
    32/8/1/0                        67.16n ±  1%   24.48n ±  0%  -63.54% (p=0.000 n=10)
    32/8/1/1                        67.38n ±  1%   24.49n ±  0%  -63.66% (p=0.000 n=10)
    32/8/1/2                        67.31n ±  1%   24.49n ±  0%  -63.62% (p=0.000 n=10)
    32/8/1/3                        67.08n ±  1%   24.46n ±  0%  -63.54% (p=0.000 n=10)
    32/8/2/0                        67.17n ±  1%   24.48n ±  0%  -63.55% (p=0.000 n=10)
    32/8/2/1                        67.44n ±  1%   24.46n ±  0%  -63.72% (p=0.000 n=10)
    32/8/2/2                        66.96n ±  2%   24.46n ±  1%  -63.47% (p=0.000 n=10)
    32/8/2/3                        67.10n ±  0%   24.46n ±  0%  -63.55% (p=0.000 n=10)
    32/8/3/0                        67.17n ±  2%   24.49n ±  0%  -63.55% (p=0.000 n=10)
    32/8/3/1                        67.03n ±  1%   24.46n ±  1%  -63.50% (p=0.000 n=10)
    32/8/3/2                        67.07n ±  1%   24.49n ±  1%  -63.48% (p=0.000 n=10)
    32/8/3/3                        67.69n ±  2%   24.49n ±  1%  -63.82% (p=0.000 n=10)
    32/8/4/4                        275.3n ±  1%   269.0n ±  1%   -2.28% (p=0.000 n=10)
    16/64/0/0                       117.7n ±  0%   103.7n ±  5%  -11.91% (p=0.000 n=10)
    16/64/0/1                       117.6n ±  0%   103.2n ±  4%  -12.26% (p=0.000 n=10)
    16/64/0/2                       117.6n ±  0%   103.3n ±  3%  -12.15% (p=0.000 n=10)
    16/64/0/3                       117.7n ±  0%   102.7n ±  4%  -12.75% (p=0.000 n=10)
    16/64/1/0                       117.7n ±  0%   102.6n ±  2%  -12.81% (p=0.000 n=10)
    16/64/1/1                       117.6n ±  0%   102.5n ±  2%  -12.80% (p=0.000 n=10)
    16/64/1/2                       117.6n ±  0%   104.3n ±  1%  -11.34% (p=0.000 n=10)
    16/64/1/3                       117.5n ±  1%   102.7n ±  3%  -12.59% (p=0.000 n=10)
    16/64/2/0                       117.6n ±  0%   104.2n ±  3%  -11.46% (p=0.000 n=10)
    16/64/2/1                       117.7n ±  0%   103.1n ±  1%  -12.43% (p=0.000 n=10)
    16/64/2/2                       117.6n ±  0%   102.5n ±  2%  -12.88% (p=0.000 n=10)
    16/64/2/3                       117.5n ±  0%   102.8n ±  1%  -12.53% (p=0.000 n=10)
    16/64/3/0                       117.6n ±  0%   102.3n ±  5%  -13.07% (p=0.000 n=10)
    16/64/3/1                       117.5n ±  0%   104.1n ±  2%  -11.42% (p=0.000 n=10)
    16/64/3/2                       117.5n ±  1%   102.8n ±  5%  -12.47% (p=0.000 n=10)
    16/64/3/3                       117.6n ±  0%   102.6n ±  4%  -12.70% (p=0.000 n=10)
    16/64/4/4                       640.2n ±  0%   641.0n ±  1%        ~ (p=0.971 n=10)
    64/16/0/0                      160.98n ±  0%   32.72n ±  1%  -79.68% (p=0.000 n=10)
    64/16/0/1                      160.69n ±  1%   32.75n ±  0%  -79.62% (p=0.000 n=10)
    64/16/0/2                      160.67n ±  1%   32.73n ±  0%  -79.63% (p=0.000 n=10)
    64/16/0/3                      160.72n ±  1%   32.77n ±  0%  -79.61% (p=0.000 n=10)
    64/16/1/0                      160.40n ±  1%   32.72n ±  0%  -79.60% (p=0.000 n=10)
    64/16/1/1                      160.56n ±  0%   32.71n ±  0%  -79.63% (p=0.000 n=10)
    64/16/1/2                      160.52n ±  0%   32.74n ±  1%  -79.60% (p=0.000 n=10)
    64/16/1/3                      160.37n ±  0%   32.76n ±  0%  -79.57% (p=0.000 n=10)
    64/16/2/0                      161.31n ±  1%   32.71n ±  0%  -79.72% (p=0.000 n=10)
    64/16/2/1                      160.38n ±  1%   32.75n ±  1%  -79.58% (p=0.000 n=10)
    64/16/2/2                      161.01n ±  0%   32.71n ±  0%  -79.69% (p=0.000 n=10)
    64/16/2/3                      160.20n ±  1%   32.73n ±  1%  -79.57% (p=0.000 n=10)
    64/16/3/0                      160.57n ±  1%   32.72n ±  1%  -79.62% (p=0.000 n=10)
    64/16/3/1                      160.45n ±  1%   32.73n ±  0%  -79.60% (p=0.000 n=10)
    64/16/3/2                      160.41n ±  1%   32.70n ±  0%  -79.61% (p=0.000 n=10)
    64/16/3/3                      160.19n ±  1%   32.71n ±  0%  -79.58% (p=0.000 n=10)
    64/16/4/4                       826.2n ±  1%   809.8n ±  1%   -1.99% (p=0.000 n=10)

    Change-Id: I5cbaf59efaeb5f10b1fc8bf69d2d31db9bf7820b

diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index ffafa13319..1191a28a67 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -424,89 +424,6 @@ static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b,
     s[6] = s[7];                                                              \
   }

-#define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP                                    \
-  const __m256i v_zero = _mm256_setzero_si256();                               \
-  __m256i s[12];                                                               \
-  if (w <= 4) {                                                                \
-    for (i = 0; i < im_h; i += 2) {                                            \
-      const __m256i data = _mm256_permute2x128_si256(                          \
-          _mm256_castsi128_si256(                                              \
-              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),     \
-          _mm256_castsi128_si256(_mm_loadu_si128(                              \
-              (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),        \
-          0x20);                                                               \
-      const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);               \
-      const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);               \
-      const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);            \
-      const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);            \
-                                                                               \
-      const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);            \
-      const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);            \
-                                                                               \
-      s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);                            \
-      s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);                           \
-      s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);                            \
-      s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);                           \
-      s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);                            \
-      s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);                           \
-                                                                               \
-      const __m256i res_lo = convolve_12taps(s, coeffs_h);                     \
-                                                                               \
-      __m256i res_32b_lo = _mm256_sra_epi32(                                   \
-          _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);         \
-      __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);         \
-      const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);           \
-      const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);           \
-      if (w > 2) {                                                             \
-        _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0);          \
-        _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride],      \
-                         res_1);                                               \
-      } else {                                                                 \
-        uint32_t horiz_2;                                                      \
-        horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);                          \
-        im_block[i * im_stride] = (uint16_t)horiz_2;                           \
-        im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);               \
-        horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);                          \
-        im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;               \
-        im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16);   \
-      }                                                                        \
-    }                                                                          \
-  } else {                                                                     \
-    for (i = 0; i < im_h; i++) {                                               \
-      const __m256i data = _mm256_permute2x128_si256(                          \
-          _mm256_castsi128_si256(                                              \
-              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),     \
-          _mm256_castsi128_si256(                                              \
-              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \
-          0x20);                                                               \
-      const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);               \
-      const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);               \
-                                                                               \
-      const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);            \
-      const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);            \
-                                                                               \
-      const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);            \
-      const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);            \
-                                                                               \
-      s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);                            \
-      s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);                           \
-      s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);                            \
-      s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);                           \
-      s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);                            \
-      s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);                           \
-                                                                               \
-      const __m256i res_lo = convolve_12taps(s, coeffs_h);                     \
-                                                                               \
-      __m256i res_32b_lo = _mm256_sra_epi32(                                   \
-          _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);         \
-                                                                               \
-      __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);         \
-      _mm_store_si128((__m128i *)&im_block[i * im_stride],                     \
-                      _mm256_extracti128_si256(                                \
-                          _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));     \
-    }                                                                          \
-  }
-
 #define CONVOLVE_SR_VERTICAL_FILTER_12TAP                                      \
   __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));   \
   __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));   \
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
index be53424ac7..df4a7d5516 100644
--- a/av1/common/x86/convolve_2d_avx2.c
+++ b/av1/common/x86/convolve_2d_avx2.c
@@ -91,7 +91,10 @@ static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
   if (filter_params_x->taps > 8) {
     const int bd = 8;
     int im_stride = 8, i;
-    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
+    DECLARE_ALIGNED(
+        32, int16_t,
+        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
     const int bits =
         FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
@@ -124,13 +127,105 @@ static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
     const int fo_horiz = horiz_tap / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

+    const __m256i v_zero = _mm256_setzero_si256();
+    __m256i s[12];
+    if (w <= 4) {
+      for (i = 0; i < im_h; i += 2) {
+        for (int j = 0; j < w; j += 8) {
+          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
+          const __m256i data = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
+              _mm256_castsi128_si256(_mm_loadu_si128(
+                  (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),
+              0x20);
+          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
+          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
+          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
+          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
+
+          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
+          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
+
+          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
+          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
+          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
+          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
+          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
+          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
+
+          const __m256i res_lo = convolve_12taps(s, coeffs_h);
+
+          __m256i res_32b_lo = _mm256_sra_epi32(
+              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
+          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+          const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);
+          const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);
+          if (w > 2) {
+            _mm_storel_epi64((__m128i *)&strip_im_block[i * im_stride], res_0);
+            _mm_storel_epi64(
+                (__m128i *)&strip_im_block[i * im_stride + im_stride], res_1);
+          } else {
+            uint32_t horiz_2;
+            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);
+            strip_im_block[i * im_stride] = (uint16_t)horiz_2;
+            strip_im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);
+            horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);
+            strip_im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;
+            strip_im_block[i * im_stride + im_stride + 1] =
+                (uint16_t)(horiz_2 >> 16);
+          }
+        }
+      }
+    } else {
+      for (i = 0; i < im_h; i++) {
+        for (int j = 0; j < w; j += 8) {
+          int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
+          const __m256i data = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
+              _mm256_castsi128_si256(_mm_loadu_si128(
+                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
+              0x20);
+          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
+          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
+
+          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
+          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
+
+          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
+          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
+
+          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
+          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
+          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
+          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
+          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
+          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
+
+          const __m256i res_lo = convolve_12taps(s, coeffs_h);
+
+          __m256i res_32b_lo = _mm256_sra_epi32(
+              _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);
+
+          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
+          _mm_store_si128((__m128i *)&strip_im_block[i * im_stride],
+                          _mm256_extracti128_si256(
+                              _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));
+        }
+      }
+    }
+
     for (int j = 0; j < w; j += 8) {
-      CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
+      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
       CONVOLVE_SR_VERTICAL_FILTER_12TAP
     }
   } else {
     int im_stride = 8, i;
-    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+    const int strip_stride = (MAX_SB_SIZE + MAX_FILTER_TAP) * 8;
+    DECLARE_ALIGNED(
+        32, int16_t,
+        im_block_buf[(MAX_SB_SIZE / 8) * (MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);

     assert(conv_params->round_0 == 3);
     assert(conv_params->round_1 == 11);
@@ -177,17 +272,65 @@ static void convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
     filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
     filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);

-    for (int j = 0; j < w; j += 8) {
-      if (horiz_tap == 2) {
-        CONVOLVE_SR_HORIZONTAL_FILTER_2TAP
-      } else if (horiz_tap == 4) {
-        CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
-      } else if (horiz_tap == 6) {
-        CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
-      } else {
-        CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
+    if (subpel_x_qn == 0 && subpel_y_qn == 0) {
+      for (i = 0; i < h; ++i) {
+        for (int j = 0; j < w; j += 8) {
+          _mm_storel_epi64(
+              (__m128i *)&dst[i * dst_stride + j],
+              _mm_loadl_epi64((const __m128i *)&src[i * src_stride + j]));
+        }
       }
+      return;
+    }

+    for (i = 0; i < (im_h - 1); i += 2) {
+      const uint8_t *src_row0 = &src_ptr[i * src_stride];
+      const uint8_t *src_row1 = &src_ptr[(i + 1) * src_stride];
+      for (int j = 0; j < w; j += 8) {
+        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
+        __m256i data =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
+        data = _mm256_inserti128_si256(
+            data, _mm_loadu_si128((__m128i *)&src_row1[j]), 1);
+
+        __m256i res;
+        if (horiz_tap == 2)
+          res = convolve_lowbd_x_2tap(data, coeffs_h, filt);
+        else if (horiz_tap == 4)
+          res = convolve_lowbd_x_4tap(data, coeffs_h, filt);
+        else if (horiz_tap == 6)
+          res = convolve_lowbd_x_6tap(data, coeffs_h, filt);
+        else
+          res = convolve_lowbd_x(data, coeffs_h, filt);
+
+        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
+        _mm256_store_si256((__m256i *)&strip_im_block[i * 8], res);
+      }
+    }
+    {
+      const uint8_t *src_row0 = &src_ptr[i * src_stride];
+      for (int j = 0; j < w; j += 8) {
+        int16_t *strip_im_block = &im_block_buf[(j / 8) * strip_stride];
+        __m256i data_1 =
+            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src_row0[j]));
+        __m256i res;
+        if (horiz_tap == 2)
+          res = convolve_lowbd_x_2tap(data_1, coeffs_h, filt);
+        else if (horiz_tap == 4)
+          res = convolve_lowbd_x_4tap(data_1, coeffs_h, filt);
+        else if (horiz_tap == 6)
+          res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt);
+        else
+          res = convolve_lowbd_x(data_1, coeffs_h, filt);
+
+        res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);
+        _mm_store_si128((__m128i *)&strip_im_block[i * 8],
+                        _mm256_castsi256_si128(res));
+      }
+    }
+
+    for (int j = 0; j < w; j += 8) {
+      const int16_t *im_block = &im_block_buf[(j / 8) * strip_stride];
       uint8_t *dst_ptr = dst + j;
       if (vert_tap == 2) {
         CONVOLVE_SR_VERTICAL_FILTER_2TAP