Commit 3360063d11 for aom

commit 3360063d11607252a5cc72b5cd714ce6460f8863
Author: Jeremy Dorfman <jdorfman@google.com>
Date:   Thu May 14 13:18:04 2026 -0400

    [convolve] Optimize av1_dist_wtd_convolve_2d_avx2

    This change optimizes the AVX2 implementation of joint 2D convolution by
    introducing dedicated code paths for combined 4-tap horizontal and
    vertical filtering. It ends up around 24% faster than the previous
    implementation.

    BlockWidth/BlockHeight/HorizontalFilter/VerticalFilter
                   Baseline CPU  New CPU      Improvement
    4/4/0/0        54.80n ± 1%   38.74n ± 1%  -29.31% (p=0.000 n=20)
    4/4/0/1        54.85n ± 0%   38.40n ± 1%  -29.99% (p=0.000 n=20)
    4/4/0/2        54.79n ± 0%   38.41n ± 1%  -29.89% (p=0.000 n=20)
    4/4/0/3        54.74n ± 0%   38.56n ± 1%  -29.56% (p=0.000 n=20)
    4/4/1/0        54.66n ± 0%   38.57n ± 1%  -29.44% (p=0.000 n=20)
    4/4/1/1        54.73n ± 0%   38.44n ± 1%  -29.77% (p=0.000 n=20)
    4/4/1/2        54.60n ± 1%   38.46n ± 1%  -29.56% (p=0.000 n=20)
    4/4/1/3        54.68n ± 0%   38.40n ± 1%  -29.78% (p=0.000 n=20)
    4/4/2/0        54.88n ± 1%   38.53n ± 1%  -29.80% (p=0.000 n=20)
    4/4/2/1        54.79n ± 0%   38.36n ± 1%  -29.98% (p=0.000 n=20)
    4/4/2/2        54.83n ± 0%   38.42n ± 1%  -29.92% (p=0.000 n=20)
    4/4/2/3        54.69n ± 0%   38.46n ± 1%  -29.68% (p=0.000 n=20)
    4/4/3/0        54.82n ± 0%   38.44n ± 1%  -29.88% (p=0.000 n=20)
    4/4/3/1        54.81n ± 1%   38.40n ± 0%  -29.95% (p=0.000 n=20)
    4/4/3/2        54.71n ± 1%   38.65n ± 1%  -29.35% (p=0.000 n=20)
    4/4/3/3        54.72n ± 0%   38.24n ± 1%  -30.11% (p=0.000 n=20)
    4/4/4/4        54.45n ± 0%   38.49n ± 1%  -29.31% (p=0.000 n=20)
    4/8/0/0        62.69n ± 1%   45.15n ± 1%  -27.97% (p=0.000 n=20)
    4/8/0/1        62.61n ± 0%   45.19n ± 1%  -27.83% (p=0.000 n=20)
    4/8/0/2        62.62n ± 0%   45.18n ± 1%  -27.85% (p=0.000 n=20)
    4/8/0/3        62.52n ± 0%   45.14n ± 1%  -27.80% (p=0.000 n=20)
    4/8/1/0        62.67n ± 1%   45.01n ± 1%  -28.18% (p=0.000 n=20)
    4/8/1/1        62.54n ± 1%   44.99n ± 1%  -28.06% (p=0.000 n=20)
    4/8/1/2        62.77n ± 0%   45.32n ± 1%  -27.81% (p=0.000 n=20)
    4/8/1/3        62.71n ± 0%   45.12n ± 0%  -28.05% (p=0.000 n=20)
    4/8/2/0        62.57n ± 0%   45.25n ± 1%  -27.68% (p=0.000 n=20)
    4/8/2/1        62.81n ± 0%   45.34n ± 1%  -27.82% (p=0.000 n=20)
    4/8/2/2        62.74n ± 0%   45.13n ± 0%  -28.07% (p=0.000 n=20)
    4/8/2/3        62.82n ± 0%   44.99n ± 1%  -28.39% (p=0.000 n=20)
    4/8/3/0        62.69n ± 1%   45.21n ± 0%  -27.89% (p=0.000 n=20)
    4/8/3/1        62.68n ± 0%   45.12n ± 1%  -28.02% (p=0.000 n=20)
    4/8/3/2        62.67n ± 0%   45.16n ± 1%  -27.94% (p=0.000 n=20)
    4/8/3/3        62.71n ± 1%   45.18n ± 1%  -27.95% (p=0.000 n=20)
    4/8/4/4        62.13n ± 1%   45.19n ± 0%  -27.26% (p=0.000 n=20)
    8/4/0/0        56.76n ± 0%   41.83n ± 0%  -26.29% (p=0.000 n=20)
    8/4/0/1        56.53n ± 0%   41.86n ± 1%  -25.96% (p=0.000 n=20)
    8/4/0/2        56.66n ± 0%   41.89n ± 0%  -26.07% (p=0.000 n=20)
    8/4/0/3        56.72n ± 0%   41.91n ± 1%  -26.12% (p=0.000 n=20)
    8/4/1/0        56.56n ± 0%   41.81n ± 1%  -26.08% (p=0.000 n=20)
    8/4/1/1        56.46n ± 0%   41.80n ± 1%  -25.97% (p=0.000 n=20)
    8/4/1/2        56.68n ± 0%   41.83n ± 0%  -26.19% (p=0.000 n=20)
    8/4/1/3        56.65n ± 0%   41.88n ± 0%  -26.07% (p=0.000 n=20)
    8/4/2/0        56.65n ± 0%   41.74n ± 1%  -26.31% (p=0.000 n=20)
    8/4/2/1        56.59n ± 0%   41.84n ± 3%  -26.05% (p=0.000 n=20)
    8/4/2/2        56.55n ± 0%   41.95n ± 1%  -25.81% (p=0.000 n=20)
    8/4/2/3        56.68n ± 0%   41.73n ± 0%  -26.37% (p=0.000 n=20)
    8/4/3/0        56.73n ± 0%   41.96n ± 0%  -26.05% (p=0.000 n=20)
    8/4/3/1        56.76n ± 0%   41.73n ± 0%  -26.47% (p=0.000 n=20)
    8/4/3/2        56.67n ± 0%   42.02n ± 1%  -25.85% (p=0.000 n=20)
    8/4/3/3        56.67n ± 0%   41.77n ± 0%  -26.29% (p=0.000 n=20)
    8/4/4/4        57.30n ± 0%   41.73n ± 0%  -27.17% (p=0.000 n=20)
    8/8/0/0        67.99n ± 0%   51.12n ± 1%  -24.81% (p=0.000 n=20)
    8/8/0/1        68.08n ± 0%   50.93n ± 0%  -25.20% (p=0.000 n=20)
    8/8/0/2        67.95n ± 0%   51.00n ± 1%  -24.94% (p=0.000 n=20)
    8/8/0/3        67.97n ± 1%   50.93n ± 0%  -25.07% (p=0.000 n=20)
    8/8/1/0        67.96n ± 0%   51.17n ± 0%  -24.71% (p=0.000 n=20)
    8/8/1/1        67.81n ± 0%   51.08n ± 0%  -24.67% (p=0.000 n=20)
    8/8/1/2        67.88n ± 1%   51.15n ± 0%  -24.66% (p=0.000 n=20)
    8/8/1/3        67.84n ± 1%   51.26n ± 0%  -24.44% (p=0.000 n=20)
    8/8/2/0        67.85n ± 0%   50.92n ± 0%  -24.96% (p=0.000 n=20)
    8/8/2/1        68.05n ± 1%   50.94n ± 0%  -25.14% (p=0.000 n=20)
    8/8/2/2        67.87n ± 1%   50.89n ± 1%  -25.02% (p=0.000 n=20)
    8/8/2/3        67.99n ± 0%   50.91n ± 0%  -25.13% (p=0.000 n=20)
    8/8/3/0        67.98n ± 1%   50.93n ± 1%  -25.07% (p=0.000 n=20)
    8/8/3/1        68.23n ± 1%   51.10n ± 1%  -25.11% (p=0.000 n=20)
    8/8/3/2        67.91n ± 1%   50.91n ± 0%  -25.03% (p=0.000 n=20)
    8/8/3/3        68.06n ± 0%   50.97n ± 0%  -25.11% (p=0.000 n=20)
    8/8/4/4        68.03n ± 1%   51.05n ± 1%  -24.96% (p=0.000 n=20)
    8/16/0/0       92.77n ± 0%   69.84n ± 1%  -24.72% (p=0.000 n=20)
    8/16/0/1       93.02n ± 0%   70.29n ± 1%  -24.44% (p=0.000 n=20)
    8/16/0/2       92.83n ± 0%   70.15n ± 2%  -24.43% (p=0.000 n=20)
    8/16/0/3       92.77n ± 1%   69.99n ± 1%  -24.55% (p=0.000 n=20)
    8/16/1/0       92.68n ± 1%   72.09n ± 2%  -22.21% (p=0.000 n=20)
    8/16/1/1       92.77n ± 0%   71.12n ± 2%  -23.34% (p=0.000 n=20)
    8/16/1/2       92.56n ± 0%   72.38n ± 3%  -21.80% (p=0.000 n=20)
    8/16/1/3       92.78n ± 1%   71.17n ± 2%  -23.29% (p=0.000 n=20)
    8/16/2/0       92.82n ± 0%   69.98n ± 1%  -24.60% (p=0.000 n=20)
    8/16/2/1       92.70n ± 0%   70.20n ± 1%  -24.27% (p=0.000 n=20)
    8/16/2/2       92.56n ± 0%   70.02n ± 1%  -24.35% (p=0.000 n=20)
    8/16/2/3       92.85n ± 0%   70.16n ± 0%  -24.44% (p=0.000 n=20)
    8/16/3/0       92.70n ± 0%   69.99n ± 0%  -24.49% (p=0.000 n=20)
    8/16/3/1       92.82n ± 0%   70.07n ± 0%  -24.51% (p=0.000 n=20)
    8/16/3/2       92.72n ± 1%   69.74n ± 0%  -24.78% (p=0.000 n=20)
    8/16/3/3       92.59n ± 1%   69.92n ± 1%  -24.48% (p=0.000 n=20)
    8/16/4/4       94.89n ± 0%   70.21n ± 0%  -26.00% (p=0.000 n=20)
    16/8/0/0      116.73n ± 1%   89.59n ± 0%  -23.25% (p=0.000 n=20)
    16/8/0/1      116.70n ± 0%   89.59n ± 0%  -23.23% (p=0.000 n=20)
    16/8/0/2      116.73n ± 0%   89.78n ± 0%  -23.09% (p=0.000 n=20)
    16/8/0/3      117.07n ± 0%   89.65n ± 0%  -23.42% (p=0.000 n=20)
    16/8/1/0      116.82n ± 0%   89.66n ± 0%  -23.25% (p=0.000 n=20)
    16/8/1/1      116.53n ± 1%   89.71n ± 1%  -23.01% (p=0.000 n=20)
    16/8/1/2      117.03n ± 0%   89.67n ± 0%  -23.37% (p=0.000 n=20)
    16/8/1/3      116.83n ± 0%   89.73n ± 0%  -23.20% (p=0.000 n=20)
    16/8/2/0      116.58n ± 0%   89.75n ± 0%  -23.01% (p=0.000 n=20)
    16/8/2/1      117.12n ± 0%   89.68n ± 0%  -23.43% (p=0.000 n=20)
    16/8/2/2      116.78n ± 0%   89.40n ± 0%  -23.44% (p=0.000 n=20)
    16/8/2/3      116.80n ± 0%   89.63n ± 0%  -23.26% (p=0.000 n=20)
    16/8/3/0      116.67n ± 1%   89.69n ± 0%  -23.12% (p=0.000 n=20)
    16/8/3/1      116.47n ± 0%   89.78n ± 0%  -22.91% (p=0.000 n=20)
    16/8/3/2      116.64n ± 0%   89.59n ± 1%  -23.19% (p=0.000 n=20)
    16/8/3/3      116.72n ± 0%   89.60n ± 0%  -23.24% (p=0.000 n=20)
    16/8/4/4      117.08n ± 1%   89.69n ± 0%  -23.39% (p=0.000 n=20)
    16/16/0/0      165.1n ± 0%   123.6n ± 1%  -25.15% (p=0.000 n=20)
    16/16/0/1      165.2n ± 0%   123.8n ± 1%  -25.07% (p=0.000 n=20)
    16/16/0/2      164.7n ± 0%   123.4n ± 0%  -25.09% (p=0.000 n=20)
    16/16/0/3      165.5n ± 0%   123.8n ± 1%  -25.24% (p=0.000 n=20)
    16/16/1/0      164.7n ± 1%   126.0n ± 2%  -23.54% (p=0.000 n=20)
    16/16/1/1      165.2n ± 1%   125.1n ± 1%  -24.32% (p=0.000 n=20)
    16/16/1/2      165.0n ± 0%   125.5n ± 2%  -23.95% (p=0.000 n=20)
    16/16/1/3      165.0n ± 0%   124.8n ± 3%  -24.34% (p=0.000 n=20)
    16/16/2/0      164.9n ± 1%   123.3n ± 1%  -25.21% (p=0.000 n=20)
    16/16/2/1      164.4n ± 1%   123.6n ± 0%  -24.83% (p=0.000 n=20)
    16/16/2/2      164.7n ± 1%   123.4n ± 0%  -25.10% (p=0.000 n=20)
    16/16/2/3      164.8n ± 1%   123.3n ± 0%  -25.18% (p=0.000 n=20)
    16/16/3/0      165.7n ± 1%   123.5n ± 0%  -25.48% (p=0.000 n=20)
    16/16/3/1      164.9n ± 0%   123.7n ± 1%  -25.00% (p=0.000 n=20)
    16/16/3/2      164.9n ± 1%   123.6n ± 0%  -25.04% (p=0.000 n=20)
    16/16/3/3      164.9n ± 0%   123.3n ± 0%  -25.24% (p=0.000 n=20)
    16/16/4/4      169.3n ± 1%   123.3n ± 0%  -27.16% (p=0.000 n=20)
    16/32/0/0      273.1n ± 0%   204.5n ± 0%  -25.11% (p=0.000 n=20)
    16/32/0/1      272.4n ± 0%   204.8n ± 0%  -24.80% (p=0.000 n=20)
    16/32/0/2      273.0n ± 0%   204.5n ± 0%  -25.10% (p=0.000 n=20)
    16/32/0/3      272.7n ± 0%   204.5n ± 0%  -25.02% (p=0.000 n=20)
    16/32/1/0      272.6n ± 0%   205.2n ± 0%  -24.75% (p=0.000 n=20)
    16/32/1/1      272.9n ± 0%   205.2n ± 0%  -24.79% (p=0.000 n=20)
    16/32/1/2      272.6n ± 0%   205.1n ± 0%  -24.77% (p=0.000 n=20)
    16/32/1/3      272.9n ± 0%   205.0n ± 0%  -24.87% (p=0.000 n=20)
    16/32/2/0      272.6n ± 0%   204.6n ± 0%  -24.93% (p=0.000 n=20)
    16/32/2/1      272.5n ± 0%   204.7n ± 0%  -24.88% (p=0.000 n=20)
    16/32/2/2      272.8n ± 0%   204.6n ± 0%  -24.97% (p=0.000 n=20)
    16/32/2/3      272.8n ± 0%   204.8n ± 0%  -24.94% (p=0.000 n=20)
    16/32/3/0      272.7n ± 0%   204.8n ± 0%  -24.89% (p=0.000 n=20)
    16/32/3/1      272.7n ± 0%   205.0n ± 0%  -24.81% (p=0.000 n=20)
    16/32/3/2      272.8n ± 0%   204.9n ± 0%  -24.89% (p=0.000 n=20)
    16/32/3/3      272.7n ± 0%   204.9n ± 0%  -24.86% (p=0.000 n=20)
    16/32/4/4      279.4n ± 0%   204.7n ± 0%  -26.74% (p=0.000 n=20)
    32/16/0/0      309.0n ± 0%   232.3n ± 0%  -24.80% (p=0.000 n=20)
    32/16/0/1      309.1n ± 0%   232.7n ± 1%  -24.72% (p=0.000 n=20)
    32/16/0/2      309.2n ± 0%   232.1n ± 1%  -24.94% (p=0.000 n=20)
    32/16/0/3      309.8n ± 0%   231.6n ± 0%  -25.25% (p=0.000 n=20)
    32/16/1/0      309.1n ± 0%   233.8n ± 1%  -24.34% (p=0.000 n=20)
    32/16/1/1      309.0n ± 0%   234.4n ± 1%  -24.13% (p=0.000 n=20)
    32/16/1/2      309.9n ± 0%   234.2n ± 1%  -24.41% (p=0.000 n=20)
    32/16/1/3      309.0n ± 1%   233.8n ± 1%  -24.34% (p=0.000 n=20)
    32/16/2/0      310.2n ± 0%   231.8n ± 0%  -25.26% (p=0.000 n=20)
    32/16/2/1      309.8n ± 0%   231.5n ± 1%  -25.26% (p=0.000 n=20)
    32/16/2/2      309.7n ± 0%   232.1n ± 0%  -25.05% (p=0.000 n=20)
    32/16/2/3      309.5n ± 0%   232.0n ± 0%  -25.02% (p=0.000 n=20)
    32/16/3/0      308.7n ± 0%   231.3n ± 1%  -25.08% (p=0.000 n=20)
    32/16/3/1      309.7n ± 0%   232.0n ± 0%  -25.10% (p=0.000 n=20)
    32/16/3/2      310.0n ± 1%   231.6n ± 0%  -25.29% (p=0.000 n=20)
    32/16/3/3      309.2n ± 0%   231.5n ± 1%  -25.13% (p=0.000 n=20)
    32/16/4/4      317.4n ± 0%   232.3n ± 0%  -26.82% (p=0.000 n=20)
    32/32/0/0      540.6n ± 0%   390.7n ± 0%  -27.73% (p=0.000 n=20)
    32/32/0/1      541.4n ± 0%   390.6n ± 0%  -27.85% (p=0.000 n=20)
    32/32/0/2      541.3n ± 0%   390.6n ± 0%  -27.84% (p=0.000 n=20)
    32/32/0/3      540.3n ± 0%   390.7n ± 0%  -27.69% (p=0.000 n=20)
    32/32/1/0      540.2n ± 0%   391.6n ± 0%  -27.51% (p=0.000 n=20)
    32/32/1/1      540.7n ± 0%   391.9n ± 0%  -27.52% (p=0.000 n=20)
    32/32/1/2      540.8n ± 0%   392.0n ± 0%  -27.53% (p=0.000 n=20)
    32/32/1/3      541.3n ± 0%   391.8n ± 0%  -27.61% (p=0.000 n=20)
    32/32/2/0      541.9n ± 0%   390.3n ± 0%  -27.98% (p=0.000 n=20)
    32/32/2/1      540.8n ± 0%   390.7n ± 0%  -27.77% (p=0.000 n=20)
    32/32/2/2      541.0n ± 0%   390.6n ± 0%  -27.81% (p=0.000 n=20)
    32/32/2/3      540.9n ± 0%   390.7n ± 0%  -27.77% (p=0.000 n=20)
    32/32/3/0      540.6n ± 0%   390.7n ± 0%  -27.74% (p=0.000 n=20)
    32/32/3/1      540.9n ± 0%   390.7n ± 0%  -27.76% (p=0.000 n=20)
    32/32/3/2      540.7n ± 0%   390.5n ± 0%  -27.78% (p=0.000 n=20)
    32/32/3/3      540.6n ± 0%   390.8n ± 0%  -27.71% (p=0.000 n=20)
    32/32/4/4      555.0n ± 0%   390.8n ± 0%  -29.58% (p=0.000 n=20)
    32/64/0/0     1013.5n ± 0%   775.0n ± 0%  -23.53% (p=0.000 n=20)
    32/64/0/1     1011.5n ± 0%   775.3n ± 0%  -23.35% (p=0.000 n=20)
    32/64/0/2     1011.9n ± 0%   775.6n ± 0%  -23.35% (p=0.000 n=20)
    32/64/0/3     1012.6n ± 0%   775.0n ± 0%  -23.46% (p=0.000 n=20)
    32/64/1/0     1012.4n ± 0%   774.9n ± 0%  -23.46% (p=0.000 n=20)
    32/64/1/1     1012.8n ± 0%   775.7n ± 0%  -23.41% (p=0.000 n=20)
    32/64/1/2     1011.8n ± 0%   775.2n ± 0%  -23.39% (p=0.000 n=20)
    32/64/1/3     1012.3n ± 0%   776.0n ± 0%  -23.34% (p=0.000 n=20)
    32/64/2/0     1012.0n ± 0%   775.5n ± 0%  -23.37% (p=0.000 n=20)
    32/64/2/1     1013.3n ± 0%   775.4n ± 0%  -23.48% (p=0.000 n=20)
    32/64/2/2     1012.6n ± 0%   774.9n ± 0%  -23.47% (p=0.000 n=20)
    32/64/2/3     1012.3n ± 0%   775.6n ± 0%  -23.39% (p=0.000 n=20)
    32/64/3/0     1011.9n ± 0%   775.0n ± 0%  -23.41% (p=0.000 n=20)
    32/64/3/1     1012.1n ± 0%   775.3n ± 0%  -23.40% (p=0.000 n=20)
    32/64/3/2     1012.5n ± 0%   775.1n ± 0%  -23.45% (p=0.000 n=20)
    32/64/3/3     1012.8n ± 0%   775.4n ± 0%  -23.43% (p=0.000 n=20)
    32/64/4/4     1026.1n ± 0%   773.9n ± 0%  -24.58% (p=0.000 n=20)
    64/32/0/0     1049.8n ± 0%   775.5n ± 0%  -26.13% (p=0.000 n=20)
    64/32/0/1     1049.7n ± 0%   775.6n ± 0%  -26.11% (p=0.000 n=20)
    64/32/0/2     1050.3n ± 0%   775.4n ± 0%  -26.18% (p=0.000 n=20)
    64/32/0/3     1050.3n ± 0%   775.5n ± 0%  -26.16% (p=0.000 n=20)
    64/32/1/0     1051.1n ± 0%   775.5n ± 0%  -26.21% (p=0.000 n=20)
    64/32/1/1     1049.1n ± 0%   775.3n ± 0%  -26.10% (p=0.000 n=20)
    64/32/1/2     1050.3n ± 0%   775.6n ± 0%  -26.15% (p=0.000 n=20)
    64/32/1/3     1049.9n ± 0%   775.4n ± 0%  -26.15% (p=0.000 n=20)
    64/32/2/0     1050.4n ± 0%   775.6n ± 0%  -26.16% (p=0.000 n=20)
    64/32/2/1     1050.0n ± 0%   775.5n ± 0%  -26.14% (p=0.000 n=20)
    64/32/2/2     1050.2n ± 0%   775.4n ± 0%  -26.17% (p=0.000 n=20)
    64/32/2/3     1050.9n ± 0%   775.5n ± 0%  -26.20% (p=0.000 n=20)
    64/32/3/0     1050.3n ± 0%   775.2n ± 0%  -26.20% (p=0.000 n=20)
    64/32/3/1     1051.0n ± 0%   775.5n ± 0%  -26.22% (p=0.000 n=20)
    64/32/3/2     1051.4n ± 0%   775.9n ± 0%  -26.21% (p=0.000 n=20)
    64/32/3/3     1049.9n ± 0%   775.9n ± 0%  -26.10% (p=0.000 n=20)
    64/32/4/4     1078.1n ± 1%   776.3n ± 0%  -28.00% (p=0.000 n=20)
    64/64/0/0      1.996µ ± 0%   1.523µ ± 0%  -23.72% (p=0.000 n=20)
    64/64/0/1      1.997µ ± 0%   1.524µ ± 0%  -23.69% (p=0.000 n=20)
    64/64/0/2      1.997µ ± 0%   1.522µ ± 0%  -23.75% (p=0.000 n=20)
    64/64/0/3      1.996µ ± 0%   1.523µ ± 0%  -23.72% (p=0.000 n=20)
    64/64/1/0      1.997µ ± 0%   1.525µ ± 0%  -23.65% (p=0.000 n=20)
    64/64/1/1      1.998µ ± 0%   1.523µ ± 0%  -23.79% (p=0.000 n=20)
    64/64/1/2      1.997µ ± 0%   1.523µ ± 0%  -23.71% (p=0.000 n=20)
    64/64/1/3      1.997µ ± 0%   1.525µ ± 0%  -23.65% (p=0.000 n=20)
    64/64/2/0      1.998µ ± 0%   1.523µ ± 0%  -23.76% (p=0.000 n=20)
    64/64/2/1      1.997µ ± 0%   1.524µ ± 0%  -23.67% (p=0.000 n=20)
    64/64/2/2      1.996µ ± 0%   1.524µ ± 0%  -23.66% (p=0.000 n=20)
    64/64/2/3      1.996µ ± 0%   1.523µ ± 0%  -23.69% (p=0.000 n=20)
    64/64/3/0      1.997µ ± 0%   1.523µ ± 0%  -23.72% (p=0.000 n=20)
    64/64/3/1      1.997µ ± 0%   1.525µ ± 0%  -23.63% (p=0.000 n=20)
    64/64/3/2      1.996µ ± 0%   1.523µ ± 0%  -23.71% (p=0.000 n=20)
    64/64/3/3      1.997µ ± 0%   1.523µ ± 0%  -23.78% (p=0.000 n=20)
    64/64/4/4      2.030µ ± 0%   1.522µ ± 0%  -24.99% (p=0.000 n=20)
    64/128/0/0     4.284µ ± 0%   3.347µ ± 0%  -21.86% (p=0.000 n=20)
    64/128/0/1     4.284µ ± 0%   3.349µ ± 0%  -21.82% (p=0.000 n=20)
    64/128/0/2     4.289µ ± 0%   3.347µ ± 0%  -21.97% (p=0.000 n=20)
    64/128/0/3     4.287µ ± 0%   3.347µ ± 0%  -21.92% (p=0.000 n=20)
    64/128/1/0     4.289µ ± 0%   3.345µ ± 0%  -22.00% (p=0.000 n=20)
    64/128/1/1     4.282µ ± 0%   3.355µ ± 0%  -21.65% (p=0.000 n=20)
    64/128/1/2     4.285µ ± 0%   3.355µ ± 0%  -21.71% (p=0.000 n=20)
    64/128/1/3     4.283µ ± 0%   3.356µ ± 0%  -21.65% (p=0.000 n=20)
    64/128/2/0     4.282µ ± 0%   3.350µ ± 0%  -21.76% (p=0.000 n=20)
    64/128/2/1     4.283µ ± 0%   3.359µ ± 0%  -21.58% (p=0.000 n=20)
    64/128/2/2     4.283µ ± 0%   3.359µ ± 1%  -21.57% (p=0.000 n=20)
    64/128/2/3     4.283µ ± 0%   3.353µ ± 0%  -21.70% (p=0.000 n=20)
    64/128/3/0     4.283µ ± 0%   3.349µ ± 1%  -21.81% (p=0.000 n=20)
    64/128/3/1     4.281µ ± 0%   3.352µ ± 0%  -21.71% (p=0.000 n=20)
    64/128/3/2     4.283µ ± 0%   3.353µ ± 0%  -21.71% (p=0.000 n=20)
    64/128/3/3     4.281µ ± 0%   3.352µ ± 0%  -21.71% (p=0.000 n=20)
    64/128/4/4     4.314µ ± 0%   3.348µ ± 0%  -22.39% (p=0.000 n=20)
    128/64/0/0     4.046µ ± 1%   3.104µ ± 0%  -23.29% (p=0.000 n=20)
    128/64/0/1     4.049µ ± 0%   3.103µ ± 0%  -23.37% (p=0.000 n=20)
    128/64/0/2     4.048µ ± 0%   3.105µ ± 0%  -23.31% (p=0.000 n=20)
    128/64/0/3     4.050µ ± 1%   3.105µ ± 0%  -23.31% (p=0.000 n=20)
    128/64/1/0     4.048µ ± 1%   3.107µ ± 0%  -23.25% (p=0.000 n=20)
    128/64/1/1     4.045µ ± 0%   3.107µ ± 0%  -23.20% (p=0.000 n=20)
    128/64/1/2     4.049µ ± 0%   3.107µ ± 0%  -23.27% (p=0.000 n=20)
    128/64/1/3     4.049µ ± 1%   3.107µ ± 0%  -23.28% (p=0.000 n=20)
    128/64/2/0     4.050µ ± 0%   3.107µ ± 0%  -23.27% (p=0.000 n=20)
    128/64/2/1     4.049µ ± 0%   3.110µ ± 0%  -23.21% (p=0.000 n=20)
    128/64/2/2     4.049µ ± 0%   3.109µ ± 0%  -23.21% (p=0.000 n=20)
    128/64/2/3     4.048µ ± 0%   3.104µ ± 0%  -23.31% (p=0.000 n=20)
    128/64/3/0     4.049µ ± 1%   3.106µ ± 0%  -23.28% (p=0.000 n=20)
    128/64/3/1     4.048µ ± 0%   3.104µ ± 1%  -23.32% (p=0.000 n=20)
    128/64/3/2     4.050µ ± 0%   3.106µ ± 0%  -23.30% (p=0.000 n=20)
    128/64/3/3     4.049µ ± 0%   3.104µ ± 0%  -23.34% (p=0.000 n=20)
    128/64/4/4     4.117µ ± 0%   3.105µ ± 1%  -24.59% (p=0.000 n=20)
    128/128/0/0    8.866µ ± 0%   6.868µ ± 0%  -22.54% (p=0.000 n=20)
    128/128/0/1    8.868µ ± 0%   6.882µ ± 1%  -22.40% (p=0.000 n=20)
    128/128/0/2    8.863µ ± 0%   6.863µ ± 1%  -22.56% (p=0.000 n=20)
    128/128/0/3    8.860µ ± 0%   6.884µ ± 0%  -22.31% (p=0.000 n=20)
    128/128/1/0    8.868µ ± 0%   6.869µ ± 1%  -22.54% (p=0.000 n=20)
    128/128/1/1    8.863µ ± 1%   6.857µ ± 0%  -22.63% (p=0.000 n=20)
    128/128/1/2    8.879µ ± 0%   6.905µ ± 1%  -22.23% (p=0.000 n=20)
    128/128/1/3    8.875µ ± 0%   6.870µ ± 0%  -22.59% (p=0.000 n=20)
    128/128/2/0    8.867µ ± 1%   6.876µ ± 1%  -22.44% (p=0.000 n=20)
    128/128/2/1    8.867µ ± 0%   6.868µ ± 1%  -22.55% (p=0.000 n=20)
    128/128/2/2    8.859µ ± 0%   6.870µ ± 1%  -22.45% (p=0.000 n=20)
    128/128/2/3    8.873µ ± 0%   6.862µ ± 1%  -22.66% (p=0.000 n=20)
    128/128/3/0    8.856µ ± 0%   6.864µ ± 0%  -22.49% (p=0.000 n=20)
    128/128/3/1    8.858µ ± 0%   6.853µ ± 0%  -22.64% (p=0.000 n=20)
    128/128/3/2    8.871µ ± 0%   6.859µ ± 1%  -22.69% (p=0.000 n=20)
    128/128/3/3    8.869µ ± 0%   6.881µ ± 1%  -22.41% (p=0.000 n=20)
    128/128/4/4    9.319µ ± 0%   6.865µ ± 0%  -26.33% (p=0.000 n=20)
    4/16/0/0       82.12n ± 0%   57.29n ± 0%  -30.24% (p=0.000 n=20)
    4/16/0/1       82.14n ± 0%   57.08n ± 0%  -30.51% (p=0.000 n=20)
    4/16/0/2       82.24n ± 0%   57.30n ± 0%  -30.32% (p=0.000 n=20)
    4/16/0/3       82.01n ± 0%   57.25n ± 0%  -30.20% (p=0.000 n=20)
    4/16/1/0       82.15n ± 0%   57.45n ± 0%  -30.06% (p=0.000 n=20)
    4/16/1/1       82.07n ± 0%   57.24n ± 0%  -30.26% (p=0.000 n=20)
    4/16/1/2       81.97n ± 0%   57.26n ± 0%  -30.14% (p=0.000 n=20)
    4/16/1/3       82.20n ± 0%   57.29n ± 1%  -30.30% (p=0.000 n=20)
    4/16/2/0       82.06n ± 0%   57.35n ± 1%  -30.11% (p=0.000 n=20)
    4/16/2/1       82.12n ± 0%   57.28n ± 0%  -30.25% (p=0.000 n=20)
    4/16/2/2       82.22n ± 0%   57.24n ± 0%  -30.39% (p=0.000 n=20)
    4/16/2/3       82.09n ± 0%   57.20n ± 0%  -30.31% (p=0.000 n=20)
    4/16/3/0       82.10n ± 0%   57.17n ± 1%  -30.36% (p=0.000 n=20)
    4/16/3/1       82.20n ± 0%   57.19n ± 1%  -30.42% (p=0.000 n=20)
    4/16/3/2       82.18n ± 0%   57.20n ± 0%  -30.40% (p=0.000 n=20)
    4/16/3/3       82.16n ± 0%   57.02n ± 1%  -30.60% (p=0.000 n=20)
    4/16/4/4       85.34n ± 0%   57.45n ± 1%  -32.68% (p=0.000 n=20)
    16/4/0/0       94.63n ± 0%   72.85n ± 1%  -23.01% (p=0.000 n=20)
    16/4/0/1       94.69n ± 0%   72.59n ± 0%  -23.34% (p=0.000 n=20)
    16/4/0/2       94.87n ± 0%   72.66n ± 1%  -23.42% (p=0.000 n=20)
    16/4/0/3       94.71n ± 0%   72.67n ± 1%  -23.27% (p=0.000 n=20)
    16/4/1/0       94.51n ± 0%   72.91n ± 0%  -22.85% (p=0.000 n=20)
    16/4/1/1       94.39n ± 0%   72.58n ± 0%  -23.10% (p=0.000 n=20)
    16/4/1/2       94.73n ± 0%   72.61n ± 0%  -23.35% (p=0.000 n=20)
    16/4/1/3       94.55n ± 1%   72.62n ± 1%  -23.19% (p=0.000 n=20)
    16/4/2/0       94.54n ± 0%   72.91n ± 1%  -22.87% (p=0.000 n=20)
    16/4/2/1       94.75n ± 0%   72.82n ± 0%  -23.15% (p=0.000 n=20)
    16/4/2/2       94.73n ± 0%   72.59n ± 1%  -23.37% (p=0.000 n=20)
    16/4/2/3       94.70n ± 0%   72.85n ± 1%  -23.07% (p=0.000 n=20)
    16/4/3/0       94.63n ± 0%   72.56n ± 0%  -23.32% (p=0.000 n=20)
    16/4/3/1       94.86n ± 0%   72.79n ± 0%  -23.27% (p=0.000 n=20)
    16/4/3/2       94.81n ± 0%   72.66n ± 1%  -23.36% (p=0.000 n=20)
    16/4/3/3       94.60n ± 0%   72.61n ± 1%  -23.24% (p=0.000 n=20)
    16/4/4/4       93.88n ± 0%   72.65n ± 1%  -22.61% (p=0.000 n=20)
    8/32/0/0       147.0n ± 0%   110.2n ± 0%  -25.06% (p=0.000 n=20)
    8/32/0/1       146.9n ± 0%   110.4n ± 0%  -24.86% (p=0.000 n=20)
    8/32/0/2       147.0n ± 0%   110.1n ± 0%  -25.12% (p=0.000 n=20)
    8/32/0/3       146.8n ± 0%   110.1n ± 0%  -25.01% (p=0.000 n=20)
    8/32/1/0       147.0n ± 0%   110.9n ± 1%  -24.59% (p=0.000 n=20)
    8/32/1/1       147.0n ± 0%   110.6n ± 1%  -24.79% (p=0.000 n=20)
    8/32/1/2       147.0n ± 0%   111.2n ± 1%  -24.36% (p=0.000 n=20)
    8/32/1/3       146.9n ± 0%   111.0n ± 1%  -24.43% (p=0.000 n=20)
    8/32/2/0       147.1n ± 0%   109.9n ± 0%  -25.26% (p=0.000 n=20)
    8/32/2/1       147.0n ± 0%   110.4n ± 0%  -24.89% (p=0.000 n=20)
    8/32/2/2       147.2n ± 0%   109.7n ± 0%  -25.50% (p=0.000 n=20)
    8/32/2/3       146.9n ± 0%   109.8n ± 0%  -25.30% (p=0.000 n=20)
    8/32/3/0       146.8n ± 0%   109.8n ± 0%  -25.19% (p=0.000 n=20)
    8/32/3/1       147.0n ± 0%   110.3n ± 0%  -24.93% (p=0.000 n=20)
    8/32/3/2       147.2n ± 0%   109.8n ± 0%  -25.42% (p=0.000 n=20)
    8/32/3/3       147.1n ± 0%   109.7n ± 0%  -25.43% (p=0.000 n=20)
    8/32/4/4       150.5n ± 0%   110.0n ± 0%  -26.94% (p=0.000 n=20)
    32/8/0/0       215.3n ± 0%   166.5n ± 0%  -22.66% (p=0.000 n=20)
    32/8/0/1       215.8n ± 0%   167.1n ± 0%  -22.56% (p=0.000 n=20)
    32/8/0/2       215.7n ± 0%   167.1n ± 0%  -22.51% (p=0.000 n=20)
    32/8/0/3       215.0n ± 0%   167.1n ± 0%  -22.29% (p=0.000 n=20)
    32/8/1/0       215.0n ± 1%   167.4n ± 0%  -22.12% (p=0.000 n=20)
    32/8/1/1       215.4n ± 0%   166.8n ± 1%  -22.55% (p=0.000 n=20)
    32/8/1/2       215.5n ± 0%   167.0n ± 0%  -22.52% (p=0.000 n=20)
    32/8/1/3       215.4n ± 0%   167.2n ± 0%  -22.37% (p=0.000 n=20)
    32/8/2/0       215.1n ± 0%   166.7n ± 0%  -22.50% (p=0.000 n=20)
    32/8/2/1       215.3n ± 0%   166.9n ± 0%  -22.48% (p=0.000 n=20)
    32/8/2/2       215.6n ± 0%   167.4n ± 0%  -22.35% (p=0.000 n=20)
    32/8/2/3       215.6n ± 0%   167.2n ± 0%  -22.44% (p=0.000 n=20)
    32/8/3/0       215.4n ± 0%   166.9n ± 0%  -22.50% (p=0.000 n=20)
    32/8/3/1       215.1n ± 0%   167.0n ± 0%  -22.38% (p=0.000 n=20)
    32/8/3/2       215.5n ± 0%   166.8n ± 1%  -22.58% (p=0.000 n=20)
    32/8/3/3       215.2n ± 0%   167.3n ± 0%  -22.26% (p=0.000 n=20)
    32/8/4/4       214.5n ± 0%   166.6n ± 0%  -22.32% (p=0.000 n=20)
    16/64/0/0      524.3n ± 0%   389.8n ± 0%  -25.65% (p=0.000 n=20)
    16/64/0/1      523.4n ± 0%   390.6n ± 0%  -25.36% (p=0.000 n=20)
    16/64/0/2      523.5n ± 0%   389.8n ± 0%  -25.54% (p=0.000 n=20)
    16/64/0/3      523.5n ± 0%   390.1n ± 0%  -25.47% (p=0.000 n=20)
    16/64/1/0      523.6n ± 0%   390.1n ± 0%  -25.50% (p=0.000 n=20)
    16/64/1/1      523.5n ± 0%   390.3n ± 0%  -25.46% (p=0.000 n=20)
    16/64/1/2      523.2n ± 0%   390.5n ± 0%  -25.37% (p=0.000 n=20)
    16/64/1/3      523.4n ± 0%   390.6n ± 0%  -25.37% (p=0.000 n=20)
    16/64/2/0      523.7n ± 0%   390.2n ± 0%  -25.48% (p=0.000 n=20)
    16/64/2/1      523.2n ± 0%   390.2n ± 0%  -25.42% (p=0.000 n=20)
    16/64/2/2      522.8n ± 0%   390.2n ± 0%  -25.36% (p=0.000 n=20)
    16/64/2/3      523.7n ± 0%   390.0n ± 0%  -25.53% (p=0.000 n=20)
    16/64/3/0      523.1n ± 0%   390.3n ± 0%  -25.40% (p=0.000 n=20)
    16/64/3/1      523.6n ± 0%   390.5n ± 0%  -25.42% (p=0.000 n=20)
    16/64/3/2      524.2n ± 0%   390.2n ± 0%  -25.57% (p=0.000 n=20)
    16/64/3/3      523.1n ± 0%   390.1n ± 0%  -25.41% (p=0.000 n=20)
    16/64/4/4      529.6n ± 0%   389.2n ± 0%  -26.51% (p=0.000 n=20)
    64/16/0/0      611.0n ± 0%   447.8n ± 0%  -26.72% (p=0.000 n=20)
    64/16/0/1      612.0n ± 0%   450.2n ± 1%  -26.45% (p=0.000 n=20)
    64/16/0/2      614.9n ± 0%   449.4n ± 0%  -26.91% (p=0.000 n=20)
    64/16/0/3      610.5n ± 0%   448.8n ± 0%  -26.50% (p=0.000 n=20)
    64/16/1/0      611.3n ± 0%   450.4n ± 1%  -26.32% (p=0.000 n=20)
    64/16/1/1      611.5n ± 1%   450.1n ± 0%  -26.40% (p=0.000 n=20)
    64/16/1/2      612.5n ± 0%   450.9n ± 1%  -26.39% (p=0.000 n=20)
    64/16/1/3      611.0n ± 0%   450.6n ± 0%  -26.24% (p=0.000 n=20)
    64/16/2/0      613.4n ± 1%   448.9n ± 0%  -26.82% (p=0.000 n=20)
    64/16/2/1      612.3n ± 0%   448.8n ± 0%  -26.70% (p=0.000 n=20)
    64/16/2/2      612.6n ± 0%   448.7n ± 1%  -26.75% (p=0.000 n=20)
    64/16/2/3      612.9n ± 0%   450.8n ± 0%  -26.46% (p=0.000 n=20)
    64/16/3/0      611.6n ± 0%   449.2n ± 0%  -26.56% (p=0.000 n=20)
    64/16/3/1      611.9n ± 1%   448.9n ± 0%  -26.65% (p=0.000 n=20)
    64/16/3/2      611.2n ± 1%   448.5n ± 1%  -26.62% (p=0.000 n=20)
    64/16/3/3      612.6n ± 0%   448.3n ± 0%  -26.81% (p=0.000 n=20)
    64/16/4/4      632.0n ± 0%   449.0n ± 0%  -28.97% (p=0.000 n=20)

    Change-Id: Id9020151c4c4df5f3a2019ea2d8c1ee48329336c

diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index 1191a28a67..9e7b69e896 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -502,134 +502,108 @@ static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b,
     s[10] = s[11];                                                             \
   }

-#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP                        \
-  do {                                                                  \
-    for (i = 0; i < im_h; i += 2) {                                     \
-      __m256i data =                                                    \
-          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));    \
-      if (i + 1 < im_h)                                                 \
-        data = _mm256_inserti128_si256(                                 \
-            data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
-      src_h += (src_stride << 1);                                       \
-      __m256i res = convolve_lowbd_x(data, coeffs_x, filt);             \
-                                                                        \
-      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),      \
-                             round_shift_h);                            \
-                                                                        \
-      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
-    }                                                                   \
-  } while (0)
-
-#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP                                 \
+#define JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j_off)                       \
   do {                                                                         \
-    __m256i s[8];                                                              \
-    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));    \
-    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));    \
-    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));    \
-    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));    \
-    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));    \
-    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));    \
-                                                                               \
-    s[0] = _mm256_unpacklo_epi16(s0, s1);                                      \
-    s[1] = _mm256_unpacklo_epi16(s2, s3);                                      \
-    s[2] = _mm256_unpacklo_epi16(s4, s5);                                      \
-                                                                               \
-    s[4] = _mm256_unpackhi_epi16(s0, s1);                                      \
-    s[5] = _mm256_unpackhi_epi16(s2, s3);                                      \
-    s[6] = _mm256_unpackhi_epi16(s4, s5);                                      \
-                                                                               \
-    for (i = 0; i < h; i += 2) {                                               \
-      const int16_t *data = &im_block[i * im_stride];                          \
-                                                                               \
-      const __m256i s6 =                                                       \
-          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));               \
-      const __m256i s7 =                                                       \
-          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));               \
-                                                                               \
-      s[3] = _mm256_unpacklo_epi16(s6, s7);                                    \
-      s[7] = _mm256_unpackhi_epi16(s6, s7);                                    \
-                                                                               \
-      const __m256i res_a = convolve(s, coeffs_y);                             \
-      const __m256i res_a_round = _mm256_sra_epi32(                            \
-          _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
-                                                                               \
-      if (w - j > 4) {                                                         \
-        const __m256i res_b = convolve(s + 4, coeffs_y);                       \
-        const __m256i res_b_round = _mm256_sra_epi32(                          \
-            _mm256_add_epi32(res_b, round_const_v), round_shift_v);            \
-        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  \
-        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
-                                                                               \
-        if (do_average) {                                                      \
-          const __m256i data_ref_0 =                                           \
-              load_line2_avx2(&dst[i * dst_stride + j],                        \
-                              &dst[i * dst_stride + j + dst_stride]);          \
-          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
-                                                &wt, use_dist_wtd_comp_avg);   \
-                                                                               \
-          const __m256i round_result = convolve_rounding(                      \
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
-                                                                               \
-          const __m256i res_8 =                                                \
-              _mm256_packus_epi16(round_result, round_result);                 \
-          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
-          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
-                                                                               \
-          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);    \
-          _mm_storel_epi64(                                                    \
-              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
-        } else {                                                               \
-          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
-                                                                               \
-          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
-                          res_1);                                              \
-        }                                                                      \
+    if (do_average) { /* blend with rows already in dst; 8-bit out to dst0 */  \
+      const __m256i data_ref_0 =                                               \
+          load_line2_avx2(&dst[i * dst_stride + (j_off)],                      \
+                          &dst[i * dst_stride + (j_off) + dst_stride]);        \
+      const __m256i comp_avg_res =                                             \
+          comp_avg(&data_ref_0, &(res_unsigned), &wt, use_dist_wtd_comp_avg);  \
+      const __m256i res_signed = _mm256_sub_epi16(comp_avg_res, offset_const); \
+      const __m256i round_result = /* NOTE(review): shift fixed at 4 */        \
+          _mm256_srai_epi16(_mm256_add_epi16(res_signed, rounding_const), 4);  \
+      const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);   \
+      const __m128i res_0 = _mm256_castsi256_si128(res_8);                     \
+      const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);                \
+      if (w - (j_off) > 4) { /* 8 bytes per row, else 4 bytes */               \
+        _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + (j_off)]),        \
+                         res_0);                                               \
+        _mm_storel_epi64(                                                      \
+            (__m128i *)(&dst0[i * dst_stride0 + (j_off) + dst_stride0]),       \
+            res_1);                                                            \
       } else {                                                                 \
-        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);  \
-        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
-                                                                               \
-        if (do_average) {                                                      \
-          const __m256i data_ref_0 =                                           \
-              load_line2_avx2(&dst[i * dst_stride + j],                        \
-                              &dst[i * dst_stride + j + dst_stride]);          \
-                                                                               \
-          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
-                                                &wt, use_dist_wtd_comp_avg);   \
-                                                                               \
-          const __m256i round_result = convolve_rounding(                      \
-              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
-                                                                               \
-          const __m256i res_8 =                                                \
-              _mm256_packus_epi16(round_result, round_result);                 \
-          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
-          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
-                                                                               \
-          *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);     \
-          *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =                 \
-              _mm_cvtsi128_si32(res_1);                                        \
-                                                                               \
-        } else {                                                               \
-          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
-                                                                               \
-          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
-          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
-                          res_1);                                              \
-        }                                                                      \
+        *(int *)(&dst0[i * dst_stride0 + (j_off)]) = _mm_cvtsi128_si32(res_0); \
+        *(int *)(&dst0[i * dst_stride0 + (j_off) + dst_stride0]) =             \
+            _mm_cvtsi128_si32(res_1);                                          \
       }                                                                        \
-                                                                               \
-      s[0] = s[1];                                                             \
-      s[1] = s[2];                                                             \
-      s[2] = s[3];                                                             \
-                                                                               \
-      s[4] = s[5];                                                             \
-      s[5] = s[6];                                                             \
-      s[6] = s[7];                                                             \
+    } else { /* no averaging: store res_unsigned halves to dst, aligned */     \
+      const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);              \
+      _mm_store_si128((__m128i *)(&dst[i * dst_stride + (j_off)]), res_0);     \
+      const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);         \
+      _mm_store_si128(                                                         \
+          (__m128i *)(&dst[i * dst_stride + (j_off) + dst_stride]), res_1);    \
     }                                                                          \
   } while (0)

+#define JNT_CONVOLVE_HORIZONTAL_FILTER(src_h_start, convolve_fn, coeffs) \
+  do { /* Filter im_h source rows into im_block, two rows per pass. */   \
+    const uint8_t *src_h = (src_h_start);                                \
+    for (i = 0; i < im_h; i += 2) { /* row i+1 read even when == im_h */ \
+      const __m256i data = load_line2_avx2(src_h, src_h + src_stride);   \
+      src_h += (src_stride << 1);                                        \
+      __m256i res = convolve_fn(data, coeffs, filt);                     \
+      res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);  \
+      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);      \
+    } /* NOTE(review): srai by 2 presumes conv_params->round_0 == 3. */  \
+  } while (0)
+
+#define JNT_CONVOLVE_VERTICAL_FILTER_8TAP                                     \
+  do { /* 8-tap pass down im_block columns; 2 output rows per iteration. */   \
+    __m256i s[8];                                                             \
+    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));   \
+    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));   \
+    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));   \
+    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));   \
+    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));   \
+    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));   \
+    /* Interleave rows 0..5 pairwise: s[0..2] lo, s[4..6] hi. */              \
+    s[0] = _mm256_unpacklo_epi16(s0, s1);                                     \
+    s[1] = _mm256_unpacklo_epi16(s2, s3);                                     \
+    s[2] = _mm256_unpacklo_epi16(s4, s5);                                     \
+                                                                              \
+    s[4] = _mm256_unpackhi_epi16(s0, s1);                                     \
+    s[5] = _mm256_unpackhi_epi16(s2, s3);                                     \
+    s[6] = _mm256_unpackhi_epi16(s4, s5);                                     \
+    /* Uses i, j, w, h, im_block, im_stride, coeffs_y, ... from caller. */    \
+    for (i = 0; i < h; i += 2) {                                              \
+      const int16_t *data = &im_block[i * im_stride];                         \
+      /* Rows i+6 and i+7 complete the 8-row window. */                       \
+      const __m256i s6 =                                                      \
+          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));              \
+      const __m256i s7 =                                                      \
+          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));              \
+                                                                              \
+      s[3] = _mm256_unpacklo_epi16(s6, s7);                                   \
+      s[7] = _mm256_unpackhi_epi16(s6, s7);                                   \
+      /* NOTE(review): srai by 7 presumes conv_params->round_1 == 7. */       \
+      const __m256i res_a = convolve(s, coeffs_y);                            \
+      const __m256i res_a_round =                                             \
+          _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);       \
+                                                                              \
+      if (w - j > 4) { /* wide case: filter s[4..7] too */                    \
+        const __m256i res_b = convolve(s + 4, coeffs_y);                      \
+        const __m256i res_b_round =                                           \
+            _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);     \
+        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \
+        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+        JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j);                         \
+      } else { /* w - j <= 4: pack res_a with itself */                       \
+        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \
+        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+        JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j);                         \
+      }                                                                       \
+      /* Slide both windows forward by one row pair. */                       \
+      s[0] = s[1];                                                            \
+      s[1] = s[2];                                                            \
+      s[2] = s[3];                                                            \
+                                                                              \
+      s[4] = s[5];                                                            \
+      s[5] = s[6];                                                            \
+      s[6] = s[7];                                                            \
+    }                                                                         \
+  } while (0)
+
 static inline void prepare_coeffs_2t_ssse3(
     const InterpFilterParams *const filter_params, const int32_t subpel_q4,
     __m128i *const coeffs /* [4] */) {
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
index 925fe47cf5..e3ac6d466d 100644
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -612,14 +612,13 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,

   const __m256i round_const_h = _mm256_set1_epi16(
       ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
-  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
   const __m256i round_const_v = _mm256_set1_epi32(
       ((1 << conv_params->round_1) >> 1) -
       (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
-  const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);

-  __m256i filt[4], coeffs_x[4], coeffs_y[4];
+  DECLARE_ALIGNED(32, __m256i, filt[4]);
+  DECLARE_ALIGNED(32, __m256i, coeffs_x[4]);
+  DECLARE_ALIGNED(32, __m256i, coeffs_y[4]);

   filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
   filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
@@ -635,32 +634,425 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
   if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
     is_vert_4tap = 1;

-  if (is_horiz_4tap) {
-    int im_h = h + filter_params_y->taps - 1;
+  if (is_horiz_4tap && is_vert_4tap) {
+    int im_h = h + 4;
+    const int fo_vert = 1;
+    const int fo_horiz = 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+    if (w > 4) {
+      if (do_average) {
+        if (use_dist_wtd_comp_avg) {
+          const __m256i comp_const = _mm256_set1_epi32(-98176);
+          for (int j = 0; j < w; j += 8) {
+            JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
+                                           coeffs_x + 1);
+
+            /* Vertical filter */
+            __m256i s[6];
+            __m256i s0 =
+                _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+            __m256i s1 =
+                _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+            __m256i s2 =
+                _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+            __m256i s3 =
+                _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+            s[0] = _mm256_unpacklo_epi16(s0, s1);
+            s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+            s[3] = _mm256_unpackhi_epi16(s0, s1);
+            s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+            for (i = 0; i < h; i += 2) {
+              const int16_t *data = &im_block[i * im_stride];
+
+              const __m256i s4 =
+                  _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+              const __m256i s5 =
+                  _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+              s[2] = _mm256_unpacklo_epi16(s4, s5);
+              s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+              const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+              const __m256i res_b_1 = _mm256_madd_epi16(s[3], coeffs_y[1]);
+              const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+              const __m256i res_b_2 = _mm256_madd_epi16(s[4], coeffs_y[2]);
+              const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+              const __m256i res_b = _mm256_add_epi32(res_b_1, res_b_2);
+
+              const __m256i res_a_round =
+                  _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+              const __m256i res_b_round =
+                  _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);
+              const __m256i res_16b =
+                  _mm256_packs_epi32(res_a_round, res_b_round);
+              const __m256i res_unsigned =
+                  _mm256_add_epi16(res_16b, offset_const);
+
+              const __m256i data_ref_0 =
+                  load_line2_avx2(&dst[i * dst_stride + j],
+                                  &dst[i * dst_stride + j + dst_stride]);
+
+              const __m256i data_lo =
+                  _mm256_unpacklo_epi16(data_ref_0, res_unsigned);
+              const __m256i data_hi =
+                  _mm256_unpackhi_epi16(data_ref_0, res_unsigned);
+
+              const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, wt);
+              const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, wt);
+
+              const __m256i add_lo = _mm256_add_epi32(wt_res_lo, comp_const);
+              const __m256i add_hi = _mm256_add_epi32(wt_res_hi, comp_const);
+              const __m256i fused_lo = _mm256_srai_epi32(add_lo, 8);
+              const __m256i fused_hi = _mm256_srai_epi32(add_hi, 8);
+
+              const __m256i round_result =
+                  _mm256_packs_epi32(fused_lo, fused_hi);
+
+              const __m256i res_8 =
+                  _mm256_packus_epi16(round_result, round_result);
+              const __m128i res_0 = _mm256_castsi256_si128(res_8);
+              const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+              _mm_storel_epi64(
+                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+                  res_1);
+
+              s[0] = s[1];
+              s[1] = s[2];
+              s[3] = s[4];
+              s[4] = s[5];
+            }
+          }
+        } else {
+          for (int j = 0; j < w; j += 8) {
+            JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
+                                           coeffs_x + 1);
+
+            /* Vertical filter */
+            __m256i s[6];
+            __m256i s0 =
+                _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+            __m256i s1 =
+                _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+            __m256i s2 =
+                _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+            __m256i s3 =
+                _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+            s[0] = _mm256_unpacklo_epi16(s0, s1);
+            s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+            s[3] = _mm256_unpackhi_epi16(s0, s1);
+            s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+            for (i = 0; i < h; i += 2) {
+              const int16_t *data = &im_block[i * im_stride];
+
+              const __m256i s4 =
+                  _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+              const __m256i s5 =
+                  _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+              s[2] = _mm256_unpacklo_epi16(s4, s5);
+              s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+              const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+              const __m256i res_b_1 = _mm256_madd_epi16(s[3], coeffs_y[1]);
+              const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+              const __m256i res_b_2 = _mm256_madd_epi16(s[4], coeffs_y[2]);
+              const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+              const __m256i res_b = _mm256_add_epi32(res_b_1, res_b_2);
+
+              const __m256i res_a_round =
+                  _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+              const __m256i res_b_round =
+                  _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);
+              const __m256i res_16b =
+                  _mm256_packs_epi32(res_a_round, res_b_round);
+              const __m256i res_unsigned =
+                  _mm256_add_epi16(res_16b, offset_const);
+
+              const __m256i data_ref_0 =
+                  load_line2_avx2(&dst[i * dst_stride + j],
+                                  &dst[i * dst_stride + j + dst_stride]);
+              const __m256i wt_res = _mm256_add_epi16(data_ref_0, res_unsigned);
+              const __m256i comp_avg_res = _mm256_srai_epi16(wt_res, 1);
+
+              const __m256i res_signed =
+                  _mm256_sub_epi16(comp_avg_res, offset_const);
+              const __m256i round_result = _mm256_srai_epi16(
+                  _mm256_add_epi16(res_signed, rounding_const), 4);
+
+              const __m256i res_8 =
+                  _mm256_packus_epi16(round_result, round_result);
+              const __m128i res_0 = _mm256_castsi256_si128(res_8);
+              const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+              _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+              _mm_storel_epi64(
+                  (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+                  res_1);
+
+              s[0] = s[1];
+              s[1] = s[2];
+              s[3] = s[4];
+              s[4] = s[5];
+            }
+          }
+        }
+      } else {
+        for (int j = 0; j < w; j += 8) {
+          JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
+                                         coeffs_x + 1);
+
+          /* Vertical filter */
+          __m256i s[6];
+          __m256i s0 =
+              _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+          __m256i s1 =
+              _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+          __m256i s2 =
+              _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+          __m256i s3 =
+              _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+          s[0] = _mm256_unpacklo_epi16(s0, s1);
+          s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+          s[3] = _mm256_unpackhi_epi16(s0, s1);
+          s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+          for (i = 0; i < h; i += 2) {
+            const int16_t *data = &im_block[i * im_stride];
+
+            const __m256i s4 =
+                _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+            const __m256i s5 =
+                _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+            s[2] = _mm256_unpacklo_epi16(s4, s5);
+            s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+            const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+            const __m256i res_b_1 = _mm256_madd_epi16(s[3], coeffs_y[1]);
+            const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+            const __m256i res_b_2 = _mm256_madd_epi16(s[4], coeffs_y[2]);
+            const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+            const __m256i res_b = _mm256_add_epi32(res_b_1, res_b_2);
+
+            const __m256i res_a_round =
+                _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+            const __m256i res_b_round =
+                _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);
+            const __m256i res_16b =
+                _mm256_packs_epi32(res_a_round, res_b_round);
+            const __m256i res_unsigned =
+                _mm256_add_epi16(res_16b, offset_const);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+                            res_1);
+
+            s[0] = s[1];
+            s[1] = s[2];
+            s[3] = s[4];
+            s[4] = s[5];
+          }
+        }
+      }
+    } else {
+      if (do_average) {
+        if (use_dist_wtd_comp_avg) {
+          const __m256i comp_const = _mm256_set1_epi32(-98176);
+          JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr, convolve_lowbd_x_4tap,
+                                         coeffs_x + 1);
+
+          /* Vertical filter */
+          __m256i s[3];
+          __m256i s0 =
+              _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+          __m256i s1 =
+              _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+          __m256i s2 =
+              _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+          __m256i s3 =
+              _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+          s[0] = _mm256_unpacklo_epi16(s0, s1);
+          s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+          for (i = 0; i < h; i += 2) {
+            const int16_t *data = &im_block[i * im_stride];
+
+            const __m256i s4 =
+                _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+            const __m256i s5 =
+                _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+            s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+            const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+            const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+            const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+
+            const __m256i res_a_round =
+                _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+            const __m256i res_16b =
+                _mm256_packs_epi32(res_a_round, res_a_round);
+            const __m256i res_unsigned =
+                _mm256_add_epi16(res_16b, offset_const);
+
+            const __m256i data_ref_0 = load_line2_avx2(
+                &dst[i * dst_stride], &dst[i * dst_stride + dst_stride]);
+
+            const __m256i data_lo =
+                _mm256_unpacklo_epi16(data_ref_0, res_unsigned);
+
+            const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, wt);
+
+            const __m256i fused_lo =
+                _mm256_srai_epi32(_mm256_add_epi32(wt_res_lo, comp_const), 8);
+
+            const __m256i round_result = _mm256_packs_epi32(fused_lo, fused_lo);
+
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result, round_result);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+            *(int *)(&dst0[i * dst_stride0]) = _mm_cvtsi128_si32(res_0);
+            *(int *)(&dst0[i * dst_stride0 + dst_stride0]) =
+                _mm_cvtsi128_si32(res_1);
+
+            s[0] = s[1];
+            s[1] = s[2];
+          }
+        } else {
+          JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr, convolve_lowbd_x_4tap,
+                                         coeffs_x + 1);
+
+          /* Vertical filter */
+          __m256i s[3];
+          __m256i s0 =
+              _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+          __m256i s1 =
+              _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+          __m256i s2 =
+              _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+          __m256i s3 =
+              _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+          s[0] = _mm256_unpacklo_epi16(s0, s1);
+          s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+          for (i = 0; i < h; i += 2) {
+            const int16_t *data = &im_block[i * im_stride];
+
+            const __m256i s4 =
+                _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+            const __m256i s5 =
+                _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+            s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+            const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+            const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+            const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+
+            const __m256i res_a_round =
+                _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+            const __m256i res_16b =
+                _mm256_packs_epi32(res_a_round, res_a_round);
+            const __m256i res_unsigned =
+                _mm256_add_epi16(res_16b, offset_const);
+
+            const __m256i data_ref_0 = load_line2_avx2(
+                &dst[i * dst_stride], &dst[i * dst_stride + dst_stride]);
+            const __m256i wt_res = _mm256_add_epi16(data_ref_0, res_unsigned);
+            const __m256i comp_avg_res = _mm256_srai_epi16(wt_res, 1);
+
+            const __m256i res_signed =
+                _mm256_sub_epi16(comp_avg_res, offset_const);
+            const __m256i round_result = _mm256_srai_epi16(
+                _mm256_add_epi16(res_signed, rounding_const), 4);
+
+            const __m256i res_8 =
+                _mm256_packus_epi16(round_result, round_result);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+            *(int *)(&dst0[i * dst_stride0]) = _mm_cvtsi128_si32(res_0);
+            *(int *)(&dst0[i * dst_stride0 + dst_stride0]) =
+                _mm_cvtsi128_si32(res_1);
+
+            s[0] = s[1];
+            s[1] = s[2];
+          }
+        }
+      } else {
+        JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr, convolve_lowbd_x_4tap,
+                                       coeffs_x + 1);
+
+        /* Vertical filter */
+        __m256i s[3];
+        __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+        __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+        __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+        __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+        s[0] = _mm256_unpacklo_epi16(s0, s1);
+        s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+        for (i = 0; i < h; i += 2) {
+          const int16_t *data = &im_block[i * im_stride];
+
+          const __m256i s4 =
+              _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+          const __m256i s5 =
+              _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+          s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+          const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+          const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+          const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+
+          const __m256i res_a_round =
+              _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+          const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+          const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride]), res_0);
+
+          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+          _mm_store_si128((__m128i *)(&dst[i * dst_stride + dst_stride]),
+                          res_1);
+
+          s[0] = s[1];
+          s[1] = s[2];
+        }
+      }
+    }
+  } else if (is_horiz_4tap) {
+    int im_h = h + 8;
     const int fo_vert = filter_params_y->taps / 2 - 1;
     const int fo_horiz = 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
     for (int j = 0; j < w; j += 8) {
-      /* Horizontal filter */
-      const uint8_t *src_h = src_ptr + j;
-      for (i = 0; i < im_h; i += 2) {
-        __m256i data =
-            _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
-        if (i + 1 < im_h)
-          data = _mm256_inserti128_si256(
-              data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
-        src_h += (src_stride << 1);
-        __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
-
-        res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
-                               round_shift_h);
-
-        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
-      }
-      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+      JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
+                                     coeffs_x + 1);
+      JNT_CONVOLVE_VERTICAL_FILTER_8TAP;
     }
   } else if (is_vert_4tap) {
-    int im_h = h + 3;
+    int im_h = h + 4;
     const int fo_vert = 1;
     const int fo_horiz = filter_params_x->taps / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
@@ -669,9 +1061,7 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
     filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

     for (int j = 0; j < w; j += 8) {
-      /* Horizontal filter */
-      const uint8_t *src_h = src_ptr + j;
-      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+      JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x, coeffs_x);

       /* Vertical filter */
       __m256i s[6];
@@ -698,74 +1088,22 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
         s[5] = _mm256_unpackhi_epi16(s4, s5);

         const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
-        const __m256i res_a_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+        const __m256i res_a_round =
+            _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);

         if (w - j > 4) {
           const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
-          const __m256i res_b_round = _mm256_sra_epi32(
-              _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+          const __m256i res_b_round =
+              _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);
           const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
           const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);

-          if (do_average) {
-            const __m256i data_ref_0 =
-                load_line2_avx2(&dst[i * dst_stride + j],
-                                &dst[i * dst_stride + j + dst_stride]);
-            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
-                                                  &wt, use_dist_wtd_comp_avg);
-
-            const __m256i round_result = convolve_rounding(
-                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-            const __m256i res_8 =
-                _mm256_packus_epi16(round_result, round_result);
-            const __m128i res_0 = _mm256_castsi256_si128(res_8);
-            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
-            _mm_storel_epi64(
-                (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
-          } else {
-            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-
-            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
-          }
+          JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j);
         } else {
           const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
           const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);

-          if (do_average) {
-            const __m256i data_ref_0 =
-                load_line2_avx2(&dst[i * dst_stride + j],
-                                &dst[i * dst_stride + j + dst_stride]);
-
-            const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
-                                                  &wt, use_dist_wtd_comp_avg);
-
-            const __m256i round_result = convolve_rounding(
-                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
-            const __m256i res_8 =
-                _mm256_packus_epi16(round_result, round_result);
-            const __m128i res_0 = _mm256_castsi256_si128(res_8);
-            const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
-            *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
-            *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
-                _mm_cvtsi128_si32(res_1);
-
-          } else {
-            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-
-            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
-            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
-                            res_1);
-          }
+          JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j);
         }
         s[0] = s[1];
         s[1] = s[2];
@@ -774,7 +1112,7 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
       }
     }
   } else {
-    int im_h = h + filter_params_y->taps - 1;
+    int im_h = h + 8;
     const int fo_vert = filter_params_y->taps / 2 - 1;
     const int fo_horiz = filter_params_x->taps / 2 - 1;
     const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
@@ -783,11 +1121,9 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
     filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

     for (int j = 0; j < w; j += 8) {
-      /* Horizontal filter */
-      const uint8_t *src_h = src_ptr + j;
-      DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+      JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x, coeffs_x);

-      DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+      JNT_CONVOLVE_VERTICAL_FILTER_8TAP;
     }
   }
 }