Commit 3360063d11 for aom
commit 3360063d11607252a5cc72b5cd714ce6460f8863
Author: Jeremy Dorfman <jdorfman@google.com>
Date: Thu May 14 13:18:04 2026 -0400
[convolve] Optimize av1_dist_wtd_convolve_2d_avx2
This change optimizes the AVX2 implementation of joint 2D convolution by
introducing a dedicated code path for combined 4-tap horizontal and
vertical filtering. It ends up around 24% faster than the previous
implementation.
BlockWidth/BlockHeight/HorizontalFilter/VerticalFilter
Baseline CPU New CPU Improvement
4/4/0/0 54.80n ± 1% 38.74n ± 1% -29.31% (p=0.000 n=20)
4/4/0/1 54.85n ± 0% 38.40n ± 1% -29.99% (p=0.000 n=20)
4/4/0/2 54.79n ± 0% 38.41n ± 1% -29.89% (p=0.000 n=20)
4/4/0/3 54.74n ± 0% 38.56n ± 1% -29.56% (p=0.000 n=20)
4/4/1/0 54.66n ± 0% 38.57n ± 1% -29.44% (p=0.000 n=20)
4/4/1/1 54.73n ± 0% 38.44n ± 1% -29.77% (p=0.000 n=20)
4/4/1/2 54.60n ± 1% 38.46n ± 1% -29.56% (p=0.000 n=20)
4/4/1/3 54.68n ± 0% 38.40n ± 1% -29.78% (p=0.000 n=20)
4/4/2/0 54.88n ± 1% 38.53n ± 1% -29.80% (p=0.000 n=20)
4/4/2/1 54.79n ± 0% 38.36n ± 1% -29.98% (p=0.000 n=20)
4/4/2/2 54.83n ± 0% 38.42n ± 1% -29.92% (p=0.000 n=20)
4/4/2/3 54.69n ± 0% 38.46n ± 1% -29.68% (p=0.000 n=20)
4/4/3/0 54.82n ± 0% 38.44n ± 1% -29.88% (p=0.000 n=20)
4/4/3/1 54.81n ± 1% 38.40n ± 0% -29.95% (p=0.000 n=20)
4/4/3/2 54.71n ± 1% 38.65n ± 1% -29.35% (p=0.000 n=20)
4/4/3/3 54.72n ± 0% 38.24n ± 1% -30.11% (p=0.000 n=20)
4/4/4/4 54.45n ± 0% 38.49n ± 1% -29.31% (p=0.000 n=20)
4/8/0/0 62.69n ± 1% 45.15n ± 1% -27.97% (p=0.000 n=20)
4/8/0/1 62.61n ± 0% 45.19n ± 1% -27.83% (p=0.000 n=20)
4/8/0/2 62.62n ± 0% 45.18n ± 1% -27.85% (p=0.000 n=20)
4/8/0/3 62.52n ± 0% 45.14n ± 1% -27.80% (p=0.000 n=20)
4/8/1/0 62.67n ± 1% 45.01n ± 1% -28.18% (p=0.000 n=20)
4/8/1/1 62.54n ± 1% 44.99n ± 1% -28.06% (p=0.000 n=20)
4/8/1/2 62.77n ± 0% 45.32n ± 1% -27.81% (p=0.000 n=20)
4/8/1/3 62.71n ± 0% 45.12n ± 0% -28.05% (p=0.000 n=20)
4/8/2/0 62.57n ± 0% 45.25n ± 1% -27.68% (p=0.000 n=20)
4/8/2/1 62.81n ± 0% 45.34n ± 1% -27.82% (p=0.000 n=20)
4/8/2/2 62.74n ± 0% 45.13n ± 0% -28.07% (p=0.000 n=20)
4/8/2/3 62.82n ± 0% 44.99n ± 1% -28.39% (p=0.000 n=20)
4/8/3/0 62.69n ± 1% 45.21n ± 0% -27.89% (p=0.000 n=20)
4/8/3/1 62.68n ± 0% 45.12n ± 1% -28.02% (p=0.000 n=20)
4/8/3/2 62.67n ± 0% 45.16n ± 1% -27.94% (p=0.000 n=20)
4/8/3/3 62.71n ± 1% 45.18n ± 1% -27.95% (p=0.000 n=20)
4/8/4/4 62.13n ± 1% 45.19n ± 0% -27.26% (p=0.000 n=20)
8/4/0/0 56.76n ± 0% 41.83n ± 0% -26.29% (p=0.000 n=20)
8/4/0/1 56.53n ± 0% 41.86n ± 1% -25.96% (p=0.000 n=20)
8/4/0/2 56.66n ± 0% 41.89n ± 0% -26.07% (p=0.000 n=20)
8/4/0/3 56.72n ± 0% 41.91n ± 1% -26.12% (p=0.000 n=20)
8/4/1/0 56.56n ± 0% 41.81n ± 1% -26.08% (p=0.000 n=20)
8/4/1/1 56.46n ± 0% 41.80n ± 1% -25.97% (p=0.000 n=20)
8/4/1/2 56.68n ± 0% 41.83n ± 0% -26.19% (p=0.000 n=20)
8/4/1/3 56.65n ± 0% 41.88n ± 0% -26.07% (p=0.000 n=20)
8/4/2/0 56.65n ± 0% 41.74n ± 1% -26.31% (p=0.000 n=20)
8/4/2/1 56.59n ± 0% 41.84n ± 3% -26.05% (p=0.000 n=20)
8/4/2/2 56.55n ± 0% 41.95n ± 1% -25.81% (p=0.000 n=20)
8/4/2/3 56.68n ± 0% 41.73n ± 0% -26.37% (p=0.000 n=20)
8/4/3/0 56.73n ± 0% 41.96n ± 0% -26.05% (p=0.000 n=20)
8/4/3/1 56.76n ± 0% 41.73n ± 0% -26.47% (p=0.000 n=20)
8/4/3/2 56.67n ± 0% 42.02n ± 1% -25.85% (p=0.000 n=20)
8/4/3/3 56.67n ± 0% 41.77n ± 0% -26.29% (p=0.000 n=20)
8/4/4/4 57.30n ± 0% 41.73n ± 0% -27.17% (p=0.000 n=20)
8/8/0/0 67.99n ± 0% 51.12n ± 1% -24.81% (p=0.000 n=20)
8/8/0/1 68.08n ± 0% 50.93n ± 0% -25.20% (p=0.000 n=20)
8/8/0/2 67.95n ± 0% 51.00n ± 1% -24.94% (p=0.000 n=20)
8/8/0/3 67.97n ± 1% 50.93n ± 0% -25.07% (p=0.000 n=20)
8/8/1/0 67.96n ± 0% 51.17n ± 0% -24.71% (p=0.000 n=20)
8/8/1/1 67.81n ± 0% 51.08n ± 0% -24.67% (p=0.000 n=20)
8/8/1/2 67.88n ± 1% 51.15n ± 0% -24.66% (p=0.000 n=20)
8/8/1/3 67.84n ± 1% 51.26n ± 0% -24.44% (p=0.000 n=20)
8/8/2/0 67.85n ± 0% 50.92n ± 0% -24.96% (p=0.000 n=20)
8/8/2/1 68.05n ± 1% 50.94n ± 0% -25.14% (p=0.000 n=20)
8/8/2/2 67.87n ± 1% 50.89n ± 1% -25.02% (p=0.000 n=20)
8/8/2/3 67.99n ± 0% 50.91n ± 0% -25.13% (p=0.000 n=20)
8/8/3/0 67.98n ± 1% 50.93n ± 1% -25.07% (p=0.000 n=20)
8/8/3/1 68.23n ± 1% 51.10n ± 1% -25.11% (p=0.000 n=20)
8/8/3/2 67.91n ± 1% 50.91n ± 0% -25.03% (p=0.000 n=20)
8/8/3/3 68.06n ± 0% 50.97n ± 0% -25.11% (p=0.000 n=20)
8/8/4/4 68.03n ± 1% 51.05n ± 1% -24.96% (p=0.000 n=20)
8/16/0/0 92.77n ± 0% 69.84n ± 1% -24.72% (p=0.000 n=20)
8/16/0/1 93.02n ± 0% 70.29n ± 1% -24.44% (p=0.000 n=20)
8/16/0/2 92.83n ± 0% 70.15n ± 2% -24.43% (p=0.000 n=20)
8/16/0/3 92.77n ± 1% 69.99n ± 1% -24.55% (p=0.000 n=20)
8/16/1/0 92.68n ± 1% 72.09n ± 2% -22.21% (p=0.000 n=20)
8/16/1/1 92.77n ± 0% 71.12n ± 2% -23.34% (p=0.000 n=20)
8/16/1/2 92.56n ± 0% 72.38n ± 3% -21.80% (p=0.000 n=20)
8/16/1/3 92.78n ± 1% 71.17n ± 2% -23.29% (p=0.000 n=20)
8/16/2/0 92.82n ± 0% 69.98n ± 1% -24.60% (p=0.000 n=20)
8/16/2/1 92.70n ± 0% 70.20n ± 1% -24.27% (p=0.000 n=20)
8/16/2/2 92.56n ± 0% 70.02n ± 1% -24.35% (p=0.000 n=20)
8/16/2/3 92.85n ± 0% 70.16n ± 0% -24.44% (p=0.000 n=20)
8/16/3/0 92.70n ± 0% 69.99n ± 0% -24.49% (p=0.000 n=20)
8/16/3/1 92.82n ± 0% 70.07n ± 0% -24.51% (p=0.000 n=20)
8/16/3/2 92.72n ± 1% 69.74n ± 0% -24.78% (p=0.000 n=20)
8/16/3/3 92.59n ± 1% 69.92n ± 1% -24.48% (p=0.000 n=20)
8/16/4/4 94.89n ± 0% 70.21n ± 0% -26.00% (p=0.000 n=20)
16/8/0/0 116.73n ± 1% 89.59n ± 0% -23.25% (p=0.000 n=20)
16/8/0/1 116.70n ± 0% 89.59n ± 0% -23.23% (p=0.000 n=20)
16/8/0/2 116.73n ± 0% 89.78n ± 0% -23.09% (p=0.000 n=20)
16/8/0/3 117.07n ± 0% 89.65n ± 0% -23.42% (p=0.000 n=20)
16/8/1/0 116.82n ± 0% 89.66n ± 0% -23.25% (p=0.000 n=20)
16/8/1/1 116.53n ± 1% 89.71n ± 1% -23.01% (p=0.000 n=20)
16/8/1/2 117.03n ± 0% 89.67n ± 0% -23.37% (p=0.000 n=20)
16/8/1/3 116.83n ± 0% 89.73n ± 0% -23.20% (p=0.000 n=20)
16/8/2/0 116.58n ± 0% 89.75n ± 0% -23.01% (p=0.000 n=20)
16/8/2/1 117.12n ± 0% 89.68n ± 0% -23.43% (p=0.000 n=20)
16/8/2/2 116.78n ± 0% 89.40n ± 0% -23.44% (p=0.000 n=20)
16/8/2/3 116.80n ± 0% 89.63n ± 0% -23.26% (p=0.000 n=20)
16/8/3/0 116.67n ± 1% 89.69n ± 0% -23.12% (p=0.000 n=20)
16/8/3/1 116.47n ± 0% 89.78n ± 0% -22.91% (p=0.000 n=20)
16/8/3/2 116.64n ± 0% 89.59n ± 1% -23.19% (p=0.000 n=20)
16/8/3/3 116.72n ± 0% 89.60n ± 0% -23.24% (p=0.000 n=20)
16/8/4/4 117.08n ± 1% 89.69n ± 0% -23.39% (p=0.000 n=20)
16/16/0/0 165.1n ± 0% 123.6n ± 1% -25.15% (p=0.000 n=20)
16/16/0/1 165.2n ± 0% 123.8n ± 1% -25.07% (p=0.000 n=20)
16/16/0/2 164.7n ± 0% 123.4n ± 0% -25.09% (p=0.000 n=20)
16/16/0/3 165.5n ± 0% 123.8n ± 1% -25.24% (p=0.000 n=20)
16/16/1/0 164.7n ± 1% 126.0n ± 2% -23.54% (p=0.000 n=20)
16/16/1/1 165.2n ± 1% 125.1n ± 1% -24.32% (p=0.000 n=20)
16/16/1/2 165.0n ± 0% 125.5n ± 2% -23.95% (p=0.000 n=20)
16/16/1/3 165.0n ± 0% 124.8n ± 3% -24.34% (p=0.000 n=20)
16/16/2/0 164.9n ± 1% 123.3n ± 1% -25.21% (p=0.000 n=20)
16/16/2/1 164.4n ± 1% 123.6n ± 0% -24.83% (p=0.000 n=20)
16/16/2/2 164.7n ± 1% 123.4n ± 0% -25.10% (p=0.000 n=20)
16/16/2/3 164.8n ± 1% 123.3n ± 0% -25.18% (p=0.000 n=20)
16/16/3/0 165.7n ± 1% 123.5n ± 0% -25.48% (p=0.000 n=20)
16/16/3/1 164.9n ± 0% 123.7n ± 1% -25.00% (p=0.000 n=20)
16/16/3/2 164.9n ± 1% 123.6n ± 0% -25.04% (p=0.000 n=20)
16/16/3/3 164.9n ± 0% 123.3n ± 0% -25.24% (p=0.000 n=20)
16/16/4/4 169.3n ± 1% 123.3n ± 0% -27.16% (p=0.000 n=20)
16/32/0/0 273.1n ± 0% 204.5n ± 0% -25.11% (p=0.000 n=20)
16/32/0/1 272.4n ± 0% 204.8n ± 0% -24.80% (p=0.000 n=20)
16/32/0/2 273.0n ± 0% 204.5n ± 0% -25.10% (p=0.000 n=20)
16/32/0/3 272.7n ± 0% 204.5n ± 0% -25.02% (p=0.000 n=20)
16/32/1/0 272.6n ± 0% 205.2n ± 0% -24.75% (p=0.000 n=20)
16/32/1/1 272.9n ± 0% 205.2n ± 0% -24.79% (p=0.000 n=20)
16/32/1/2 272.6n ± 0% 205.1n ± 0% -24.77% (p=0.000 n=20)
16/32/1/3 272.9n ± 0% 205.0n ± 0% -24.87% (p=0.000 n=20)
16/32/2/0 272.6n ± 0% 204.6n ± 0% -24.93% (p=0.000 n=20)
16/32/2/1 272.5n ± 0% 204.7n ± 0% -24.88% (p=0.000 n=20)
16/32/2/2 272.8n ± 0% 204.6n ± 0% -24.97% (p=0.000 n=20)
16/32/2/3 272.8n ± 0% 204.8n ± 0% -24.94% (p=0.000 n=20)
16/32/3/0 272.7n ± 0% 204.8n ± 0% -24.89% (p=0.000 n=20)
16/32/3/1 272.7n ± 0% 205.0n ± 0% -24.81% (p=0.000 n=20)
16/32/3/2 272.8n ± 0% 204.9n ± 0% -24.89% (p=0.000 n=20)
16/32/3/3 272.7n ± 0% 204.9n ± 0% -24.86% (p=0.000 n=20)
16/32/4/4 279.4n ± 0% 204.7n ± 0% -26.74% (p=0.000 n=20)
32/16/0/0 309.0n ± 0% 232.3n ± 0% -24.80% (p=0.000 n=20)
32/16/0/1 309.1n ± 0% 232.7n ± 1% -24.72% (p=0.000 n=20)
32/16/0/2 309.2n ± 0% 232.1n ± 1% -24.94% (p=0.000 n=20)
32/16/0/3 309.8n ± 0% 231.6n ± 0% -25.25% (p=0.000 n=20)
32/16/1/0 309.1n ± 0% 233.8n ± 1% -24.34% (p=0.000 n=20)
32/16/1/1 309.0n ± 0% 234.4n ± 1% -24.13% (p=0.000 n=20)
32/16/1/2 309.9n ± 0% 234.2n ± 1% -24.41% (p=0.000 n=20)
32/16/1/3 309.0n ± 1% 233.8n ± 1% -24.34% (p=0.000 n=20)
32/16/2/0 310.2n ± 0% 231.8n ± 0% -25.26% (p=0.000 n=20)
32/16/2/1 309.8n ± 0% 231.5n ± 1% -25.26% (p=0.000 n=20)
32/16/2/2 309.7n ± 0% 232.1n ± 0% -25.05% (p=0.000 n=20)
32/16/2/3 309.5n ± 0% 232.0n ± 0% -25.02% (p=0.000 n=20)
32/16/3/0 308.7n ± 0% 231.3n ± 1% -25.08% (p=0.000 n=20)
32/16/3/1 309.7n ± 0% 232.0n ± 0% -25.10% (p=0.000 n=20)
32/16/3/2 310.0n ± 1% 231.6n ± 0% -25.29% (p=0.000 n=20)
32/16/3/3 309.2n ± 0% 231.5n ± 1% -25.13% (p=0.000 n=20)
32/16/4/4 317.4n ± 0% 232.3n ± 0% -26.82% (p=0.000 n=20)
32/32/0/0 540.6n ± 0% 390.7n ± 0% -27.73% (p=0.000 n=20)
32/32/0/1 541.4n ± 0% 390.6n ± 0% -27.85% (p=0.000 n=20)
32/32/0/2 541.3n ± 0% 390.6n ± 0% -27.84% (p=0.000 n=20)
32/32/0/3 540.3n ± 0% 390.7n ± 0% -27.69% (p=0.000 n=20)
32/32/1/0 540.2n ± 0% 391.6n ± 0% -27.51% (p=0.000 n=20)
32/32/1/1 540.7n ± 0% 391.9n ± 0% -27.52% (p=0.000 n=20)
32/32/1/2 540.8n ± 0% 392.0n ± 0% -27.53% (p=0.000 n=20)
32/32/1/3 541.3n ± 0% 391.8n ± 0% -27.61% (p=0.000 n=20)
32/32/2/0 541.9n ± 0% 390.3n ± 0% -27.98% (p=0.000 n=20)
32/32/2/1 540.8n ± 0% 390.7n ± 0% -27.77% (p=0.000 n=20)
32/32/2/2 541.0n ± 0% 390.6n ± 0% -27.81% (p=0.000 n=20)
32/32/2/3 540.9n ± 0% 390.7n ± 0% -27.77% (p=0.000 n=20)
32/32/3/0 540.6n ± 0% 390.7n ± 0% -27.74% (p=0.000 n=20)
32/32/3/1 540.9n ± 0% 390.7n ± 0% -27.76% (p=0.000 n=20)
32/32/3/2 540.7n ± 0% 390.5n ± 0% -27.78% (p=0.000 n=20)
32/32/3/3 540.6n ± 0% 390.8n ± 0% -27.71% (p=0.000 n=20)
32/32/4/4 555.0n ± 0% 390.8n ± 0% -29.58% (p=0.000 n=20)
32/64/0/0 1013.5n ± 0% 775.0n ± 0% -23.53% (p=0.000 n=20)
32/64/0/1 1011.5n ± 0% 775.3n ± 0% -23.35% (p=0.000 n=20)
32/64/0/2 1011.9n ± 0% 775.6n ± 0% -23.35% (p=0.000 n=20)
32/64/0/3 1012.6n ± 0% 775.0n ± 0% -23.46% (p=0.000 n=20)
32/64/1/0 1012.4n ± 0% 774.9n ± 0% -23.46% (p=0.000 n=20)
32/64/1/1 1012.8n ± 0% 775.7n ± 0% -23.41% (p=0.000 n=20)
32/64/1/2 1011.8n ± 0% 775.2n ± 0% -23.39% (p=0.000 n=20)
32/64/1/3 1012.3n ± 0% 776.0n ± 0% -23.34% (p=0.000 n=20)
32/64/2/0 1012.0n ± 0% 775.5n ± 0% -23.37% (p=0.000 n=20)
32/64/2/1 1013.3n ± 0% 775.4n ± 0% -23.48% (p=0.000 n=20)
32/64/2/2 1012.6n ± 0% 774.9n ± 0% -23.47% (p=0.000 n=20)
32/64/2/3 1012.3n ± 0% 775.6n ± 0% -23.39% (p=0.000 n=20)
32/64/3/0 1011.9n ± 0% 775.0n ± 0% -23.41% (p=0.000 n=20)
32/64/3/1 1012.1n ± 0% 775.3n ± 0% -23.40% (p=0.000 n=20)
32/64/3/2 1012.5n ± 0% 775.1n ± 0% -23.45% (p=0.000 n=20)
32/64/3/3 1012.8n ± 0% 775.4n ± 0% -23.43% (p=0.000 n=20)
32/64/4/4 1026.1n ± 0% 773.9n ± 0% -24.58% (p=0.000 n=20)
64/32/0/0 1049.8n ± 0% 775.5n ± 0% -26.13% (p=0.000 n=20)
64/32/0/1 1049.7n ± 0% 775.6n ± 0% -26.11% (p=0.000 n=20)
64/32/0/2 1050.3n ± 0% 775.4n ± 0% -26.18% (p=0.000 n=20)
64/32/0/3 1050.3n ± 0% 775.5n ± 0% -26.16% (p=0.000 n=20)
64/32/1/0 1051.1n ± 0% 775.5n ± 0% -26.21% (p=0.000 n=20)
64/32/1/1 1049.1n ± 0% 775.3n ± 0% -26.10% (p=0.000 n=20)
64/32/1/2 1050.3n ± 0% 775.6n ± 0% -26.15% (p=0.000 n=20)
64/32/1/3 1049.9n ± 0% 775.4n ± 0% -26.15% (p=0.000 n=20)
64/32/2/0 1050.4n ± 0% 775.6n ± 0% -26.16% (p=0.000 n=20)
64/32/2/1 1050.0n ± 0% 775.5n ± 0% -26.14% (p=0.000 n=20)
64/32/2/2 1050.2n ± 0% 775.4n ± 0% -26.17% (p=0.000 n=20)
64/32/2/3 1050.9n ± 0% 775.5n ± 0% -26.20% (p=0.000 n=20)
64/32/3/0 1050.3n ± 0% 775.2n ± 0% -26.20% (p=0.000 n=20)
64/32/3/1 1051.0n ± 0% 775.5n ± 0% -26.22% (p=0.000 n=20)
64/32/3/2 1051.4n ± 0% 775.9n ± 0% -26.21% (p=0.000 n=20)
64/32/3/3 1049.9n ± 0% 775.9n ± 0% -26.10% (p=0.000 n=20)
64/32/4/4 1078.1n ± 1% 776.3n ± 0% -28.00% (p=0.000 n=20)
64/64/0/0 1.996µ ± 0% 1.523µ ± 0% -23.72% (p=0.000 n=20)
64/64/0/1 1.997µ ± 0% 1.524µ ± 0% -23.69% (p=0.000 n=20)
64/64/0/2 1.997µ ± 0% 1.522µ ± 0% -23.75% (p=0.000 n=20)
64/64/0/3 1.996µ ± 0% 1.523µ ± 0% -23.72% (p=0.000 n=20)
64/64/1/0 1.997µ ± 0% 1.525µ ± 0% -23.65% (p=0.000 n=20)
64/64/1/1 1.998µ ± 0% 1.523µ ± 0% -23.79% (p=0.000 n=20)
64/64/1/2 1.997µ ± 0% 1.523µ ± 0% -23.71% (p=0.000 n=20)
64/64/1/3 1.997µ ± 0% 1.525µ ± 0% -23.65% (p=0.000 n=20)
64/64/2/0 1.998µ ± 0% 1.523µ ± 0% -23.76% (p=0.000 n=20)
64/64/2/1 1.997µ ± 0% 1.524µ ± 0% -23.67% (p=0.000 n=20)
64/64/2/2 1.996µ ± 0% 1.524µ ± 0% -23.66% (p=0.000 n=20)
64/64/2/3 1.996µ ± 0% 1.523µ ± 0% -23.69% (p=0.000 n=20)
64/64/3/0 1.997µ ± 0% 1.523µ ± 0% -23.72% (p=0.000 n=20)
64/64/3/1 1.997µ ± 0% 1.525µ ± 0% -23.63% (p=0.000 n=20)
64/64/3/2 1.996µ ± 0% 1.523µ ± 0% -23.71% (p=0.000 n=20)
64/64/3/3 1.997µ ± 0% 1.523µ ± 0% -23.78% (p=0.000 n=20)
64/64/4/4 2.030µ ± 0% 1.522µ ± 0% -24.99% (p=0.000 n=20)
64/128/0/0 4.284µ ± 0% 3.347µ ± 0% -21.86% (p=0.000 n=20)
64/128/0/1 4.284µ ± 0% 3.349µ ± 0% -21.82% (p=0.000 n=20)
64/128/0/2 4.289µ ± 0% 3.347µ ± 0% -21.97% (p=0.000 n=20)
64/128/0/3 4.287µ ± 0% 3.347µ ± 0% -21.92% (p=0.000 n=20)
64/128/1/0 4.289µ ± 0% 3.345µ ± 0% -22.00% (p=0.000 n=20)
64/128/1/1 4.282µ ± 0% 3.355µ ± 0% -21.65% (p=0.000 n=20)
64/128/1/2 4.285µ ± 0% 3.355µ ± 0% -21.71% (p=0.000 n=20)
64/128/1/3 4.283µ ± 0% 3.356µ ± 0% -21.65% (p=0.000 n=20)
64/128/2/0 4.282µ ± 0% 3.350µ ± 0% -21.76% (p=0.000 n=20)
64/128/2/1 4.283µ ± 0% 3.359µ ± 0% -21.58% (p=0.000 n=20)
64/128/2/2 4.283µ ± 0% 3.359µ ± 1% -21.57% (p=0.000 n=20)
64/128/2/3 4.283µ ± 0% 3.353µ ± 0% -21.70% (p=0.000 n=20)
64/128/3/0 4.283µ ± 0% 3.349µ ± 1% -21.81% (p=0.000 n=20)
64/128/3/1 4.281µ ± 0% 3.352µ ± 0% -21.71% (p=0.000 n=20)
64/128/3/2 4.283µ ± 0% 3.353µ ± 0% -21.71% (p=0.000 n=20)
64/128/3/3 4.281µ ± 0% 3.352µ ± 0% -21.71% (p=0.000 n=20)
64/128/4/4 4.314µ ± 0% 3.348µ ± 0% -22.39% (p=0.000 n=20)
128/64/0/0 4.046µ ± 1% 3.104µ ± 0% -23.29% (p=0.000 n=20)
128/64/0/1 4.049µ ± 0% 3.103µ ± 0% -23.37% (p=0.000 n=20)
128/64/0/2 4.048µ ± 0% 3.105µ ± 0% -23.31% (p=0.000 n=20)
128/64/0/3 4.050µ ± 1% 3.105µ ± 0% -23.31% (p=0.000 n=20)
128/64/1/0 4.048µ ± 1% 3.107µ ± 0% -23.25% (p=0.000 n=20)
128/64/1/1 4.045µ ± 0% 3.107µ ± 0% -23.20% (p=0.000 n=20)
128/64/1/2 4.049µ ± 0% 3.107µ ± 0% -23.27% (p=0.000 n=20)
128/64/1/3 4.049µ ± 1% 3.107µ ± 0% -23.28% (p=0.000 n=20)
128/64/2/0 4.050µ ± 0% 3.107µ ± 0% -23.27% (p=0.000 n=20)
128/64/2/1 4.049µ ± 0% 3.110µ ± 0% -23.21% (p=0.000 n=20)
128/64/2/2 4.049µ ± 0% 3.109µ ± 0% -23.21% (p=0.000 n=20)
128/64/2/3 4.048µ ± 0% 3.104µ ± 0% -23.31% (p=0.000 n=20)
128/64/3/0 4.049µ ± 1% 3.106µ ± 0% -23.28% (p=0.000 n=20)
128/64/3/1 4.048µ ± 0% 3.104µ ± 1% -23.32% (p=0.000 n=20)
128/64/3/2 4.050µ ± 0% 3.106µ ± 0% -23.30% (p=0.000 n=20)
128/64/3/3 4.049µ ± 0% 3.104µ ± 0% -23.34% (p=0.000 n=20)
128/64/4/4 4.117µ ± 0% 3.105µ ± 1% -24.59% (p=0.000 n=20)
128/128/0/0 8.866µ ± 0% 6.868µ ± 0% -22.54% (p=0.000 n=20)
128/128/0/1 8.868µ ± 0% 6.882µ ± 1% -22.40% (p=0.000 n=20)
128/128/0/2 8.863µ ± 0% 6.863µ ± 1% -22.56% (p=0.000 n=20)
128/128/0/3 8.860µ ± 0% 6.884µ ± 0% -22.31% (p=0.000 n=20)
128/128/1/0 8.868µ ± 0% 6.869µ ± 1% -22.54% (p=0.000 n=20)
128/128/1/1 8.863µ ± 1% 6.857µ ± 0% -22.63% (p=0.000 n=20)
128/128/1/2 8.879µ ± 0% 6.905µ ± 1% -22.23% (p=0.000 n=20)
128/128/1/3 8.875µ ± 0% 6.870µ ± 0% -22.59% (p=0.000 n=20)
128/128/2/0 8.867µ ± 1% 6.876µ ± 1% -22.44% (p=0.000 n=20)
128/128/2/1 8.867µ ± 0% 6.868µ ± 1% -22.55% (p=0.000 n=20)
128/128/2/2 8.859µ ± 0% 6.870µ ± 1% -22.45% (p=0.000 n=20)
128/128/2/3 8.873µ ± 0% 6.862µ ± 1% -22.66% (p=0.000 n=20)
128/128/3/0 8.856µ ± 0% 6.864µ ± 0% -22.49% (p=0.000 n=20)
128/128/3/1 8.858µ ± 0% 6.853µ ± 0% -22.64% (p=0.000 n=20)
128/128/3/2 8.871µ ± 0% 6.859µ ± 1% -22.69% (p=0.000 n=20)
128/128/3/3 8.869µ ± 0% 6.881µ ± 1% -22.41% (p=0.000 n=20)
128/128/4/4 9.319µ ± 0% 6.865µ ± 0% -26.33% (p=0.000 n=20)
4/16/0/0 82.12n ± 0% 57.29n ± 0% -30.24% (p=0.000 n=20)
4/16/0/1 82.14n ± 0% 57.08n ± 0% -30.51% (p=0.000 n=20)
4/16/0/2 82.24n ± 0% 57.30n ± 0% -30.32% (p=0.000 n=20)
4/16/0/3 82.01n ± 0% 57.25n ± 0% -30.20% (p=0.000 n=20)
4/16/1/0 82.15n ± 0% 57.45n ± 0% -30.06% (p=0.000 n=20)
4/16/1/1 82.07n ± 0% 57.24n ± 0% -30.26% (p=0.000 n=20)
4/16/1/2 81.97n ± 0% 57.26n ± 0% -30.14% (p=0.000 n=20)
4/16/1/3 82.20n ± 0% 57.29n ± 1% -30.30% (p=0.000 n=20)
4/16/2/0 82.06n ± 0% 57.35n ± 1% -30.11% (p=0.000 n=20)
4/16/2/1 82.12n ± 0% 57.28n ± 0% -30.25% (p=0.000 n=20)
4/16/2/2 82.22n ± 0% 57.24n ± 0% -30.39% (p=0.000 n=20)
4/16/2/3 82.09n ± 0% 57.20n ± 0% -30.31% (p=0.000 n=20)
4/16/3/0 82.10n ± 0% 57.17n ± 1% -30.36% (p=0.000 n=20)
4/16/3/1 82.20n ± 0% 57.19n ± 1% -30.42% (p=0.000 n=20)
4/16/3/2 82.18n ± 0% 57.20n ± 0% -30.40% (p=0.000 n=20)
4/16/3/3 82.16n ± 0% 57.02n ± 1% -30.60% (p=0.000 n=20)
4/16/4/4 85.34n ± 0% 57.45n ± 1% -32.68% (p=0.000 n=20)
16/4/0/0 94.63n ± 0% 72.85n ± 1% -23.01% (p=0.000 n=20)
16/4/0/1 94.69n ± 0% 72.59n ± 0% -23.34% (p=0.000 n=20)
16/4/0/2 94.87n ± 0% 72.66n ± 1% -23.42% (p=0.000 n=20)
16/4/0/3 94.71n ± 0% 72.67n ± 1% -23.27% (p=0.000 n=20)
16/4/1/0 94.51n ± 0% 72.91n ± 0% -22.85% (p=0.000 n=20)
16/4/1/1 94.39n ± 0% 72.58n ± 0% -23.10% (p=0.000 n=20)
16/4/1/2 94.73n ± 0% 72.61n ± 0% -23.35% (p=0.000 n=20)
16/4/1/3 94.55n ± 1% 72.62n ± 1% -23.19% (p=0.000 n=20)
16/4/2/0 94.54n ± 0% 72.91n ± 1% -22.87% (p=0.000 n=20)
16/4/2/1 94.75n ± 0% 72.82n ± 0% -23.15% (p=0.000 n=20)
16/4/2/2 94.73n ± 0% 72.59n ± 1% -23.37% (p=0.000 n=20)
16/4/2/3 94.70n ± 0% 72.85n ± 1% -23.07% (p=0.000 n=20)
16/4/3/0 94.63n ± 0% 72.56n ± 0% -23.32% (p=0.000 n=20)
16/4/3/1 94.86n ± 0% 72.79n ± 0% -23.27% (p=0.000 n=20)
16/4/3/2 94.81n ± 0% 72.66n ± 1% -23.36% (p=0.000 n=20)
16/4/3/3 94.60n ± 0% 72.61n ± 1% -23.24% (p=0.000 n=20)
16/4/4/4 93.88n ± 0% 72.65n ± 1% -22.61% (p=0.000 n=20)
8/32/0/0 147.0n ± 0% 110.2n ± 0% -25.06% (p=0.000 n=20)
8/32/0/1 146.9n ± 0% 110.4n ± 0% -24.86% (p=0.000 n=20)
8/32/0/2 147.0n ± 0% 110.1n ± 0% -25.12% (p=0.000 n=20)
8/32/0/3 146.8n ± 0% 110.1n ± 0% -25.01% (p=0.000 n=20)
8/32/1/0 147.0n ± 0% 110.9n ± 1% -24.59% (p=0.000 n=20)
8/32/1/1 147.0n ± 0% 110.6n ± 1% -24.79% (p=0.000 n=20)
8/32/1/2 147.0n ± 0% 111.2n ± 1% -24.36% (p=0.000 n=20)
8/32/1/3 146.9n ± 0% 111.0n ± 1% -24.43% (p=0.000 n=20)
8/32/2/0 147.1n ± 0% 109.9n ± 0% -25.26% (p=0.000 n=20)
8/32/2/1 147.0n ± 0% 110.4n ± 0% -24.89% (p=0.000 n=20)
8/32/2/2 147.2n ± 0% 109.7n ± 0% -25.50% (p=0.000 n=20)
8/32/2/3 146.9n ± 0% 109.8n ± 0% -25.30% (p=0.000 n=20)
8/32/3/0 146.8n ± 0% 109.8n ± 0% -25.19% (p=0.000 n=20)
8/32/3/1 147.0n ± 0% 110.3n ± 0% -24.93% (p=0.000 n=20)
8/32/3/2 147.2n ± 0% 109.8n ± 0% -25.42% (p=0.000 n=20)
8/32/3/3 147.1n ± 0% 109.7n ± 0% -25.43% (p=0.000 n=20)
8/32/4/4 150.5n ± 0% 110.0n ± 0% -26.94% (p=0.000 n=20)
32/8/0/0 215.3n ± 0% 166.5n ± 0% -22.66% (p=0.000 n=20)
32/8/0/1 215.8n ± 0% 167.1n ± 0% -22.56% (p=0.000 n=20)
32/8/0/2 215.7n ± 0% 167.1n ± 0% -22.51% (p=0.000 n=20)
32/8/0/3 215.0n ± 0% 167.1n ± 0% -22.29% (p=0.000 n=20)
32/8/1/0 215.0n ± 1% 167.4n ± 0% -22.12% (p=0.000 n=20)
32/8/1/1 215.4n ± 0% 166.8n ± 1% -22.55% (p=0.000 n=20)
32/8/1/2 215.5n ± 0% 167.0n ± 0% -22.52% (p=0.000 n=20)
32/8/1/3 215.4n ± 0% 167.2n ± 0% -22.37% (p=0.000 n=20)
32/8/2/0 215.1n ± 0% 166.7n ± 0% -22.50% (p=0.000 n=20)
32/8/2/1 215.3n ± 0% 166.9n ± 0% -22.48% (p=0.000 n=20)
32/8/2/2 215.6n ± 0% 167.4n ± 0% -22.35% (p=0.000 n=20)
32/8/2/3 215.6n ± 0% 167.2n ± 0% -22.44% (p=0.000 n=20)
32/8/3/0 215.4n ± 0% 166.9n ± 0% -22.50% (p=0.000 n=20)
32/8/3/1 215.1n ± 0% 167.0n ± 0% -22.38% (p=0.000 n=20)
32/8/3/2 215.5n ± 0% 166.8n ± 1% -22.58% (p=0.000 n=20)
32/8/3/3 215.2n ± 0% 167.3n ± 0% -22.26% (p=0.000 n=20)
32/8/4/4 214.5n ± 0% 166.6n ± 0% -22.32% (p=0.000 n=20)
16/64/0/0 524.3n ± 0% 389.8n ± 0% -25.65% (p=0.000 n=20)
16/64/0/1 523.4n ± 0% 390.6n ± 0% -25.36% (p=0.000 n=20)
16/64/0/2 523.5n ± 0% 389.8n ± 0% -25.54% (p=0.000 n=20)
16/64/0/3 523.5n ± 0% 390.1n ± 0% -25.47% (p=0.000 n=20)
16/64/1/0 523.6n ± 0% 390.1n ± 0% -25.50% (p=0.000 n=20)
16/64/1/1 523.5n ± 0% 390.3n ± 0% -25.46% (p=0.000 n=20)
16/64/1/2 523.2n ± 0% 390.5n ± 0% -25.37% (p=0.000 n=20)
16/64/1/3 523.4n ± 0% 390.6n ± 0% -25.37% (p=0.000 n=20)
16/64/2/0 523.7n ± 0% 390.2n ± 0% -25.48% (p=0.000 n=20)
16/64/2/1 523.2n ± 0% 390.2n ± 0% -25.42% (p=0.000 n=20)
16/64/2/2 522.8n ± 0% 390.2n ± 0% -25.36% (p=0.000 n=20)
16/64/2/3 523.7n ± 0% 390.0n ± 0% -25.53% (p=0.000 n=20)
16/64/3/0 523.1n ± 0% 390.3n ± 0% -25.40% (p=0.000 n=20)
16/64/3/1 523.6n ± 0% 390.5n ± 0% -25.42% (p=0.000 n=20)
16/64/3/2 524.2n ± 0% 390.2n ± 0% -25.57% (p=0.000 n=20)
16/64/3/3 523.1n ± 0% 390.1n ± 0% -25.41% (p=0.000 n=20)
16/64/4/4 529.6n ± 0% 389.2n ± 0% -26.51% (p=0.000 n=20)
64/16/0/0 611.0n ± 0% 447.8n ± 0% -26.72% (p=0.000 n=20)
64/16/0/1 612.0n ± 0% 450.2n ± 1% -26.45% (p=0.000 n=20)
64/16/0/2 614.9n ± 0% 449.4n ± 0% -26.91% (p=0.000 n=20)
64/16/0/3 610.5n ± 0% 448.8n ± 0% -26.50% (p=0.000 n=20)
64/16/1/0 611.3n ± 0% 450.4n ± 1% -26.32% (p=0.000 n=20)
64/16/1/1 611.5n ± 1% 450.1n ± 0% -26.40% (p=0.000 n=20)
64/16/1/2 612.5n ± 0% 450.9n ± 1% -26.39% (p=0.000 n=20)
64/16/1/3 611.0n ± 0% 450.6n ± 0% -26.24% (p=0.000 n=20)
64/16/2/0 613.4n ± 1% 448.9n ± 0% -26.82% (p=0.000 n=20)
64/16/2/1 612.3n ± 0% 448.8n ± 0% -26.70% (p=0.000 n=20)
64/16/2/2 612.6n ± 0% 448.7n ± 1% -26.75% (p=0.000 n=20)
64/16/2/3 612.9n ± 0% 450.8n ± 0% -26.46% (p=0.000 n=20)
64/16/3/0 611.6n ± 0% 449.2n ± 0% -26.56% (p=0.000 n=20)
64/16/3/1 611.9n ± 1% 448.9n ± 0% -26.65% (p=0.000 n=20)
64/16/3/2 611.2n ± 1% 448.5n ± 1% -26.62% (p=0.000 n=20)
64/16/3/3 612.6n ± 0% 448.3n ± 0% -26.81% (p=0.000 n=20)
64/16/4/4 632.0n ± 0% 449.0n ± 0% -28.97% (p=0.000 n=20)
Change-Id: Id9020151c4c4df5f3a2019ea2d8c1ee48329336c
diff --git a/aom_dsp/x86/convolve_avx2.h b/aom_dsp/x86/convolve_avx2.h
index 1191a28a67..9e7b69e896 100644
--- a/aom_dsp/x86/convolve_avx2.h
+++ b/aom_dsp/x86/convolve_avx2.h
@@ -502,134 +502,108 @@ static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b,
s[10] = s[11]; \
}
-#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \
- do { \
- for (i = 0; i < im_h; i += 2) { \
- __m256i data = \
- _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \
- if (i + 1 < im_h) \
- data = _mm256_inserti128_si256( \
- data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
- src_h += (src_stride << 1); \
- __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \
- \
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \
- round_shift_h); \
- \
- _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
- } \
- } while (0)
-
-#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \
+#define JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j_off) \
do { \
- __m256i s[8]; \
- __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
- __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
- __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
- __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
- __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
- __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
- \
- s[0] = _mm256_unpacklo_epi16(s0, s1); \
- s[1] = _mm256_unpacklo_epi16(s2, s3); \
- s[2] = _mm256_unpacklo_epi16(s4, s5); \
- \
- s[4] = _mm256_unpackhi_epi16(s0, s1); \
- s[5] = _mm256_unpackhi_epi16(s2, s3); \
- s[6] = _mm256_unpackhi_epi16(s4, s5); \
- \
- for (i = 0; i < h; i += 2) { \
- const int16_t *data = &im_block[i * im_stride]; \
- \
- const __m256i s6 = \
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
- const __m256i s7 = \
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
- \
- s[3] = _mm256_unpacklo_epi16(s6, s7); \
- s[7] = _mm256_unpackhi_epi16(s6, s7); \
- \
- const __m256i res_a = convolve(s, coeffs_y); \
- const __m256i res_a_round = _mm256_sra_epi32( \
- _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
- \
- if (w - j > 4) { \
- const __m256i res_b = convolve(s + 4, coeffs_y); \
- const __m256i res_b_round = _mm256_sra_epi32( \
- _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
- const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \
- const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
- \
- if (do_average) { \
- const __m256i data_ref_0 = \
- load_line2_avx2(&dst[i * dst_stride + j], \
- &dst[i * dst_stride + j + dst_stride]); \
- const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \
- &wt, use_dist_wtd_comp_avg); \
- \
- const __m256i round_result = convolve_rounding( \
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
- \
- const __m256i res_8 = \
- _mm256_packus_epi16(round_result, round_result); \
- const __m128i res_0 = _mm256_castsi256_si128(res_8); \
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
- \
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \
- _mm_storel_epi64( \
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
- } else { \
- const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
- \
- const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
- res_1); \
- } \
+ if (do_average) { \
+ const __m256i data_ref_0 = \
+ load_line2_avx2(&dst[i * dst_stride + (j_off)], \
+ &dst[i * dst_stride + (j_off) + dst_stride]); \
+ const __m256i comp_avg_res = \
+ comp_avg(&data_ref_0, &(res_unsigned), &wt, use_dist_wtd_comp_avg); \
+ const __m256i res_signed = _mm256_sub_epi16(comp_avg_res, offset_const); \
+ const __m256i round_result = \
+ _mm256_srai_epi16(_mm256_add_epi16(res_signed, rounding_const), 4); \
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ if (w - (j_off) > 4) { \
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + (j_off)]), \
+ res_0); \
+ _mm_storel_epi64( \
+ (__m128i *)(&dst0[i * dst_stride0 + (j_off) + dst_stride0]), \
+ res_1); \
} else { \
- const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \
- const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
- \
- if (do_average) { \
- const __m256i data_ref_0 = \
- load_line2_avx2(&dst[i * dst_stride + j], \
- &dst[i * dst_stride + j + dst_stride]); \
- \
- const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \
- &wt, use_dist_wtd_comp_avg); \
- \
- const __m256i round_result = convolve_rounding( \
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
- \
- const __m256i res_8 = \
- _mm256_packus_epi16(round_result, round_result); \
- const __m128i res_0 = _mm256_castsi256_si128(res_8); \
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
- \
- *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \
- *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \
- _mm_cvtsi128_si32(res_1); \
- \
- } else { \
- const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
- \
- const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
- res_1); \
- } \
+ *(int *)(&dst0[i * dst_stride0 + (j_off)]) = _mm_cvtsi128_si32(res_0); \
+ *(int *)(&dst0[i * dst_stride0 + (j_off) + dst_stride0]) = \
+ _mm_cvtsi128_si32(res_1); \
} \
- \
- s[0] = s[1]; \
- s[1] = s[2]; \
- s[2] = s[3]; \
- \
- s[4] = s[5]; \
- s[5] = s[6]; \
- s[6] = s[7]; \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + (j_off)]), res_0); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128( \
+ (__m128i *)(&dst[i * dst_stride + (j_off) + dst_stride]), res_1); \
} \
} while (0)
+#define JNT_CONVOLVE_HORIZONTAL_FILTER(src_h_start, convolve_fn, coeffs) \
+ do { \
+ const uint8_t *src_h = (src_h_start); \
+ for (i = 0; i < im_h; i += 2) { \
+ const __m256i data = load_line2_avx2(src_h, src_h + src_stride); \
+ src_h += (src_stride << 1); \
+ __m256i res = convolve_fn(data, coeffs, filt); \
+ res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ } while (0)
+
+#define JNT_CONVOLVE_VERTICAL_FILTER_8TAP \
+ do { \
+ __m256i s[8]; \
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(s0, s1); \
+ s[1] = _mm256_unpacklo_epi16(s2, s3); \
+ s[2] = _mm256_unpacklo_epi16(s4, s5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(s0, s1); \
+ s[5] = _mm256_unpackhi_epi16(s2, s3); \
+ s[6] = _mm256_unpackhi_epi16(s4, s5); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = \
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = \
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ const __m256i res_a = convolve(s, coeffs_y); \
+ const __m256i res_a_round = \
+ _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7); \
+ \
+ if (w - j > 4) { \
+ const __m256i res_b = convolve(s + 4, coeffs_y); \
+ const __m256i res_b_round = \
+ _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7); \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j); \
+ } else { \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ } \
+ } while (0)
+
static inline void prepare_coeffs_2t_ssse3(
const InterpFilterParams *const filter_params, const int32_t subpel_q4,
__m128i *const coeffs /* [4] */) {
diff --git a/av1/common/x86/jnt_convolve_avx2.c b/av1/common/x86/jnt_convolve_avx2.c
index 925fe47cf5..e3ac6d466d 100644
--- a/av1/common/x86/jnt_convolve_avx2.c
+++ b/av1/common/x86/jnt_convolve_avx2.c
@@ -612,14 +612,13 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
const __m256i round_const_h = _mm256_set1_epi16(
((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
- const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
-
const __m256i round_const_v = _mm256_set1_epi32(
((1 << conv_params->round_1) >> 1) -
(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
- __m256i filt[4], coeffs_x[4], coeffs_y[4];
+ DECLARE_ALIGNED(32, __m256i, filt[4]);
+ DECLARE_ALIGNED(32, __m256i, coeffs_x[4]);
+ DECLARE_ALIGNED(32, __m256i, coeffs_y[4]);
filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
@@ -635,32 +634,425 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
is_vert_4tap = 1;
- if (is_horiz_4tap) {
- int im_h = h + filter_params_y->taps - 1;
+ if (is_horiz_4tap && is_vert_4tap) {
+ int im_h = h + 4;
+ const int fo_vert = 1;
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ if (w > 4) {
+ if (do_average) {
+ if (use_dist_wtd_comp_avg) {
+ const __m256i comp_const = _mm256_set1_epi32(-98176);
+ for (int j = 0; j < w; j += 8) {
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
+ coeffs_x + 1);
+
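+ /* Vertical 4-tap filter, two rows per iteration; the result is blended with dst through wt and the packed 8-bit rows are written to dst0. */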
+ __m256i s[6];
+ __m256i s0 =
+ _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 =
+ _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 =
+ _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 =
+ _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+ s[3] = _mm256_unpackhi_epi16(s0, s1);
+ s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+ const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+ const __m256i res_b_1 = _mm256_madd_epi16(s[3], coeffs_y[1]);
+ const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+ const __m256i res_b_2 = _mm256_madd_epi16(s[4], coeffs_y[2]);
+ const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+ const __m256i res_b = _mm256_add_epi32(res_b_1, res_b_2);
+
+ const __m256i res_a_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+ const __m256i res_b_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);
+ const __m256i res_16b =
+ _mm256_packs_epi32(res_a_round, res_b_round);
+ const __m256i res_unsigned =
+ _mm256_add_epi16(res_16b, offset_const);
+
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_lo =
+ _mm256_unpacklo_epi16(data_ref_0, res_unsigned);
+ const __m256i data_hi =
+ _mm256_unpackhi_epi16(data_ref_0, res_unsigned);
+
+ const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, wt);
+ const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, wt);
+
+ const __m256i add_lo = _mm256_add_epi32(wt_res_lo, comp_const);
+ const __m256i add_hi = _mm256_add_epi32(wt_res_hi, comp_const);
+ const __m256i fused_lo = _mm256_srai_epi32(add_lo, 8);
+ const __m256i fused_hi = _mm256_srai_epi32(add_hi, 8);
+
+ const __m256i round_result =
+ _mm256_packs_epi32(fused_lo, fused_hi);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ for (int j = 0; j < w; j += 8) {
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
+ coeffs_x + 1);
+
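+ /* Vertical 4-tap filter, two rows per iteration; the result is averaged with dst ((a + b) >> 1), offset_const is removed, and the rounded 8-bit rows are written to dst0. */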
+ __m256i s[6];
+ __m256i s0 =
+ _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 =
+ _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 =
+ _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 =
+ _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+ s[3] = _mm256_unpackhi_epi16(s0, s1);
+ s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+ const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+ const __m256i res_b_1 = _mm256_madd_epi16(s[3], coeffs_y[1]);
+ const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+ const __m256i res_b_2 = _mm256_madd_epi16(s[4], coeffs_y[2]);
+ const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+ const __m256i res_b = _mm256_add_epi32(res_b_1, res_b_2);
+
+ const __m256i res_a_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+ const __m256i res_b_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);
+ const __m256i res_16b =
+ _mm256_packs_epi32(res_a_round, res_b_round);
+ const __m256i res_unsigned =
+ _mm256_add_epi16(res_16b, offset_const);
+
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i wt_res = _mm256_add_epi16(data_ref_0, res_unsigned);
+ const __m256i comp_avg_res = _mm256_srai_epi16(wt_res, 1);
+
+ const __m256i res_signed =
+ _mm256_sub_epi16(comp_avg_res, offset_const);
+ const __m256i round_result = _mm256_srai_epi16(
+ _mm256_add_epi16(res_signed, rounding_const), 4);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ }
+ } else {
+ for (int j = 0; j < w; j += 8) {
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
+ coeffs_x + 1);
+
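+ /* Vertical 4-tap filter, two rows per iteration; the 16-bit results (with offset_const added) are stored to dst without averaging. */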
+ __m256i s[6];
+ __m256i s0 =
+ _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 =
+ _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 =
+ _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 =
+ _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+ s[3] = _mm256_unpackhi_epi16(s0, s1);
+ s[4] = _mm256_unpackhi_epi16(s2, s3);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
+
+ const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+ const __m256i res_b_1 = _mm256_madd_epi16(s[3], coeffs_y[1]);
+ const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+ const __m256i res_b_2 = _mm256_madd_epi16(s[4], coeffs_y[2]);
+ const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+ const __m256i res_b = _mm256_add_epi32(res_b_1, res_b_2);
+
+ const __m256i res_a_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+ const __m256i res_b_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);
+ const __m256i res_16b =
+ _mm256_packs_epi32(res_a_round, res_b_round);
+ const __m256i res_unsigned =
+ _mm256_add_epi16(res_16b, offset_const);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ }
+ } else {
+ if (do_average) {
+ if (use_dist_wtd_comp_avg) {
+ const __m256i comp_const = _mm256_set1_epi32(-98176);
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr, convolve_lowbd_x_4tap,
+ coeffs_x + 1);
+
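+ /* Vertical 4-tap filter for w <= 4: only the low four columns of each lane are filtered; the result is blended with dst through wt and 32 bits per row are written to dst0. */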
+ __m256i s[3];
+ __m256i s0 =
+ _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 =
+ _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 =
+ _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 =
+ _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+ const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+ const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+
+ const __m256i res_a_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+ const __m256i res_16b =
+ _mm256_packs_epi32(res_a_round, res_a_round);
+ const __m256i res_unsigned =
+ _mm256_add_epi16(res_16b, offset_const);
+
+ const __m256i data_ref_0 = load_line2_avx2(
+ &dst[i * dst_stride], &dst[i * dst_stride + dst_stride]);
+
+ const __m256i data_lo =
+ _mm256_unpacklo_epi16(data_ref_0, res_unsigned);
+
+ const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, wt);
+
+ const __m256i fused_lo =
+ _mm256_srai_epi32(_mm256_add_epi32(wt_res_lo, comp_const), 8);
+
+ const __m256i round_result = _mm256_packs_epi32(fused_lo, fused_lo);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ *(int *)(&dst0[i * dst_stride0]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+
+ s[0] = s[1];
+ s[1] = s[2];
+ }
+ } else {
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr, convolve_lowbd_x_4tap,
+ coeffs_x + 1);
+
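+ /* Vertical 4-tap filter for w <= 4: only the low four columns of each lane are filtered; the result is averaged with dst ((a + b) >> 1), offset_const is removed, and 32 bits per row are written to dst0. */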
+ __m256i s[3];
+ __m256i s0 =
+ _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 =
+ _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 =
+ _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 =
+ _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+ const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+ const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+
+ const __m256i res_a_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+ const __m256i res_16b =
+ _mm256_packs_epi32(res_a_round, res_a_round);
+ const __m256i res_unsigned =
+ _mm256_add_epi16(res_16b, offset_const);
+
+ const __m256i data_ref_0 = load_line2_avx2(
+ &dst[i * dst_stride], &dst[i * dst_stride + dst_stride]);
+ const __m256i wt_res = _mm256_add_epi16(data_ref_0, res_unsigned);
+ const __m256i comp_avg_res = _mm256_srai_epi16(wt_res, 1);
+
+ const __m256i res_signed =
+ _mm256_sub_epi16(comp_avg_res, offset_const);
+ const __m256i round_result = _mm256_srai_epi16(
+ _mm256_add_epi16(res_signed, rounding_const), 4);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ *(int *)(&dst0[i * dst_stride0]) = _mm_cvtsi128_si32(res_0);
+ *(int *)(&dst0[i * dst_stride0 + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+
+ s[0] = s[1];
+ s[1] = s[2];
+ }
+ }
+ } else {
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr, convolve_lowbd_x_4tap,
+ coeffs_x + 1);
+
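+ /* Vertical 4-tap filter for w <= 4: only the low four columns of each lane are filtered; the 16-bit results (with offset_const added) are stored to dst as one 128-bit store per row, without averaging. */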
+ __m256i s[3];
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
+
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ const __m256i res_a_1 = _mm256_madd_epi16(s[0], coeffs_y[1]);
+ const __m256i res_a_2 = _mm256_madd_epi16(s[1], coeffs_y[2]);
+ const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2);
+
+ const __m256i res_a_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + dst_stride]),
+ res_1);
+
+ s[0] = s[1];
+ s[1] = s[2];
+ }
+ }
+ }
+ } else if (is_horiz_4tap) {
+ int im_h = h + 8;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
for (int j = 0; j < w; j += 8) {
- /* Horizontal filter */
- const uint8_t *src_h = src_ptr + j;
- for (i = 0; i < im_h; i += 2) {
- __m256i data =
- _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
- if (i + 1 < im_h)
- data = _mm256_inserti128_si256(
- data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
- src_h += (src_stride << 1);
- __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
-
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
- round_shift_h);
-
- _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
- }
- DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
+ coeffs_x + 1);
+ JNT_CONVOLVE_VERTICAL_FILTER_8TAP;
}
} else if (is_vert_4tap) {
- int im_h = h + 3;
+ int im_h = h + 4;
const int fo_vert = 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
@@ -669,9 +1061,7 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
for (int j = 0; j < w; j += 8) {
- /* Horizontal filter */
- const uint8_t *src_h = src_ptr + j;
- DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x, coeffs_x);
/* Vertical filter */
__m256i s[6];
@@ -698,74 +1088,22 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
s[5] = _mm256_unpackhi_epi16(s4, s5);
const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
- const __m256i res_a_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+ const __m256i res_a_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);
if (w - j > 4) {
const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
- const __m256i res_b_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+ const __m256i res_b_round =
+ _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);
const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
- if (do_average) {
- const __m256i data_ref_0 =
- load_line2_avx2(&dst[i * dst_stride + j],
- &dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
- &wt, use_dist_wtd_comp_avg);
-
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
- const __m256i res_8 =
- _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_storel_epi64(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-
- const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
- }
+ JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j);
} else {
const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
- if (do_average) {
- const __m256i data_ref_0 =
- load_line2_avx2(&dst[i * dst_stride + j],
- &dst[i * dst_stride + j + dst_stride]);
-
- const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
- &wt, use_dist_wtd_comp_avg);
-
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
-
- const __m256i res_8 =
- _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
-
- *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
- *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
- _mm_cvtsi128_si32(res_1);
-
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
-
- const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
- }
+ JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j);
}
s[0] = s[1];
s[1] = s[2];
@@ -774,7 +1112,7 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
}
}
} else {
- int im_h = h + filter_params_y->taps - 1;
+ int im_h = h + 8;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
@@ -783,11 +1121,9 @@ void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
for (int j = 0; j < w; j += 8) {
- /* Horizontal filter */
- const uint8_t *src_h = src_ptr + j;
- DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+ JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x, coeffs_x);
- DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ JNT_CONVOLVE_VERTICAL_FILTER_8TAP;
}
}
}