diff mbox series

[FFmpeg-devel,1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions.

Message ID 20220926091504.3459110-1-gjb@semihalf.com
State New
Headers show
Series [FFmpeg-devel,1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions. | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Grzegorz Bernacki Sept. 26, 2022, 9:15 a.m. UTC
Provide optimized implementation of pix_abs8 function for arm64.

Performance comparison tests are shown below:
pix_abs_1_1_c: 162.5
pix_abs_1_1_neon: 27.0
pix_abs_1_2_c: 174.0
pix_abs_1_2_neon: 23.5
pix_abs_1_3_c: 203.2
pix_abs_1_3_neon: 34.7

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |   9 ++
 libavcodec/aarch64/me_cmp_neon.S         | 193 +++++++++++++++++++++++
 2 files changed, 202 insertions(+)

Comments

Martin Storsjö Sept. 28, 2022, 9:06 a.m. UTC | #1
On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:

> Provide optimized implementation of pix_abs8 function for arm64.
>
> Performance comparison tests are shown below:
> pix_abs_1_1_c: 162.5
> pix_abs_1_1_neon: 27.0
> pix_abs_1_2_c: 174.0
> pix_abs_1_2_neon: 23.5
> pix_abs_1_3_c: 203.2
> pix_abs_1_3_neon: 34.7
>
> Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> ---
> libavcodec/aarch64/me_cmp_init_aarch64.c |   9 ++
> libavcodec/aarch64/me_cmp_neon.S         | 193 +++++++++++++++++++++++
> 2 files changed, 202 insertions(+)

I don't see any changes compared to the version you sent last week? If 
reposting a patchset, please do mention what has changed - or ping the old 
one. (I had it on my radar to review, but reviews of larger amounts of 
code don't happen immediately, unfortunately.)

Overall, you seem to have mixed in tabs among regular spaces. Please do 
fix that. (I would have kinda expected us to have a fate test that checks 
this, but apparently we don't.)

> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index e143f0816e..3459403ee5 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t
>                           ptrdiff_t stride, int h);
> int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
>                          ptrdiff_t stride, int h);
> +int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +                        ptrdiff_t stride, int h);
> +int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +                        ptrdiff_t stride, int h);
> +int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +                         ptrdiff_t stride, int h);
> 
> av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
> {
> @@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
>         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
>         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
>         c->pix_abs[1][0] = ff_pix_abs8_neon;
> +	    c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
> +	    c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
> +	    c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;

In most mediums, the diff here shows that there's something off - tabs.

>
>         c->sad[0] = ff_pix_abs16_neon;
>         c->sad[1] = ff_pix_abs8_neon;
> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 11af4849f9..e03c0c26cd 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
>         ret
> endfunc
> 
> +function ff_pix_abs8_x2_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        cmp             w4, #4
> +        movi            v26.8h, #0
> +        add             x5, x2, #1 // pix2 + 1
> +        b.lt            2f
> +
> +// make 4 iterations at once
> +1:
> +        ld1             {v1.8b}, [x2], x3
> +        ld1             {v2.8b}, [x5], x3
> +        ld1             {v0.8b}, [x1], x3
> +        ld1             {v4.8b}, [x2], x3
> +	urhadd          v30.8b, v1.8b, v2.8b
> +        ld1             {v5.8b}, [x5], x3
> +        uabal           v26.8h, v0.8b, v30.8b
> +        ld1             {v6.8b}, [x1], x3
> +	urhadd          v29.8b, v4.8b, v5.8b
> +        ld1             {v7.8b}, [x2], x3
> +        ld1             {v20.8b}, [x5], x3
> +        uabal           v26.8h, v6.8b, v29.8b
> +        ld1             {v21.8b}, [x1], x3
> +	urhadd          v28.8b, v7.8b, v20.8b
> +        ld1             {v22.8b}, [x2], x3
> +        ld1             {v23.8b}, [x5], x3
> +        uabal           v26.8h, v21.8b, v28.8b
> +        sub             w4, w4, #4
> +        ld1             {v24.8b}, [x1], x3
> +	urhadd          v27.8b, v22.8b, v23.8b
> +        cmp             w4, #4
> +        uabal           v26.8h, v24.8b, v27.8b
> +
> +        b.ge            1b
> +        cbz             w4, 3f
> +
> +// iterate by one
> +2:
> +        ld1             {v1.8b}, [x2], x3
> +        ld1             {v2.8b}, [x5], x3
> +        ld1             {v0.8b}, [x1], x3
> +	urhadd          v30.8b, v1.8b, v2.8b
> +        subs            w4, w4, #1
> +        uabal           v26.8h, v0.8b, v30.8b
> +
> +        b.ne            2b
> +3:
> +        uaddlv          s20, v26.8h
> +        fmov            w0, s20
> +
> +        ret
> +
> +endfunc
> +
> +function ff_pix_abs8_y2_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        cmp             w4, #4
> +        movi            v26.8h, #0
> +        ld1             {v1.8b}, [x2], x3
> +        b.lt            2f
> +
> +// make 4 iterations at once
> +1:
> +        ld1             {v2.8b}, [x2], x3
> +        ld1             {v0.8b}, [x1], x3
> +        ld1             {v6.8b}, [x1], x3
> +	urhadd          v30.8b, v1.8b, v2.8b

Good thing that you wrote this better than the current 
ff_pix_abs16_y2_neon, reusing the data from the previous round. But the 
scheduling could be much improved here (why load v6 directly after v0 - x1 
will delay it due to being updated in the previous instruction, and v6 
won't be needed for a long time here yet).

By fixing the scheduling of this function (and getting rid of the 
unnecessary mov instruction at the end of the unrolled loop), I got it 
down from 74 cycles to 62 cycles on a Cortex A53. I'm attaching the fixup 
I did, so you can apply it on top instead of having to describe it 
verbally.

> +        ld1             {v5.8b}, [x2], x3
> +        ld1             {v21.8b}, [x1], x3
> +        uabal           v26.8h, v0.8b, v30.8b
> +	urhadd          v29.8b, v2.8b, v5.8b
> +        ld1             {v20.8b}, [x2], x3
> +        ld1             {v24.8b}, [x1], x3
> +        uabal           v26.8h, v6.8b, v29.8b
> +	urhadd          v28.8b, v5.8b, v20.8b
> +        uabal           v26.8h, v21.8b, v28.8b
> +        ld1             {v23.8b}, [x2], x3
> +        mov             v1.8b, v23.8b
> +        sub             w4, w4, #4
> +	urhadd          v27.8b, v20.8b, v23.8b
> +        cmp             w4, #4
> +        uabal           v26.8h, v24.8b, v27.8b
> +
> +        b.ge            1b
> +        cbz             w4, 3f
> +
> +// iterate by one
> +2:
> +        ld1             {v0.8b}, [x1], x3
> +        ld1             {v2.8b}, [x2], x3
> +	urhadd          v30.8b, v1.8b, v2.8b
> +        subs            w4, w4, #1
> +        uabal           v26.8h, v0.8b, v30.8b
> +        mov             v1.8b, v2.8b
> +
> +        b.ne            2b
> +3:
> +        uaddlv          s20, v26.8h
> +        fmov            w0, s20
> +
> +        ret
> +
> +endfunc
> +
> +function ff_pix_abs8_xy2_neon, export=1
> +        // x0           unused
> +        // x1           uint8_t *pix1
> +        // x2           uint8_t *pix2
> +        // x3           ptrdiff_t stride
> +        // w4           int h
> +
> +        movi            v31.8h, #0
> +        add             x0, x2, 1  // pix2 + 1
> +
> +        add             x5, x2, x3 // pix2 + stride = pix3
> +        cmp             w4, #4
> +        add             x6, x5, 1 // pix3 + stride + 1
> +
> +        b.lt            2f
> +
> +        ld1             {v0.8b}, [x2], x3
> +        ld1             {v1.8b}, [x0], x3
> +        uaddl           v2.8h, v0.8b, v1.8b

If we'd start out with h<4, we'd jump to the 2f label above, missing the 
setup of v0-v2 here.

Other than that, the patch looks reasonable to me.

// Martin
Grzegorz Bernacki Sept. 28, 2022, 12:22 p.m. UTC | #2
Hi Martin,

I resent the patchset, because the first try did not reach the ffmpeg-devel
mailing list. I apologize; I should have mentioned that in the cover letter.
Thanks a lot for your review, I will apply the changes and send v2 soon.

thanks,
grzegorz

śr., 28 wrz 2022 o 11:07 Martin Storsjö <martin@martin.st> napisał(a):

> On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:
>
> > Provide optimized implementation of pix_abs8 function for arm64.
> >
> > Performance comparison tests are shown below:
> > pix_abs_1_1_c: 162.5
> > pix_abs_1_1_neon: 27.0
> > pix_abs_1_2_c: 174.0
> > pix_abs_1_2_neon: 23.5
> > pix_abs_1_3_c: 203.2
> > pix_abs_1_3_neon: 34.7
> >
> > Benchmarks and tests are run with checkasm tool on AWS Graviton 3.
> >
> > Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
> > ---
> > libavcodec/aarch64/me_cmp_init_aarch64.c |   9 ++
> > libavcodec/aarch64/me_cmp_neon.S         | 193 +++++++++++++++++++++++
> > 2 files changed, 202 insertions(+)
>
> I don't see any changes compared to the version you sent last week? If
> reposting a patchset, please do mention what has changed - or ping the old
> one. (I had it on my radar to review, but reviews of larger amounts of
> code don't happen immediately, unfortunately.)
>
> Overall, you seem to have mixed in tabs among regular spaces. Please do
> fix that. (I would have kinda expected us to have a fate test that checks
> this, but apparently we don't.)
>
> > diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c
> b/libavcodec/aarch64/me_cmp_init_aarch64.c
> > index e143f0816e..3459403ee5 100644
> > --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> > +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> > @@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const
> uint8_t *pix1, const uint8_t
> >                           ptrdiff_t stride, int h);
> > int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> >                          ptrdiff_t stride, int h);
> > +int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > +                        ptrdiff_t stride, int h);
> > +int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > +                        ptrdiff_t stride, int h);
> > +int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const
> uint8_t *pix2,
> > +                         ptrdiff_t stride, int h);
> >
> > av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext
> *avctx)
> > {
> > @@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c,
> AVCodecContext *avctx)
> >         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
> >         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
> >         c->pix_abs[1][0] = ff_pix_abs8_neon;
> > +         c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
> > +         c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
> > +         c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;
>
> In most mediums, the diff here shows that there's something off - tabs.
>
> >
> >         c->sad[0] = ff_pix_abs16_neon;
> >         c->sad[1] = ff_pix_abs8_neon;
> > diff --git a/libavcodec/aarch64/me_cmp_neon.S
> b/libavcodec/aarch64/me_cmp_neon.S
> > index 11af4849f9..e03c0c26cd 100644
> > --- a/libavcodec/aarch64/me_cmp_neon.S
> > +++ b/libavcodec/aarch64/me_cmp_neon.S
> > @@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
> >         ret
> > endfunc
> >
> > +function ff_pix_abs8_x2_neon, export=1
> > +        // x0           unused
> > +        // x1           uint8_t *pix1
> > +        // x2           uint8_t *pix2
> > +        // x3           ptrdiff_t stride
> > +        // w4           int h
> > +
> > +        cmp             w4, #4
> > +        movi            v26.8h, #0
> > +        add             x5, x2, #1 // pix2 + 1
> > +        b.lt            2f
> > +
> > +// make 4 iterations at once
> > +1:
> > +        ld1             {v1.8b}, [x2], x3
> > +        ld1             {v2.8b}, [x5], x3
> > +        ld1             {v0.8b}, [x1], x3
> > +        ld1             {v4.8b}, [x2], x3
> > +     urhadd          v30.8b, v1.8b, v2.8b
> > +        ld1             {v5.8b}, [x5], x3
> > +        uabal           v26.8h, v0.8b, v30.8b
> > +        ld1             {v6.8b}, [x1], x3
> > +     urhadd          v29.8b, v4.8b, v5.8b
> > +        ld1             {v7.8b}, [x2], x3
> > +        ld1             {v20.8b}, [x5], x3
> > +        uabal           v26.8h, v6.8b, v29.8b
> > +        ld1             {v21.8b}, [x1], x3
> > +     urhadd          v28.8b, v7.8b, v20.8b
> > +        ld1             {v22.8b}, [x2], x3
> > +        ld1             {v23.8b}, [x5], x3
> > +        uabal           v26.8h, v21.8b, v28.8b
> > +        sub             w4, w4, #4
> > +        ld1             {v24.8b}, [x1], x3
> > +     urhadd          v27.8b, v22.8b, v23.8b
> > +        cmp             w4, #4
> > +        uabal           v26.8h, v24.8b, v27.8b
> > +
> > +        b.ge            1b
> > +        cbz             w4, 3f
> > +
> > +// iterate by one
> > +2:
> > +        ld1             {v1.8b}, [x2], x3
> > +        ld1             {v2.8b}, [x5], x3
> > +        ld1             {v0.8b}, [x1], x3
> > +     urhadd          v30.8b, v1.8b, v2.8b
> > +        subs            w4, w4, #1
> > +        uabal           v26.8h, v0.8b, v30.8b
> > +
> > +        b.ne            2b
> > +3:
> > +        uaddlv          s20, v26.8h
> > +        fmov            w0, s20
> > +
> > +        ret
> > +
> > +endfunc
> > +
> > +function ff_pix_abs8_y2_neon, export=1
> > +        // x0           unused
> > +        // x1           uint8_t *pix1
> > +        // x2           uint8_t *pix2
> > +        // x3           ptrdiff_t stride
> > +        // w4           int h
> > +
> > +        cmp             w4, #4
> > +        movi            v26.8h, #0
> > +        ld1             {v1.8b}, [x2], x3
> > +        b.lt            2f
> > +
> > +// make 4 iterations at once
> > +1:
> > +        ld1             {v2.8b}, [x2], x3
> > +        ld1             {v0.8b}, [x1], x3
> > +        ld1             {v6.8b}, [x1], x3
> > +     urhadd          v30.8b, v1.8b, v2.8b
>
> Good thing that you wrote this better than the current
> ff_pix_abs16_y2_neon, reusing the data from the previous round. But the
> scheduling could be much improved here (why load v6 directly after v0 - x1
> will delay it due to being updated in the previous instruction, and v6
> won't be needed for a long time here yet).
>
> By fixing the scheduling of this function (and getting rid of the
> unnecessary mov instruction at the end of the unrolled loop), I got it
> down from 74 cycles to 62 cycles on a Cortex A53. I'm attaching the fixup
> I did, so you can apply it on top instead of having to describe it
> verbally.
>
> > +        ld1             {v5.8b}, [x2], x3
> > +        ld1             {v21.8b}, [x1], x3
> > +        uabal           v26.8h, v0.8b, v30.8b
> > +     urhadd          v29.8b, v2.8b, v5.8b
> > +        ld1             {v20.8b}, [x2], x3
> > +        ld1             {v24.8b}, [x1], x3
> > +        uabal           v26.8h, v6.8b, v29.8b
> > +     urhadd          v28.8b, v5.8b, v20.8b
> > +        uabal           v26.8h, v21.8b, v28.8b
> > +        ld1             {v23.8b}, [x2], x3
> > +        mov             v1.8b, v23.8b
> > +        sub             w4, w4, #4
> > +     urhadd          v27.8b, v20.8b, v23.8b
> > +        cmp             w4, #4
> > +        uabal           v26.8h, v24.8b, v27.8b
> > +
> > +        b.ge            1b
> > +        cbz             w4, 3f
> > +
> > +// iterate by one
> > +2:
> > +        ld1             {v0.8b}, [x1], x3
> > +        ld1             {v2.8b}, [x2], x3
> > +     urhadd          v30.8b, v1.8b, v2.8b
> > +        subs            w4, w4, #1
> > +        uabal           v26.8h, v0.8b, v30.8b
> > +        mov             v1.8b, v2.8b
> > +
> > +        b.ne            2b
> > +3:
> > +        uaddlv          s20, v26.8h
> > +        fmov            w0, s20
> > +
> > +        ret
> > +
> > +endfunc
> > +
> > +function ff_pix_abs8_xy2_neon, export=1
> > +        // x0           unused
> > +        // x1           uint8_t *pix1
> > +        // x2           uint8_t *pix2
> > +        // x3           ptrdiff_t stride
> > +        // w4           int h
> > +
> > +        movi            v31.8h, #0
> > +        add             x0, x2, 1  // pix2 + 1
> > +
> > +        add             x5, x2, x3 // pix2 + stride = pix3
> > +        cmp             w4, #4
> > +        add             x6, x5, 1 // pix3 + stride + 1
> > +
> > +        b.lt            2f
> > +
> > +        ld1             {v0.8b}, [x2], x3
> > +        ld1             {v1.8b}, [x0], x3
> > +        uaddl           v2.8h, v0.8b, v1.8b
>
> If we'd start out with h<4, we'd jump to the 2f label above, missing the
> setup of v0-v2 here.
>
> Other than that, the patch looks reasonable to me.
>
> // Martin
>
diff mbox series

Patch

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index e143f0816e..3459403ee5 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -59,6 +59,12 @@  int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t
                           ptrdiff_t stride, int h);
 int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                          ptrdiff_t stride, int h);
+int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+                         ptrdiff_t stride, int h);
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
@@ -70,6 +76,9 @@  av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
         c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
         c->pix_abs[1][0] = ff_pix_abs8_neon;
+	    c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
+	    c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
+	    c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sad[1] = ff_pix_abs8_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 11af4849f9..e03c0c26cd 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -119,6 +119,199 @@  function ff_pix_abs8_neon, export=1
         ret
 endfunc
 
+function ff_pix_abs8_x2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        cmp             w4, #4
+        movi            v26.8h, #0
+        add             x5, x2, #1 // pix2 + 1
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        ld1             {v1.8b}, [x2], x3
+        ld1             {v2.8b}, [x5], x3
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v4.8b}, [x2], x3
+	urhadd          v30.8b, v1.8b, v2.8b
+        ld1             {v5.8b}, [x5], x3
+        uabal           v26.8h, v0.8b, v30.8b
+        ld1             {v6.8b}, [x1], x3
+	urhadd          v29.8b, v4.8b, v5.8b
+        ld1             {v7.8b}, [x2], x3
+        ld1             {v20.8b}, [x5], x3
+        uabal           v26.8h, v6.8b, v29.8b
+        ld1             {v21.8b}, [x1], x3
+	urhadd          v28.8b, v7.8b, v20.8b
+        ld1             {v22.8b}, [x2], x3
+        ld1             {v23.8b}, [x5], x3
+        uabal           v26.8h, v21.8b, v28.8b
+        sub             w4, w4, #4
+        ld1             {v24.8b}, [x1], x3
+	urhadd          v27.8b, v22.8b, v23.8b
+        cmp             w4, #4
+        uabal           v26.8h, v24.8b, v27.8b
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v1.8b}, [x2], x3
+        ld1             {v2.8b}, [x5], x3
+        ld1             {v0.8b}, [x1], x3
+	urhadd          v30.8b, v1.8b, v2.8b
+        subs            w4, w4, #1
+        uabal           v26.8h, v0.8b, v30.8b
+
+        b.ne            2b
+3:
+        uaddlv          s20, v26.8h
+        fmov            w0, s20
+
+        ret
+
+endfunc
+
+function ff_pix_abs8_y2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        cmp             w4, #4
+        movi            v26.8h, #0
+        ld1             {v1.8b}, [x2], x3
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        ld1             {v2.8b}, [x2], x3
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v6.8b}, [x1], x3
+	urhadd          v30.8b, v1.8b, v2.8b
+        ld1             {v5.8b}, [x2], x3
+        ld1             {v21.8b}, [x1], x3
+        uabal           v26.8h, v0.8b, v30.8b
+	urhadd          v29.8b, v2.8b, v5.8b
+        ld1             {v20.8b}, [x2], x3
+        ld1             {v24.8b}, [x1], x3
+        uabal           v26.8h, v6.8b, v29.8b
+	urhadd          v28.8b, v5.8b, v20.8b
+        uabal           v26.8h, v21.8b, v28.8b
+        ld1             {v23.8b}, [x2], x3
+        mov             v1.8b, v23.8b
+        sub             w4, w4, #4
+	urhadd          v27.8b, v20.8b, v23.8b
+        cmp             w4, #4
+        uabal           v26.8h, v24.8b, v27.8b
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v2.8b}, [x2], x3
+	urhadd          v30.8b, v1.8b, v2.8b
+        subs            w4, w4, #1
+        uabal           v26.8h, v0.8b, v30.8b
+        mov             v1.8b, v2.8b
+
+        b.ne            2b
+3:
+        uaddlv          s20, v26.8h
+        fmov            w0, s20
+
+        ret
+
+endfunc
+
+function ff_pix_abs8_xy2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        movi            v31.8h, #0
+        add             x0, x2, 1  // pix2 + 1
+
+        add             x5, x2, x3 // pix2 + stride = pix3
+        cmp             w4, #4
+        add             x6, x5, 1 // pix3 + stride + 1
+
+        b.lt            2f
+
+        ld1             {v0.8b}, [x2], x3
+        ld1             {v1.8b}, [x0], x3
+        uaddl           v2.8h, v0.8b, v1.8b
+
+// make 4 iterations at once
+1:
+        ld1             {v4.8b}, [x5], x3
+        ld1             {v5.8b}, [x6], x3
+        ld1             {v7.8b}, [x5], x3
+        uaddl           v0.8h, v4.8b, v5.8b
+        ld1             {v16.8b}, [x6], x3
+        add             v4.8h, v0.8h, v2.8h
+        ld1             {v5.8b}, [x1], x3
+        rshrn           v4.8b, v4.8h, #2
+        uaddl           v7.8h, v7.8b, v16.8b
+        uabal           v31.8h, v5.8b, v4.8b
+        add             v2.8h, v0.8h, v7.8h
+        ld1             {v17.8b}, [x1], x3
+        rshrn           v2.8b, v2.8h, #2
+        ld1             {v20.8b}, [x5], x3
+        uabal           v31.8h, v17.8b, v2.8b
+        ld1             {v21.8b}, [x6], x3
+        ld1             {v25.8b}, [x5], x3
+        uaddl           v20.8h, v20.8b, v21.8b
+        ld1             {v26.8b}, [x6], x3
+        add             v7.8h, v7.8h, v20.8h
+        uaddl           v25.8h, v25.8b, v26.8b
+        rshrn           v7.8b, v7.8h, #2
+        ld1             {v22.8b}, [x1], x3
+        mov             v2.16b, v25.16b
+        uabal           v31.8h, v22.8b, v7.8b
+        add             v20.8h, v20.8h, v25.8h
+        ld1             {v27.8b}, [x1], x3
+        sub             w4, w4, #4
+        rshrn           v20.8b, v20.8h, #2
+        cmp             w4, #4
+        uabal           v31.8h, v27.8b, v20.8b
+
+        b.ge            1b
+
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x5], x3
+        ld1             {v1.8b}, [x6], x3
+        ld1             {v4.8b}, [x1], x3
+        uaddl           v21.8h, v0.8b, v1.8b
+        subs            w4, w4, #1
+        add             v3.8h, v2.8h, v21.8h
+        mov             v2.16b, v21.16b
+        rshrn           v3.8b, v3.8h, #2
+        uabal           v31.8h, v4.8b, v3.8b
+        b.ne            2b
+
+3:
+        uaddlv          s18, v31.8h
+        fmov            w0, s18
+
+        ret
+
+endfunc
+
+
 function ff_pix_abs16_xy2_neon, export=1
         // x0           unused
         // x1           uint8_t *pix1