diff mbox series

[FFmpeg-devel,11/15] avfilter/vf_bwdif: Add neon for filter_line

Message ID 20230629175729.224383-12-jc@kynesim.co.uk
State New
Headers show
Series avfilter/vf_bwdif: Add aarch64 neon functions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

John Cox June 29, 2023, 5:57 p.m. UTC
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
 libavfilter/aarch64/vf_bwdif_init_aarch64.c |  21 ++
 libavfilter/aarch64/vf_bwdif_neon.S         | 215 ++++++++++++++++++++
 2 files changed, 236 insertions(+)

Comments

Martin Storsjö July 1, 2023, 9:44 p.m. UTC | #1
On Thu, 29 Jun 2023, John Cox wrote:

> Signed-off-by: John Cox <jc@kynesim.co.uk>
> ---
> libavfilter/aarch64/vf_bwdif_init_aarch64.c |  21 ++
> libavfilter/aarch64/vf_bwdif_neon.S         | 215 ++++++++++++++++++++
> 2 files changed, 236 insertions(+)
>
> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> index e75cf2f204..21e67884ab 100644
> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>                                 int prefs3, int mrefs3, int parity, int clip_max);
>
> +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
> +                               int parity, int clip_max);
> +
> +
> +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
> +                               int parity, int clip_max)
> +{
> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
> +
> +    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
> +                              w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
> +
> +    if (w0 < w)
> +        ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
> +                               w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
> +}
>
> static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
>                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
> @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>         return;
>
>     s->filter_intra = filter_intra_helper;
> +    s->filter_line  = filter_line_helper;
>     s->filter_edge  = filter_edge_helper;
> }
>
> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
> index a33b235882..675e97d966 100644
> --- a/libavfilter/aarch64/vf_bwdif_neon.S
> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
> @@ -128,6 +128,221 @@ coeffs:
>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>         .hword          5077, 981                       // sp[0] = v0.h[6]
>
> +// ===========================================================================
> +//
> +// void filter_line(
> +//      void *dst1,     // x0
> +//      void *prev1,    // x1
> +//      void *cur1,     // x2
> +//      void *next1,    // x3
> +//      int w,          // w4
> +//      int prefs,      // w5
> +//      int mrefs,      // w6
> +//      int prefs2,     // w7
> +//      int mrefs2,     // [sp, #0]
> +//      int prefs3,     // [sp, #8]
> +//      int mrefs3,     // [sp, #16]
> +//      int prefs4,     // [sp, #24]
> +//      int mrefs4,     // [sp, #32]
> +//      int parity,     // [sp, #40]
> +//      int clip_max)   // [sp, #48]
> +
> +function ff_bwdif_filter_line_neon, export=1
> +        // Sanity check w
> +        cmp             w4, #0
> +        ble             99f
> +
> +        // Rearrange regs to be the same as line3 for ease of debug!
> +        mov             w10, w4                         // w10 = loop count
> +        mov             w9,  w6                         // w9  = mref
> +        mov             w12, w7                         // w12 = pref2
> +        mov             w11, w5                         // w11 = pref
> +        ldr             w8,  [sp, #0]                   // w8 =  mref2
> +        ldr             w7,  [sp, #16]                  // w7  = mref3
> +        ldr             w6,  [sp, #32]                  // w6  = mref4
> +        ldr             w13, [sp, #8]                   // w13 = pref3
> +        ldr             w14, [sp, #24]                  // w14 = pref4

Btw, remember that you can load two arguments from the stack at once with 
ldp, e.g. "ldp x8, x13, [sp, #0]". If they're made intptr_t/ptrdiff_t, you 
won't have an issue with garbage in the upper 32 bits either.



> +
> +        mov             x4,  x3
> +        mov             x3,  x2
> +        mov             x2,  x1
> +
> +// #define prev2 cur
> +//        const uint8_t * restrict next2 = parity ? prev : next;
> +        ldr             w17, [sp, #40]                  // parity
> +        cmp             w17, #0
> +        csel            x17, x2, x4, ne
> +
> +        // We want all the V registers - save all the ones we must
> +        stp             d14, d15, [sp, #-64]!
> +        stp             d8,  d9,  [sp, #48]
> +        stp             d10, d11, [sp, #32]
> +        stp             d12, d13, [sp, #16]

The order looks a bit weird here even if they end up sequential on the 
stack. If you'd fill it from the bottom up, e.g.

stp d8, d9, [sp, #-64]!
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]

they're sequential both in code and on the stack.

// Martin
John Cox July 2, 2023, 10:57 a.m. UTC | #2
On Sun, 2 Jul 2023 00:44:10 +0300 (EEST), you wrote:

>On Thu, 29 Jun 2023, John Cox wrote:
>
>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>> ---
>> libavfilter/aarch64/vf_bwdif_init_aarch64.c |  21 ++
>> libavfilter/aarch64/vf_bwdif_neon.S         | 215 ++++++++++++++++++++
>> 2 files changed, 236 insertions(+)
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> index e75cf2f204..21e67884ab 100644
>> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
>> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>>                                 int prefs3, int mrefs3, int parity, int clip_max);
>>
>> +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
>> +                               int parity, int clip_max);
>> +
>> +
>> +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
>> +                               int parity, int clip_max)
>> +{
>> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
>> +
>> +    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
>> +                              w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
>> +
>> +    if (w0 < w)
>> +        ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
>> +                               w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
>> +}
>>
>> static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
>>                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
>> @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>>         return;
>>
>>     s->filter_intra = filter_intra_helper;
>> +    s->filter_line  = filter_line_helper;
>>     s->filter_edge  = filter_edge_helper;
>> }
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
>> index a33b235882..675e97d966 100644
>> --- a/libavfilter/aarch64/vf_bwdif_neon.S
>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
>> @@ -128,6 +128,221 @@ coeffs:
>>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>>         .hword          5077, 981                       // sp[0] = v0.h[6]
>>
>> +// ===========================================================================
>> +//
>> +// void filter_line(
>> +//      void *dst1,     // x0
>> +//      void *prev1,    // x1
>> +//      void *cur1,     // x2
>> +//      void *next1,    // x3
>> +//      int w,          // w4
>> +//      int prefs,      // w5
>> +//      int mrefs,      // w6
>> +//      int prefs2,     // w7
>> +//      int mrefs2,     // [sp, #0]
>> +//      int prefs3,     // [sp, #8]
>> +//      int mrefs3,     // [sp, #16]
>> +//      int prefs4,     // [sp, #24]
>> +//      int mrefs4,     // [sp, #32]
>> +//      int parity,     // [sp, #40]
>> +//      int clip_max)   // [sp, #48]
>> +
>> +function ff_bwdif_filter_line_neon, export=1
>> +        // Sanity check w
>> +        cmp             w4, #0
>> +        ble             99f
>> +
>> +        // Rearrange regs to be the same as line3 for ease of debug!
>> +        mov             w10, w4                         // w10 = loop count
>> +        mov             w9,  w6                         // w9  = mref
>> +        mov             w12, w7                         // w12 = pref2
>> +        mov             w11, w5                         // w11 = pref
>> +        ldr             w8,  [sp, #0]                   // w8 =  mref2
>> +        ldr             w7,  [sp, #16]                  // w7  = mref3
>> +        ldr             w6,  [sp, #32]                  // w6  = mref4
>> +        ldr             w13, [sp, #8]                   // w13 = pref3
>> +        ldr             w14, [sp, #24]                  // w14 = pref4
>
>Btw, remember that you can load two arguments from the stack at once with 
>ldp, e.g. "ldp x8, x13, [sp, #0]". If they're made intptr_t/ptrdiff_t, you 
>won't have an issue with garbage in the upper 32 bits either.

Fair point - I was indeed worrying about garbage in the upper half (and
this is not performance or size critical code).

>> +
>> +        mov             x4,  x3
>> +        mov             x3,  x2
>> +        mov             x2,  x1
>> +
>> +// #define prev2 cur
>> +//        const uint8_t * restrict next2 = parity ? prev : next;
>> +        ldr             w17, [sp, #40]                  // parity
>> +        cmp             w17, #0
>> +        csel            x17, x2, x4, ne
>> +
>> +        // We want all the V registers - save all the ones we must
>> +        stp             d14, d15, [sp, #-64]!
>> +        stp             d8,  d9,  [sp, #48]
>> +        stp             d10, d11, [sp, #32]
>> +        stp             d12, d13, [sp, #16]
>
>The order looks a bit weird here even if they end up sequential on the 
>stack. If you'd fill it from the bottom up, e.g.
>
>stp d8, d9, [sp, #-64]!
>stp d10, d11, [sp, #16]
>stp d12, d13, [sp, #32]
>stp d14, d15, [sp, #48]
>
>they're sequential both in code and on the stack.

Sure I can tweak that.

JC

>// Martin
Martin Storsjö July 2, 2023, 8:40 p.m. UTC | #3
On Sun, 2 Jul 2023, John Cox wrote:

> On Sun, 2 Jul 2023 00:44:10 +0300 (EEST), you wrote:
>
>> On Thu, 29 Jun 2023, John Cox wrote:
>>
>>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>>> ---
>>> libavfilter/aarch64/vf_bwdif_init_aarch64.c |  21 ++
>>> libavfilter/aarch64/vf_bwdif_neon.S         | 215 ++++++++++++++++++++
>>> 2 files changed, 236 insertions(+)
>>>
>>> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>>> index e75cf2f204..21e67884ab 100644
>>> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>>> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>>> @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
>>> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>>>                                 int prefs3, int mrefs3, int parity, int clip_max);
>>>
>>> +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
>>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>>> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
>>> +                               int parity, int clip_max);
>>> +
>>> +
>>> +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
>>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>>> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
>>> +                               int parity, int clip_max)
>>> +{
>>> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
>>> +
>>> +    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
>>> +                              w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
>>> +
>>> +    if (w0 < w)
>>> +        ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
>>> +                               w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
>>> +}
>>>
>>> static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
>>>                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
>>> @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>>>         return;
>>>
>>>     s->filter_intra = filter_intra_helper;
>>> +    s->filter_line  = filter_line_helper;
>>>     s->filter_edge  = filter_edge_helper;
>>> }
>>>
>>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
>>> index a33b235882..675e97d966 100644
>>> --- a/libavfilter/aarch64/vf_bwdif_neon.S
>>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
>>> @@ -128,6 +128,221 @@ coeffs:
>>>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>>>         .hword          5077, 981                       // sp[0] = v0.h[6]
>>>
>>> +// ===========================================================================
>>> +//
>>> +// void filter_line(
>>> +//      void *dst1,     // x0
>>> +//      void *prev1,    // x1
>>> +//      void *cur1,     // x2
>>> +//      void *next1,    // x3
>>> +//      int w,          // w4
>>> +//      int prefs,      // w5
>>> +//      int mrefs,      // w6
>>> +//      int prefs2,     // w7
>>> +//      int mrefs2,     // [sp, #0]
>>> +//      int prefs3,     // [sp, #8]
>>> +//      int mrefs3,     // [sp, #16]
>>> +//      int prefs4,     // [sp, #24]
>>> +//      int mrefs4,     // [sp, #32]
>>> +//      int parity,     // [sp, #40]
>>> +//      int clip_max)   // [sp, #48]
>>> +
>>> +function ff_bwdif_filter_line_neon, export=1
>>> +        // Sanity check w
>>> +        cmp             w4, #0
>>> +        ble             99f
>>> +
>>> +        // Rearrange regs to be the same as line3 for ease of debug!
>>> +        mov             w10, w4                         // w10 = loop count
>>> +        mov             w9,  w6                         // w9  = mref
>>> +        mov             w12, w7                         // w12 = pref2
>>> +        mov             w11, w5                         // w11 = pref
>>> +        ldr             w8,  [sp, #0]                   // w8 =  mref2
>>> +        ldr             w7,  [sp, #16]                  // w7  = mref3
>>> +        ldr             w6,  [sp, #32]                  // w6  = mref4
>>> +        ldr             w13, [sp, #8]                   // w13 = pref3
>>> +        ldr             w14, [sp, #24]                  // w14 = pref4
>>
>> Btw, remember that you can load two arguments from the stack at once with
>> ldp, e.g. "ldp x8, x13, [sp, #0]". If they're made intptr_t/ptrdiff_t, you
>> won't have an issue with garbage in the upper 32 bits either.
>
> Fair point - I was indeed worrying about garbage in the upper half (and
> this is not performance or size critical code).

Well as long as you actually do refer to the register in the form of w8 
instead of x8, it shouldn't matter. Checkasm does try to make sure that 
you actually should get garbage in such areas, so if it passes checkasm, 
it should be fine.

// Martin
diff mbox series

Patch

diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index e75cf2f204..21e67884ab 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -31,6 +31,26 @@  void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
 void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
                                 int prefs3, int mrefs3, int parity, int clip_max);
 
+void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
+                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                               int prefs3, int mrefs3, int prefs4, int mrefs4,
+                               int parity, int clip_max);
+
+/* NEON handles the leading multiple-of-16 columns when clip_max is 255; C does the rest. */
+static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
+                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                               int prefs3, int mrefs3, int prefs4, int mrefs4,
+                               int parity, int clip_max)
+{
+    const int simd_w = clip_max == 255 ? w & ~15 : 0;
+    const int tail_w = w - simd_w;
+
+    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, simd_w, prefs, mrefs, prefs2, mrefs2,
+                              prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
+    if (tail_w > 0)
+        ff_bwdif_filter_line_c((char *)dst1 + simd_w, (char *)prev1 + simd_w, (char *)cur1 + simd_w, (char *)next1 + simd_w,
+                               tail_w, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
+}
 
 static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
@@ -71,6 +91,7 @@  ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
         return;
 
     s->filter_intra = filter_intra_helper;
+    s->filter_line  = filter_line_helper;
     s->filter_edge  = filter_edge_helper;
 }
 
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index a33b235882..675e97d966 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -128,6 +128,243 @@  coeffs:
         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
         .hword          5077, 981                       // sp[0] = v0.h[6]
 
+// Bytes between consecutive int arguments passed on the stack.
+// AAPCS64 gives each stack argument its own 8-byte slot, whereas the Apple
+// arm64 ABI packs them at natural alignment, so ints sit only 4 bytes apart.
+#ifdef __APPLE__
+.set    SP_INT, 4
+#else
+.set    SP_INT, 8
+#endif
+
+// ===========================================================================
+//
+// void filter_line(
+//      void *dst1,     // x0
+//      void *prev1,    // x1
+//      void *cur1,     // x2
+//      void *next1,    // x3
+//      int w,          // w4
+//      int prefs,      // w5
+//      int mrefs,      // w6
+//      int prefs2,     // w7
+//      int mrefs2,     // [sp, #0]
+//      int prefs3,     // [sp, #SP_INT]
+//      int mrefs3,     // [sp, #SP_INT*2]
+//      int prefs4,     // [sp, #SP_INT*3]
+//      int mrefs4,     // [sp, #SP_INT*4]
+//      int parity,     // [sp, #SP_INT*5]
+//      int clip_max)   // [sp, #SP_INT*6] (never read by this routine)
+//
+// Writes 16 output bytes per loop iteration, so w is effectively rounded up
+// to a multiple of 16; w <= 0 returns immediately without touching memory.
+// d8-d15 are saved and restored around the loop.
+//
+// Register roles once the arguments have been shuffled:
+//      x0  = dst       x2  = prev      x3  = cur (prev2)
+//      x4  = next      x17 = next2 (prev if parity != 0, else next)
+//      w6  = mrefs4    w7  = mrefs3    w8  = mrefs2    w9  = mrefs
+//      w10 = columns left              w11 = prefs     w12 = prefs2
+//      w13 = prefs3    w14 = prefs4    v0  = coeffs
+
+function ff_bwdif_filter_line_neon, export=1
+        // Nothing to do for a zero or negative width
+        cmp             w4, #0
+        ble             99f
+
+        // Rearrange regs to be the same as line3 for ease of debug!
+        mov             w10, w4                         // w10 = loop count
+        mov             w9,  w6                         // w9  = mref
+        mov             w12, w7                         // w12 = pref2
+        mov             w11, w5                         // w11 = pref
+        ldr             w8,  [sp, #0]                   // w8 =  mref2
+        ldr             w7,  [sp, #SP_INT*2]            // w7  = mref3
+        ldr             w6,  [sp, #SP_INT*4]            // w6  = mref4
+        ldr             w13, [sp, #SP_INT]              // w13 = pref3
+        ldr             w14, [sp, #SP_INT*3]            // w14 = pref4
+
+        mov             x4,  x3                         // x4  = next
+        mov             x3,  x2                         // x3  = cur
+        mov             x2,  x1                         // x2  = prev
+
+// #define prev2 cur
+//        const uint8_t * restrict next2 = parity ? prev : next;
+        ldr             w17, [sp, #SP_INT*5]            // parity (read before sp moves)
+        cmp             w17, #0
+        csel            x17, x2, x4, ne
+
+        // We want all the V registers - save all the ones we must
+        stp             d8,  d9,  [sp, #-64]!
+        stp             d10, d11, [sp, #16]
+        stp             d12, d13, [sp, #32]
+        stp             d14, d15, [sp, #48]
+
+        ldr             q0, coeffs
+
+//         for (x = 0; x < w; x++) {
+//             int diff0, diff2;
+//             int d0, d2;
+//             int temporal_diff0, temporal_diff2;
+//
+//             int i1, i2;
+//             int j1, j2;
+//             int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
+
+10:
+//             c0 = prev2[0] + next2[0];            // c0 = v20, v21
+//             d0  = c0 >> 1;                       // d0 = v10
+//             temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
+        ldr             q31, [x3]
+        ldr             q21, [x17]
+        uhadd           v10.16b, v31.16b, v21.16b
+        uabd            v11.16b, v31.16b, v21.16b
+        uaddl           v20.8h,  v21.8b,  v31.8b
+        uaddl2          v21.8h,  v21.16b, v31.16b
+
+        ldr             q31, [x3, w6, SXTW]
+        ldr             q23, [x17, w6, SXTW]
+
+//             i1 = coef_hf[0] * c0;                // i1 = v2-v5
+        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[2]
+
+        ldr             q30, [x3, w14, SXTW]
+        ldr             q25, [x17, w14, SXTW]
+
+//             m4 = prev2[mrefs4] + next2[mrefs4];  // m4 = v22,v23
+        uaddl           v22.8h,  v23.8b,  v31.8b
+        uaddl2          v23.8h,  v23.16b, v31.16b
+
+//             p4 = prev2[prefs4] + next2[prefs4];  // p4 = v24,v25, (p4 >> 1) = v12
+        uhadd           v12.16b, v25.16b, v30.16b
+        uaddl           v24.8h,  v25.8b,  v30.8b
+        uaddl2          v25.8h,  v25.16b, v30.16b
+
+//             m3 = cur[mrefs3];                    // m3 = v20
+        ldr             q20, [x3, w7, SXTW]
+
+//             p3 = cur[prefs3];                    // p3 = v21
+        ldr             q21, [x3, w13, SXTW]
+
+//             i1 += coef_hf[2] * (m4 + p4);        // (-m4:v22,v23) (-p4:v24,v25)
+        add             v22.8h,  v22.8h,  v24.8h
+        add             v23.8h,  v23.8h,  v25.8h
+        UMLAL4K         v2, v3, v4, v5, v22, v23, v0.h[4]
+
+        ldr             q29, [x3, w8, SXTW]
+        ldr             q23, [x17, w8, SXTW]
+
+//             i1 -= coef_lf[1] * 4 * (m3 + p3);   // -
+        uaddl           v30.8h,  v20.8b,  v21.8b
+        uaddl2          v31.8h,  v20.16b, v21.16b
+
+        UMLSL4K         v2, v3, v4, v5, v30, v31, v0.h[1]
+
+//             m2 = prev2[mrefs2] + next2[mrefs2];  // m2 = v22,v23, (m2 >> 1) = v13
+        uhadd           v13.16b, v23.16b, v29.16b
+        uaddl           v22.8h,  v23.8b,  v29.8b
+        uaddl2          v23.8h,  v23.16b, v29.16b
+
+        ldr             q31, [x3, w12, SXTW]
+        ldr             q27, [x17, w12, SXTW]
+
+//             NOTE(review): v6-v9 are not read again in the code visible here;
+//             this j1 step looks like a leftover from line3 - confirm.
+//             j1 += coef_hf[2] * (m2 + p6);        // (-p6:v24,v25)
+        add             v24.8h,  v24.8h,  v22.8h
+        add             v25.8h,  v25.8h,  v23.8h
+        UMLAL4K         v6, v7, v8, v9, v24, v25, v0.h[4]
+
+//             m1 = cur[mrefs];                     // m1 = v24
+        ldr             q24, [x3, w9, SXTW]
+
+//             p2 = prev2[prefs2] + next2[prefs2];  // p2 = v26, v27
+//             temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
+//             d2  = p2 >> 1;                       // d2 = v15
+        uabd            v14.16b, v31.16b, v27.16b
+        uhadd           v15.16b, v31.16b, v27.16b
+        uaddl           v26.8h,  v27.8b,  v31.8b
+        uaddl2          v27.8h,  v27.16b, v31.16b
+
+//             i1 -= coef_hf[1] * (m2 + p2);        // (-m2:v22,v23*) (-p2:v26*,v27*)
+        add             v22.8h,  v22.8h,  v26.8h
+        add             v23.8h,  v23.8h,  v27.8h
+        UMLSL4K         v2, v3, v4, v5, v22, v23, v0.h[3]
+
+//             p1 = cur[prefs];                     // p1 = v22
+        ldr             q22, [x3, w11, SXTW]
+
+//             i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
+        uaddl           v18.8h,  v22.8b,  v24.8b
+        uaddl2          v19.8h,  v22.16b, v24.16b
+        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]
+
+        uaddl           v18.8h,  v20.8b,  v21.8b
+        uaddl2          v19.8h,  v20.16b, v21.16b
+        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]
+
+        SQSHRUNN        v17, v28, v29, v30, v31, 13
+
+//             i1 += coef_lf[0] * 4 * (m1 + p1);    // p1 = v22, m1 = v24
+        uaddl           v26.8h,  v24.8b,  v22.8b
+        uaddl2          v27.8h,  v24.16b, v22.16b
+        UMLAL4K         v2, v3, v4, v5, v26, v27, v0.h[0]
+
+        ldr             q31, [x2, w9, SXTW]
+        ldr             q29, [x4, w9, SXTW]
+
+        ldr             q30, [x2, w11, SXTW]
+        ldr             q28, [x4, w11, SXTW]
+
+//             i1 >>= 15;                            // i1 = v2, -v3, -v4*, -v5*
+        SQSHRUNN        v2, v2, v3, v4, v5, 15
+
+//             {
+//                 int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+//                 int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+        uabd            v30.16b, v22.16b, v30.16b
+        uabd            v31.16b, v24.16b, v31.16b
+        uabd            v28.16b, v22.16b, v28.16b
+        uabd            v29.16b, v24.16b, v29.16b
+        uhadd           v31.16b, v31.16b, v30.16b
+        uhadd           v29.16b, v29.16b, v28.16b
+
+//                 diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
+        ushr            v18.16b, v11.16b, #1
+        umax            v18.16b, v18.16b, v31.16b
+        umax            v18.16b, v18.16b, v29.16b
+
+        // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
+        SPAT_CHECK      v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
+
+        // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td0 = v11, diff0 = v18
+        INTERPOL        v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
+
+//                 dst[0] = av_clip_uint8(interpol);
+        str             q2,  [x0], #16
+//             }
+//
+//             dst++;
+//             cur++;
+//             prev++;
+//             prev2++;
+//             next++;
+//         }
+
+        subs            w10, w10, #16
+        add             x2,  x2,  #16
+        add             x3,  x3,  #16
+        add             x4,  x4,  #16
+        add             x17, x17, #16
+        bgt             10b
+
+        ldp             d14, d15, [sp, #48]
+        ldp             d12, d13, [sp, #32]
+        ldp             d10, d11, [sp, #16]
+        ldp             d8,  d9,  [sp], #64
+99:
+        ret
+endfunc
+
 // ============================================================================
 //
 // void ff_bwdif_filter_edge_neon(