diff mbox series

[FFmpeg-devel,08/15] avfilter/vf_bwdif: Add neon for filter_edge

Message ID 20230629175729.224383-9-jc@kynesim.co.uk
State New
Headers show
Series avfilter/vf_bwdif: Add aarch64 neon functions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

John Cox June 29, 2023, 5:57 p.m. UTC
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
 libavfilter/aarch64/vf_bwdif_init_aarch64.c |  20 ++++
 libavfilter/aarch64/vf_bwdif_neon.S         | 104 ++++++++++++++++++++
 2 files changed, 124 insertions(+)

Comments

Martin Storsjö July 1, 2023, 9:40 p.m. UTC | #1
On Thu, 29 Jun 2023, John Cox wrote:

> Signed-off-by: John Cox <jc@kynesim.co.uk>
> ---
> libavfilter/aarch64/vf_bwdif_init_aarch64.c |  20 ++++
> libavfilter/aarch64/vf_bwdif_neon.S         | 104 ++++++++++++++++++++
> 2 files changed, 124 insertions(+)
>
> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> index 3ffaa07ab3..e75cf2f204 100644
> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> @@ -24,10 +24,29 @@
> #include "libavfilter/bwdif.h"
> #include "libavutil/aarch64/cpu.h"
>
> +void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
> +                               int parity, int clip_max, int spat);
> +
> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>                                 int prefs3, int mrefs3, int parity, int clip_max);
>
>
> +static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
> +                               int parity, int clip_max, int spat)
> +{
> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
> +
> +    ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2,
> +                              parity, clip_max, spat);
> +
> +    if (w0 < w)
> +        ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
> +                               w - w0, prefs, mrefs, prefs2, mrefs2,
> +                               parity, clip_max, spat);
> +}
> +
> static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
>                                 int prefs3, int mrefs3, int parity, int clip_max)
> {
> @@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>         return;
>
>     s->filter_intra = filter_intra_helper;
> +    s->filter_edge  = filter_edge_helper;
> }
>
> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
> index 6c5d1598f4..a33b235882 100644
> --- a/libavfilter/aarch64/vf_bwdif_neon.S
> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
> @@ -128,6 +128,110 @@ coeffs:
>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>         .hword          5077, 981                       // sp[0] = v0.h[6]
>
> +// ============================================================================
> +//
> +// void ff_bwdif_filter_edge_neon(
> +//      void *dst1,     // x0
> +//      void *prev1,    // x1
> +//      void *cur1,     // x2
> +//      void *next1,    // x3
> +//      int w,          // w4
> +//      int prefs,      // w5
> +//      int mrefs,      // w6
> +//      int prefs2,     // w7
> +//      int mrefs2,     // [sp, #0]
> +//      int parity,     // [sp, #8]
> +//      int clip_max,   // [sp, #16]  unused
> +//      int spat);      // [sp, #24]

This doesn't hold for macOS targets (and the checkasm tests fail on that 
platform).

On macOS, arguments that aren't passed in registers but on the stack, are 
tightly packed. So since parity is 32 bit and mrefs2 also was 32 bit, 
parity is available at [sp, #4].

Therefore, it's usually simplest for portability reasons, to pass any 
arguments after the first 8, as intptr_t or ptrdiff_t, as that makes them 
consistent across platforms.

// Martin
John Cox July 2, 2023, 10:50 a.m. UTC | #2
On Sun, 2 Jul 2023 00:40:09 +0300 (EEST), you wrote:

>On Thu, 29 Jun 2023, John Cox wrote:
>
>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>> ---
>> libavfilter/aarch64/vf_bwdif_init_aarch64.c |  20 ++++
>> libavfilter/aarch64/vf_bwdif_neon.S         | 104 ++++++++++++++++++++
>> 2 files changed, 124 insertions(+)
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> index 3ffaa07ab3..e75cf2f204 100644
>> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> @@ -24,10 +24,29 @@
>> #include "libavfilter/bwdif.h"
>> #include "libavutil/aarch64/cpu.h"
>>
>> +void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>> +                               int parity, int clip_max, int spat);
>> +
>> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>>                                 int prefs3, int mrefs3, int parity, int clip_max);
>>
>>
>> +static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>> +                               int parity, int clip_max, int spat)
>> +{
>> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
>> +
>> +    ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2,
>> +                              parity, clip_max, spat);
>> +
>> +    if (w0 < w)
>> +        ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
>> +                               w - w0, prefs, mrefs, prefs2, mrefs2,
>> +                               parity, clip_max, spat);
>> +}
>> +
>> static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
>>                                 int prefs3, int mrefs3, int parity, int clip_max)
>> {
>> @@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>>         return;
>>
>>     s->filter_intra = filter_intra_helper;
>> +    s->filter_edge  = filter_edge_helper;
>> }
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
>> index 6c5d1598f4..a33b235882 100644
>> --- a/libavfilter/aarch64/vf_bwdif_neon.S
>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
>> @@ -128,6 +128,110 @@ coeffs:
>>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>>         .hword          5077, 981                       // sp[0] = v0.h[6]
>>
>> +// ============================================================================
>> +//
>> +// void ff_bwdif_filter_edge_neon(
>> +//      void *dst1,     // x0
>> +//      void *prev1,    // x1
>> +//      void *cur1,     // x2
>> +//      void *next1,    // x3
>> +//      int w,          // w4
>> +//      int prefs,      // w5
>> +//      int mrefs,      // w6
>> +//      int prefs2,     // w7
>> +//      int mrefs2,     // [sp, #0]
>> +//      int parity,     // [sp, #8]
>> +//      int clip_max,   // [sp, #16]  unused
>> +//      int spat);      // [sp, #24]
>
>This doesn't hold for macOS targets (and the checkasm tests fail on that 
>platform).
>
>On macOS, arguments that aren't passed in registers but on the stack, are 
>tightly packed. So since parity is 32 bit and mrefs2 also was 32 bit, 
>parity is available at [sp, #4].
>
>Therefore, it's usually simplest for portability reasons, to pass any 
>arguments after the first 8, as intptr_t or ptrdiff_t, as that makes them 
>consistent across platforms.

Not my interface - this is already existing code. What do you suggest I
do?

I'm happy either to change the interface or fix my stack offsets if
there is any clue that lets me detect this ABI. As personal preference
I'd choose the latter.

I don't have easy access to a mac. Is there any easy way of getting this
tested before resubmission?

Thanks

JC

>// Martin
Martin Storsjö July 2, 2023, 8:36 p.m. UTC | #3
On Sun, 2 Jul 2023, John Cox wrote:

> On Sun, 2 Jul 2023 00:40:09 +0300 (EEST), you wrote:
>
>> On Thu, 29 Jun 2023, John Cox wrote:
>>
>>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>>> ---
>>> libavfilter/aarch64/vf_bwdif_init_aarch64.c |  20 ++++
>>> libavfilter/aarch64/vf_bwdif_neon.S         | 104 ++++++++++++++++++++
>>> 2 files changed, 124 insertions(+)
>>>
>>> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>>> index 3ffaa07ab3..e75cf2f204 100644
>>> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>>> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>>> @@ -24,10 +24,29 @@
>>> #include "libavfilter/bwdif.h"
>>> #include "libavutil/aarch64/cpu.h"
>>>
>>> +void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
>>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>>> +                               int parity, int clip_max, int spat);
>>> +
>>> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>>>                                 int prefs3, int mrefs3, int parity, int clip_max);
>>>
>>>
>>> +static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
>>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>>> +                               int parity, int clip_max, int spat)
>>> +{
>>> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
>>> +
>>> +    ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2,
>>> +                              parity, clip_max, spat);
>>> +
>>> +    if (w0 < w)
>>> +        ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
>>> +                               w - w0, prefs, mrefs, prefs2, mrefs2,
>>> +                               parity, clip_max, spat);
>>> +}
>>> +
>>> static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
>>>                                 int prefs3, int mrefs3, int parity, int clip_max)
>>> {
>>> @@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>>>         return;
>>>
>>>     s->filter_intra = filter_intra_helper;
>>> +    s->filter_edge  = filter_edge_helper;
>>> }
>>>
>>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
>>> index 6c5d1598f4..a33b235882 100644
>>> --- a/libavfilter/aarch64/vf_bwdif_neon.S
>>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
>>> @@ -128,6 +128,110 @@ coeffs:
>>>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>>>         .hword          5077, 981                       // sp[0] = v0.h[6]
>>>
>>> +// ============================================================================
>>> +//
>>> +// void ff_bwdif_filter_edge_neon(
>>> +//      void *dst1,     // x0
>>> +//      void *prev1,    // x1
>>> +//      void *cur1,     // x2
>>> +//      void *next1,    // x3
>>> +//      int w,          // w4
>>> +//      int prefs,      // w5
>>> +//      int mrefs,      // w6
>>> +//      int prefs2,     // w7
>>> +//      int mrefs2,     // [sp, #0]
>>> +//      int parity,     // [sp, #8]
>>> +//      int clip_max,   // [sp, #16]  unused
>>> +//      int spat);      // [sp, #24]
>>
>> This doesn't hold for macOS targets (and the checkasm tests fail on that
>> platform).
>>
>> On macOS, arguments that aren't passed in registers but on the stack, are
>> tightly packed. So since parity is 32 bit and mrefs2 also was 32 bit,
>> parity is available at [sp, #4].
>>
>> Therefore, it's usually simplest for portability reasons, to pass any
>> arguments after the first 8, as intptr_t or ptrdiff_t, as that makes them
>> consistent across platforms.
>
> Not my interface - this is already existing code. What do you suggest I
> do?
>
> I'm happy either to change the interface or fix my stack offsets if
> there is any clue that lets me detect this ABI. As personal preference
> I'd choose the latter.

You can litter the code with "#ifdef __APPLE__", but in general I'd prefer 
to not go down that route as it makes the code messier to maintain, and 
explicitly requires you to test the code on both system combinations 
whenever touching it, compared to just having to test it once if there's a 
shared codepath.

We often do change function interfaces to make writing the assembly 
simpler; we often change parameters like "int stride" into "ptrdiff_t 
stride", to avoid needing explicit sign extension instructions on some 
architectures. Changing interfaces for this reason is less common though 
(we often don't have many functions that take that many parameters), but 
it shouldn't require a lot of changes for other architectures with 
existing assembly.

// Martin
diff mbox series

Patch

diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index 3ffaa07ab3..e75cf2f204 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -24,10 +24,29 @@ 
 #include "libavfilter/bwdif.h"
 #include "libavutil/aarch64/cpu.h"
 
+void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
+                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                               int parity, int clip_max, int spat);
+
 void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
                                 int prefs3, int mrefs3, int parity, int clip_max);
 
 
+static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
+                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                               int parity, int clip_max, int spat)
+{
+    const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+    ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2,
+                              parity, clip_max, spat);
+
+    if (w0 < w)
+        ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
+                               w - w0, prefs, mrefs, prefs2, mrefs2,
+                               parity, clip_max, spat);
+}
+
 static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
                                 int prefs3, int mrefs3, int parity, int clip_max)
 {
@@ -52,5 +71,6 @@  ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
         return;
 
     s->filter_intra = filter_intra_helper;
+    s->filter_edge  = filter_edge_helper;
 }
 
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index 6c5d1598f4..a33b235882 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -128,6 +128,110 @@  coeffs:
         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
         .hword          5077, 981                       // sp[0] = v0.h[6]
 
+// ============================================================================
+//
+// void ff_bwdif_filter_edge_neon(
+//      void *dst1,     // x0
+//      void *prev1,    // x1
+//      void *cur1,     // x2
+//      void *next1,    // x3
+//      int w,          // w4
+//      int prefs,      // w5
+//      int mrefs,      // w6
+//      int prefs2,     // w7
+//      int mrefs2,     // [sp, #0]
+//      int parity,     // [sp, #8]
+//      int clip_max,   // [sp, #16]  unused
+//      int spat);      // [sp, #24]
+
+function ff_bwdif_filter_edge_neon, export=1
+        // Sanity check w
+        cmp             w4, #0
+        ble             99f
+
+// #define prev2 cur
+//     const uint8_t * restrict next2 = parity ? prev : next;
+
+        ldr             w8,  [sp, #0]                   // mrefs2
+
+        ldr             w17, [sp, #8]                   // parity
+        ldr             w16,  [sp, #24]                 // spat
+        cmp             w17, #0
+        csel            x17, x1, x3, ne
+
+//     for (x = 0; x < w; x++) {
+
+10:
+//        int m1 = cur[mrefs];
+//        int d = (prev2[0] + next2[0]) >> 1;
+//        int p1 = cur[prefs];
+//        int temporal_diff0 = FFABS(prev2[0] - next2[0]);
+//        int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+//        int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+//        int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
+        ldr             q31, [x2]
+        ldr             q21, [x17]
+        uhadd           v16.16b, v31.16b, v21.16b       // d0 = v16
+        uabd            v17.16b, v31.16b, v21.16b       // td0 = v17
+        ldr             q24, [x2, w6, SXTW]             // m1 = v24
+        ldr             q22, [x2, w5, SXTW]             // p1 = v22
+
+        ldr             q0,  [x1, w6, SXTW]             // prev[mrefs]
+        ldr             q2,  [x1, w5, SXTW]             // prev[prefs]
+        ldr             q1,  [x3, w6, SXTW]             // next[mrefs]
+        ldr             q3,  [x3, w5, SXTW]             // next[prefs]
+
+        ushr            v29.16b, v17.16b, #1
+
+        uabd            v31.16b, v0.16b,  v24.16b
+        uabd            v30.16b, v2.16b,  v22.16b
+        uhadd           v0.16b,  v31.16b, v30.16b       // td1 = q0
+
+        uabd            v31.16b, v1.16b,  v24.16b
+        uabd            v30.16b, v3.16b,  v22.16b
+        uhadd           v1.16b,  v31.16b, v30.16b       // td2 = q1
+
+        umax            v0.16b,  v0.16b,  v29.16b
+        umax            v0.16b,  v0.16b,  v1.16b        // diff = v0
+
+//        if (spat) {
+//            SPAT_CHECK()
+//        }
+//        i0 = (m1 + p1) >> 1;
+        cbz             w16, 1f
+
+        ldr             q31, [x2,  w8, SXTW]
+        ldr             q18, [x17, w8, SXTW]
+        ldr             q30, [x2,  w7, SXTW]
+        ldr             q19, [x17, w7, SXTW]
+        uhadd           v18.16b, v18.16b, v31.16b
+        uhadd           v19.16b, v19.16b, v30.16b
+
+        SPAT_CHECK      v0, v18, v24, v16, v22, v19, v31, v30, v29, v28
+
+1:
+        uhadd           v2.16b,  v22.16b, v24.16b
+
+        // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30
+        DIFF_CLIP       v2, v2, v16, v0, v31, v30
+
+//        dst[0] = av_clip(interpol, 0, clip_max);
+        str             q2, [x0], #16
+
+//        dst++;
+//        cur++;
+//    }
+        subs            w4,  w4,  #16
+        add             x1,  x1,  #16
+        add             x2,  x2,  #16
+        add             x3,  x3,  #16
+        add             x17, x17, #16
+        bgt             10b
+
+99:
+        ret
+endfunc
+
 // ============================================================================
 //
 // void ff_bwdif_filter_intra_neon(