diff mbox series

[FFmpeg-devel,04/15] avfilter/vf_bwdif: Add neon for filter_intra

Message ID 20230629175729.224383-5-jc@kynesim.co.uk
State New
Headers show
Series avfilter/vf_bwdif: Add aarch64 neon functions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

John Cox June 29, 2023, 5:57 p.m. UTC
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
 libavfilter/aarch64/vf_bwdif_init_aarch64.c | 17 +++++++
 libavfilter/aarch64/vf_bwdif_neon.S         | 53 +++++++++++++++++++++
 2 files changed, 70 insertions(+)

Comments

Martin Storsjö July 1, 2023, 9:37 p.m. UTC | #1
On Thu, 29 Jun 2023, John Cox wrote:

> Signed-off-by: John Cox <jc@kynesim.co.uk>
> ---
> libavfilter/aarch64/vf_bwdif_init_aarch64.c | 17 +++++++
> libavfilter/aarch64/vf_bwdif_neon.S         | 53 +++++++++++++++++++++
> 2 files changed, 70 insertions(+)
>
> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> index 86d53b2ca1..3ffaa07ab3 100644
> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
> @@ -24,6 +24,22 @@
> #include "libavfilter/bwdif.h"
> #include "libavutil/aarch64/cpu.h"
>
> +void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
> +                                int prefs3, int mrefs3, int parity, int clip_max);
> +
> +
> +static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
> +                                int prefs3, int mrefs3, int parity, int clip_max)
> +{
> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
> +
> +    ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
> +
> +    if (w0 < w)
> +        ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0,
> +                                w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
> +}
> +
> void
> ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
> {
> @@ -35,5 +51,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>     if (!have_neon(cpu_flags))
>         return;
>
> +    s->filter_intra = filter_intra_helper;
> }
>
> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
> index a8f0ed525a..b863b3447d 100644
> --- a/libavfilter/aarch64/vf_bwdif_neon.S
> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
> @@ -69,3 +69,56 @@ coeffs:
>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>         .hword          5077, 981                       // sp[0] = v0.h[6]
>
> +// ============================================================================
> +//
> +// void ff_bwdif_filter_intra_neon(
> +//      void *dst1,     // x0
> +//      void *cur1,     // x1
> +//      int w,          // w2
> +//      int prefs,      // w3
> +//      int mrefs,      // w4
> +//      int prefs3,     // w5
> +//      int mrefs3,     // w6
> +//      int parity,     // w7       unused
> +//      int clip_max)   // [sp, #0] unused

This bit is great to have

> +
> +function ff_bwdif_filter_intra_neon, export=1
> +        cmp             w2, #0
> +        ble             99f
> +
> +        ldr             q0, coeffs
> +
> +//    for (x = 0; x < w; x++) {
> +10:
> +
> +//        interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;

I guess the style with intermixed C code is a bit uncommon in our 
assembly, but as long as it doesn't affect the overall code style I guess 
we can keep it.

> +        ldr             q31, [x1, w4, SXTW]
> +        ldr             q30, [x1, w3, SXTW]
> +        ldr             q29, [x1, w6, SXTW]
> +        ldr             q28, [x1, w5, SXTW]

Don't use shouty uppercase SXTW here

> +
> +        uaddl           v20.8h,  v31.8b,  v30.8b
> +        uaddl2          v21.8h,  v31.16b, v30.16b
> +
> +        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]
> +
> +        uaddl           v20.8h,  v29.8b,  v28.8b
> +        uaddl2          v21.8h,  v29.16b, v28.16b
> +
> +        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]
> +
> +//        dst[0] = av_clip(interpol, 0, clip_max);
> +        SQSHRUNN        v2, v2, v3, v4, v5, 13
> +        str             q2, [x0], #16
> +
> +//        dst++;
> +//        cur++;
> +//    }
> +
> +        subs            w2,  w2,  #16
> +        add             x1,  x1,  #16

For in-order cores, it might be good to update these variables sometime 
sooner, e.g. before the str instruction. But such scheduling breaks your 
mapping between neat C code and assembly.

// Martin
John Cox July 2, 2023, 10:43 a.m. UTC | #2
On Sun, 2 Jul 2023 00:37:35 +0300 (EEST), you wrote:

>On Thu, 29 Jun 2023, John Cox wrote:
>
>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>> ---
>> libavfilter/aarch64/vf_bwdif_init_aarch64.c | 17 +++++++
>> libavfilter/aarch64/vf_bwdif_neon.S         | 53 +++++++++++++++++++++
>> 2 files changed, 70 insertions(+)
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> index 86d53b2ca1..3ffaa07ab3 100644
>> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> @@ -24,6 +24,22 @@
>> #include "libavfilter/bwdif.h"
>> #include "libavutil/aarch64/cpu.h"
>>
>> +void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>> +                                int prefs3, int mrefs3, int parity, int clip_max);
>> +
>> +
>> +static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
>> +                                int prefs3, int mrefs3, int parity, int clip_max)
>> +{
>> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
>> +
>> +    ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
>> +
>> +    if (w0 < w)
>> +        ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0,
>> +                                w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
>> +}
>> +
>> void
>> ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>> {
>> @@ -35,5 +51,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>>     if (!have_neon(cpu_flags))
>>         return;
>>
>> +    s->filter_intra = filter_intra_helper;
>> }
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
>> index a8f0ed525a..b863b3447d 100644
>> --- a/libavfilter/aarch64/vf_bwdif_neon.S
>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
>> @@ -69,3 +69,56 @@ coeffs:
>>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>>         .hword          5077, 981                       // sp[0] = v0.h[6]
>>
>> +// ============================================================================
>> +//
>> +// void ff_bwdif_filter_intra_neon(
>> +//      void *dst1,     // x0
>> +//      void *cur1,     // x1
>> +//      int w,          // w2
>> +//      int prefs,      // w3
>> +//      int mrefs,      // w4
>> +//      int prefs3,     // w5
>> +//      int mrefs3,     // w6
>> +//      int parity,     // w7       unused
>> +//      int clip_max)   // [sp, #0] unused
>
>This bit is great to have
>
>> +
>> +function ff_bwdif_filter_intra_neon, export=1
>> +        cmp             w2, #0
>> +        ble             99f
>> +
>> +        ldr             q0, coeffs
>> +
>> +//    for (x = 0; x < w; x++) {
>> +10:
>> +
>> +//        interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
>
>I guess the style with intermixed C code is a bit uncommon in our 
>assembly, but as long as it doesn't affect the overall code style I guess 
>we can keep it.

I needed it to track where I was whilst writing the code & if I ever
need to change it I'll be lost without it - so I, at least, rate it as
valuable and in line3 where I am very tight on registers it was
invaluable for keeping track of what referred to what.

>> +        ldr             q31, [x1, w4, SXTW]
>> +        ldr             q30, [x1, w3, SXTW]
>> +        ldr             q29, [x1, w6, SXTW]
>> +        ldr             q28, [x1, w5, SXTW]
>
>Don't use shouty uppercase SXTW here

Will change.

>> +
>> +        uaddl           v20.8h,  v31.8b,  v30.8b
>> +        uaddl2          v21.8h,  v31.16b, v30.16b
>> +
>> +        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]
>> +
>> +        uaddl           v20.8h,  v29.8b,  v28.8b
>> +        uaddl2          v21.8h,  v29.16b, v28.16b
>> +
>> +        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]
>> +
>> +//        dst[0] = av_clip(interpol, 0, clip_max);
>> +        SQSHRUNN        v2, v2, v3, v4, v5, 13
>> +        str             q2, [x0], #16
>> +
>> +//        dst++;
>> +//        cur++;
>> +//    }
>> +
>> +        subs            w2,  w2,  #16
>> +        add             x1,  x1,  #16
>
>For in-order cores, it might be good to update these variables sometime 
>sooner, e.g. before the str instruction. But such scheduling breaks your 
>mapping between neat C code and assembly.

I take your point but there is at least 1 instruction between set and
use which is normally enough.

I did my benching on a Pi4 and found, to my surprise, that in most cases
reordering instructions to interleavse operations made life worse and
seeing as Pi4 is my target platform I stopped trying to do that and
stuck with code that was easier to read! (Also filter_intra seems to be
much more memory b/w limited than code limited on a Pi4.)

JC

>// Martin
>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Martin Storsjö July 2, 2023, 8:18 p.m. UTC | #3
On Sun, 2 Jul 2023, John Cox wrote:

> On Sun, 2 Jul 2023 00:37:35 +0300 (EEST), you wrote:
>
>>> +
>>> +        uaddl           v20.8h,  v31.8b,  v30.8b
>>> +        uaddl2          v21.8h,  v31.16b, v30.16b
>>> +
>>> +        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]
>>> +
>>> +        uaddl           v20.8h,  v29.8b,  v28.8b
>>> +        uaddl2          v21.8h,  v29.16b, v28.16b
>>> +
>>> +        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]
>>> +
>>> +//        dst[0] = av_clip(interpol, 0, clip_max);
>>> +        SQSHRUNN        v2, v2, v3, v4, v5, 13
>>> +        str             q2, [x0], #16
>>> +
>>> +//        dst++;
>>> +//        cur++;
>>> +//    }
>>> +
>>> +        subs            w2,  w2,  #16
>>> +        add             x1,  x1,  #16
>>
>> For in-order cores, it might be good to update these variables sometime
>> sooner, e.g. before the str instruction. But such scheduling breaks your
>> mapping between neat C code and assembly.
>
> I take your point but there is at least 1 instruction between set and
> use which is normally enough.

True, in most cases, it's enough, but sometimes you can save more if 
there's a stall bubble elsewhere too.

> I did my benching on a Pi4 and found, to my surprise, that in most cases
> reordering instructions to interleavse operations made life worse and
> seeing as Pi4 is my target platform I stopped trying to do that and
> stuck with code that was easier to read! (Also filter_intra seems to be
> much more memory b/w limited than code limited on a Pi4.)

A Raspberry Pi 4 is Cortex A72, and that one has got out of order 
execution, so you generally won't be able to notice most of the effects of 
instruction scheduling. On actual in-order cores, like Cortex A53 (found 
in e..g the Pi3 and lots of other places), scheduling can have a fairly 
dramatic effect though.

In any case, here it's not a big concern, and one instruction inbetween 
usually is good enough indeed.

// Martin
diff mbox series

Patch

diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index 86d53b2ca1..3ffaa07ab3 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -24,6 +24,22 @@ 
 #include "libavfilter/bwdif.h"
 #include "libavutil/aarch64/cpu.h"
 
+void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                                int prefs3, int mrefs3, int parity, int clip_max);
+
+
+static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                                int prefs3, int mrefs3, int parity, int clip_max)
+{
+    const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+    ff_bwdif_filter_intra_neon(dst1, cur1, w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
+
+    if (w0 < w)
+        ff_bwdif_filter_intra_c((char *)dst1 + w0, (char *)cur1 + w0,
+                                w - w0, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
+}
+
 void
 ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
 {
@@ -35,5 +51,6 @@  ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
     if (!have_neon(cpu_flags))
         return;
 
+    s->filter_intra = filter_intra_helper;
 }
 
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index a8f0ed525a..b863b3447d 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -69,3 +69,56 @@  coeffs:
         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
         .hword          5077, 981                       // sp[0] = v0.h[6]
 
+// ============================================================================
+//
+// void ff_bwdif_filter_intra_neon(
+//      void *dst1,     // x0
+//      void *cur1,     // x1
+//      int w,          // w2
+//      int prefs,      // w3
+//      int mrefs,      // w4
+//      int prefs3,     // w5
+//      int mrefs3,     // w6
+//      int parity,     // w7       unused
+//      int clip_max)   // [sp, #0] unused
+
+function ff_bwdif_filter_intra_neon, export=1
+        cmp             w2, #0
+        ble             99f
+
+        ldr             q0, coeffs
+
+//    for (x = 0; x < w; x++) {
+10:
+
+//        interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
+        ldr             q31, [x1, w4, SXTW]
+        ldr             q30, [x1, w3, SXTW]
+        ldr             q29, [x1, w6, SXTW]
+        ldr             q28, [x1, w5, SXTW]
+
+        uaddl           v20.8h,  v31.8b,  v30.8b
+        uaddl2          v21.8h,  v31.16b, v30.16b
+
+        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]
+
+        uaddl           v20.8h,  v29.8b,  v28.8b
+        uaddl2          v21.8h,  v29.16b, v28.16b
+
+        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]
+
+//        dst[0] = av_clip(interpol, 0, clip_max);
+        SQSHRUNN        v2, v2, v3, v4, v5, 13
+        str             q2, [x0], #16
+
+//        dst++;
+//        cur++;
+//    }
+
+        subs            w2,  w2,  #16
+        add             x1,  x1,  #16
+        bgt             10b
+
+99:
+        ret
+endfunc