diff mbox series

[FFmpeg-devel,v2,06/15] avfilter/vf_bwdif: Add clip and spatial macros for aarch64 neon

Message ID 20230702123242.232484-7-jc@kynesim.co.uk
State New
Headers show
Series avfilter/vf_bwdif: Add aarch64 neon functions | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 fail Make fate failed

Commit Message

John Cox July 2, 2023, 12:32 p.m. UTC
Signed-off-by: John Cox <jc@kynesim.co.uk>
---
 libavfilter/aarch64/vf_bwdif_neon.S | 73 +++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

Comments

Lynne July 2, 2023, 2:02 p.m. UTC | #1
Jul 2, 2023, 14:34 by jc@kynesim.co.uk:

> Signed-off-by: John Cox <jc@kynesim.co.uk>
> ---
>  libavfilter/aarch64/vf_bwdif_neon.S | 73 +++++++++++++++++++++++++++++
>  1 file changed, 73 insertions(+)
>
> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
> index 6a614f8d6e..48dc7bcd9d 100644
> --- a/libavfilter/aarch64/vf_bwdif_neon.S
> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
> @@ -66,6 +66,79 @@
>  umlsl2          \a3\().4s, \s1\().8h, \k
>  .endm
>  
> +//      int b = m2s1 - m1;
> +//      int f = p2s1 - p1;
> +//      int dc = c0s1 - m1;
> +//      int de = c0s1 - p1;
> +//      int sp_max = FFMIN(p1 - c0s1, m1 - c0s1);
> +//      sp_max = FFMIN(sp_max, FFMAX(-b,-f));
> +//      int sp_min = FFMIN(c0s1 - p1, c0s1 - m1);
> +//      sp_min = FFMIN(sp_min, FFMAX(b,f));
> +//      diff = diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
> +.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3
> +        uqsub           \t0\().16b, \p1\().16b, \c0s1\().16b
> +        uqsub           \t2\().16b, \m1\().16b, \c0s1\().16b
> +        umin            \t2\().16b, \t0\().16b, \t2\().16b
> +
> +        uqsub           \t1\().16b, \m1\().16b, \m2s1\().16b
> +        uqsub           \t3\().16b, \p1\().16b, \p2s1\().16b
> +        umax            \t3\().16b, \t3\().16b, \t1\().16b
> +        umin            \t3\().16b, \t3\().16b, \t2\().16b
> +
> +        uqsub           \t0\().16b, \c0s1\().16b, \p1\().16b
> +        uqsub           \t2\().16b, \c0s1\().16b, \m1\().16b
> +        umin            \t2\().16b, \t0\().16b, \t2\().16b
> +
> +        uqsub           \t1\().16b, \m2s1\().16b, \m1\().16b
> +        uqsub           \t0\().16b, \p2s1\().16b, \p1\().16b
> +        umax            \t0\().16b, \t0\().16b, \t1\().16b
> +        umin            \t2\().16b, \t2\().16b, \t0\().16b
> +
> +        cmeq            \t1\().16b, \diff\().16b, #0
> +        umax            \diff\().16b, \diff\().16b, \t3\().16b
> +        umax            \diff\().16b, \diff\().16b, \t2\().16b
> +        bic             \diff\().16b, \diff\().16b, \t1\().16b
> +.endm
> +
> +//      i0 = s0;
> +//      if (i0 > d0 + diff0)
> +//          i0 = d0 + diff0;
> +//      else if (i0 < d0 - diff0)
> +//          i0 = d0 - diff0;
> +//
> +// i0 = s0 is safe
> +.macro DIFF_CLIP i0, s0, d0, diff, t0, t1
> +        uqadd           \t0\().16b, \d0\().16b, \diff\().16b
> +        uqsub           \t1\().16b, \d0\().16b, \diff\().16b
> +        umin            \i0\().16b, \s0\().16b, \t0\().16b
> +        umax            \i0\().16b, \i0\().16b, \t1\().16b
> +.endm
> +
> +//      i0 = FFABS(m1 - p1) > td0 ? i1 : i2;
> +//      DIFF_CLIP
> +//
> +// i0 = i1 is safe
> +.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2
> +        uabd            \t0\().16b, \m1\().16b, \p1\().16b
> +        cmhi            \t0\().16b, \t0\().16b, \td0\().16b
> +        bsl             \t0\().16b, \i1\().16b, \i2\().16b
> +        DIFF_CLIP       \i0, \t0, \d0, \diff, \t1, \t2
> +.endm
> +
> +.macro PUSH_VREGS
> +        stp             d8,  d9,  [sp, #-64]!
> +        stp             d10, d11, [sp, #16]
> +        stp             d12, d13, [sp, #32]
> +        stp             d14, d15, [sp, #48]
> +.endm
> +
> +.macro POP_VREGS
> +        ldp             d14, d15, [sp, #48]
> +        ldp             d12, d13, [sp, #32]
> +        ldp             d10, d11, [sp, #16]
> +        ldp             d8,  d9,  [sp], #64
> +.endm
>

Could you squash? Adding empty files and then commit by
commit filling them up is pointless and makes it harder to
review. Just export what you need in one commit, and add
everything else in another.

Also, keep in mind the final spatial clip at the end should be
removable. I discovered it makes the filter look quite a lot
better. Currently, only the Vulkan version does it, but we're
looking into changing the C/asm versions too, and you're the
second one to rush into implementing asm for it before we've
had a chance to discuss it properly.
Kieran Kunhya July 2, 2023, 2:09 p.m. UTC | #2
On Sun, 2 Jul 2023, 16:02 Lynne <dev@lynne.ee> wrote:

>
> Also, keep in mind the final spatial clip at the end should be
> removable. I discovered it makes the filter look quite a lot
> better. Currently, only the Vulkan version does it, but we're
> looking into changing the C/asm versions too, and you're the
> second one to rush into implementing asm for it before we've
> had a chance to discuss it properly.
>

Didn't Thomas Mundt ask you to provide samples to back up your claim?

Kieran

>
Lynne July 2, 2023, 4:55 p.m. UTC | #3
Jul 2, 2023, 16:09 by kierank@obe.tv:

> On Sun, 2 Jul 2023, 16:02 Lynne <dev@lynne.ee> wrote:
>
>>
>> Also, keep in mind the final spatial clip at the end should be
>> removable. I discovered it makes the filter look quite a lot
>> better. Currently, only the Vulkan version does it, but we're
>> looking into changing the C/asm versions too, and you're the
>> second one to rush into implementing asm for it before we've
>> had a chance to discuss it properly.
>>
>
> Didn't Thomas Mundt ask you to provide samples to back up your claim?
>

I haven't had time to.
Until you made me just now.
diff mbox series

Patch

diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index 6a614f8d6e..48dc7bcd9d 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -66,6 +66,79 @@ 
         umlsl2          \a3\().4s, \s1\().8h, \k
 .endm
 
+// Spatial check on 16 unsigned bytes per register; updates diff, clobbers t0-t3.
+// Negative terms saturate to 0 (uqsub), which cannot lower an unsigned max.
+//      int b = m2s1 - m1, f = p2s1 - p1;
+//      int dc = c0s1 - m1, de = c0s1 - p1;
+//      int sp_max = FFMIN(p1 - c0s1, m1 - c0s1);      // = FFMIN(-de, -dc)
+//      sp_max = FFMIN(sp_max, FFMAX(-b,-f));
+//      int sp_min = FFMIN(c0s1 - p1, c0s1 - m1);      // = FFMIN(de, dc)
+//      sp_min = FFMIN(sp_min, FFMAX(b,f));
+//      diff = diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
+.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3
+        uqsub           \t0\().16b, \p1\().16b, \c0s1\().16b    // t0 = sat(p1 - c0s1)
+        uqsub           \t2\().16b, \m1\().16b, \c0s1\().16b    // t2 = sat(m1 - c0s1)
+        umin            \t2\().16b, \t0\().16b, \t2\().16b      // t2 = FFMIN(t0, t2)
+        // t3 = sp_max = FFMIN(t2, FFMAX(-b, -f))
+        uqsub           \t1\().16b, \m1\().16b, \m2s1\().16b    // t1 = sat(-b)
+        uqsub           \t3\().16b, \p1\().16b, \p2s1\().16b    // t3 = sat(-f)
+        umax            \t3\().16b, \t3\().16b, \t1\().16b      // t3 = FFMAX(-b, -f)
+        umin            \t3\().16b, \t3\().16b, \t2\().16b      // t3 = sp_max
+        // t2 = FFMIN(c0s1 - p1, c0s1 - m1), the first half of sp_min
+        uqsub           \t0\().16b, \c0s1\().16b, \p1\().16b    // t0 = sat(c0s1 - p1)
+        uqsub           \t2\().16b, \c0s1\().16b, \m1\().16b    // t2 = sat(c0s1 - m1)
+        umin            \t2\().16b, \t0\().16b, \t2\().16b      // t2 = FFMIN(t0, t2)
+        // t2 = sp_min = FFMIN(t2, FFMAX(b, f))
+        uqsub           \t1\().16b, \m2s1\().16b, \m1\().16b    // t1 = sat(b)
+        uqsub           \t0\().16b, \p2s1\().16b, \p1\().16b    // t0 = sat(f)
+        umax            \t0\().16b, \t0\().16b, \t1\().16b      // t0 = FFMAX(b, f)
+        umin            \t2\().16b, \t2\().16b, \t0\().16b      // t2 = sp_min
+        // diff = FFMAX3(diff, sp_min, sp_max), forced back to 0 in lanes where diff was 0
+        cmeq            \t1\().16b, \diff\().16b, #0            // t1 = all-ones where diff == 0 (taken before diff changes)
+        umax            \diff\().16b, \diff\().16b, \t3\().16b  // diff = FFMAX(diff, sp_max)
+        umax            \diff\().16b, \diff\().16b, \t2\().16b  // diff = FFMAX(diff, sp_min)
+        bic             \diff\().16b, \diff\().16b, \t1\().16b  // zero the lanes flagged in t1
+.endm
+
+//      i0 = s0;
+//      if (i0 > d0 + diff)
+//          i0 = d0 + diff;
+//      else if (i0 < d0 - diff)
+//          i0 = d0 - diff;
+// Per-byte unsigned clamp of s0; the bounds saturate at 0 and 255. Clobbers t0, t1.
+// i0 may be the same register as s0 (s0 is not read after i0 is first written).
+.macro DIFF_CLIP i0, s0, d0, diff, t0, t1
+        uqadd           \t0\().16b, \d0\().16b, \diff\().16b    // t0 = upper bound, sat(d0 + diff)
+        uqsub           \t1\().16b, \d0\().16b, \diff\().16b    // t1 = lower bound, sat(d0 - diff)
+        umin            \i0\().16b, \s0\().16b, \t0\().16b      // i0 = FFMIN(s0, upper bound)
+        umax            \i0\().16b, \i0\().16b, \t1\().16b      // i0 = FFMAX(i0, lower bound)
+.endm
+
+//      i0 = FFABS(m1 - p1) > td0 ? i1 : i2;
+//      DIFF_CLIP
+// Per-byte select, then the result is clamped to d0 +/- diff. Clobbers t0, t1, t2.
+// i0 may be the same register as i1 (i1 is read before i0 is written).
+.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2
+        uabd            \t0\().16b, \m1\().16b, \p1\().16b      // t0 = FFABS(m1 - p1), unsigned bytes
+        cmhi            \t0\().16b, \t0\().16b, \td0\().16b     // t0 = all-ones where t0 > td0 (unsigned)
+        bsl             \t0\().16b, \i1\().16b, \i2\().16b      // t0 = mask ? i1 : i2, selected bitwise
+        DIFF_CLIP       \i0, \t0, \d0, \diff, \t1, \t2          // i0 = t0 clamped to d0 +/- diff
+.endm
+
+.macro PUSH_VREGS
+        stp             d8,  d9,  [sp, #-64]!   // pre-index: sp -= 64 first, then d8, d9 stored at [sp]
+        stp             d10, d11, [sp, #16]     // d8-d15 are the callee-saved low halves of v8-v15
+        stp             d12, d13, [sp, #32]
+        stp             d14, d15, [sp, #48]     // last pair fills the top of the 64-byte area
+.endm
+
+.macro POP_VREGS
+        ldp             d14, d15, [sp, #48]     // reload d8-d15 from the offsets PUSH_VREGS stored them at
+        ldp             d12, d13, [sp, #32]
+        ldp             d10, d11, [sp, #16]
+        ldp             d8,  d9,  [sp], #64     // post-index: load from [sp], then sp += 64 releases the area
+.endm
+
 // static const uint16_t coef_lf[2] = { 4309, 213 };
 // static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
 // static const uint16_t coef_sp[2] = { 5077, 981 };