[FFmpeg-devel] avfilter/vf_atadenoise: add x86 SIMD

Submitted by Paul B Mahol on Oct. 15, 2019, 3:21 p.m.

Details

Message ID 20191015152138.3552-1-onemda@gmail.com
State New
Headers show

Commit Message

Paul B Mahol Oct. 15, 2019, 3:21 p.m.
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/atadenoise.h             |  36 +++++++
 libavfilter/vf_atadenoise.c          |  16 +--
 libavfilter/x86/Makefile             |   2 +
 libavfilter/x86/vf_atadenoise.asm    | 147 +++++++++++++++++++++++++++
 libavfilter/x86/vf_atadenoise_init.c |  40 ++++++++
 5 files changed, 234 insertions(+), 7 deletions(-)
 create mode 100644 libavfilter/atadenoise.h
 create mode 100644 libavfilter/x86/vf_atadenoise.asm
 create mode 100644 libavfilter/x86/vf_atadenoise_init.c

Comments

James Almer Oct. 15, 2019, 3:51 p.m.
On 10/15/2019 12:21 PM, Paul B Mahol wrote:
> +;------------------------------------------------------------------------------
> +; void ff_filter_row(const uint8_t *src, uint8_t *dst,
> +;                    const uint8_t **srcf,
> +;                    int w, int mid, int size,
> +;                    int thra, int thrb)
> +;------------------------------------------------------------------------------
> +
> +INIT_XMM sse4
> +cglobal atadenoise_filter_row8, 8,12,13, src, dst, srcf, w, mid, size, thra, thrb, i, j, srcfx, x

You need to sign extend all the int arguments. And if you're not using
thra and thrb, then don't load them and save two gprs.
Paul B Mahol Oct. 15, 2019, 3:55 p.m.
On 10/15/19, James Almer <jamrial@gmail.com> wrote:
> On 10/15/2019 12:21 PM, Paul B Mahol wrote:
>> +;------------------------------------------------------------------------------
>> +; void ff_filter_row(const uint8_t *src, uint8_t *dst,
>> +;                    const uint8_t **srcf,
>> +;                    int w, int mid, int size,
>> +;                    int thra, int thrb)
>> +;------------------------------------------------------------------------------
>> +
>> +INIT_XMM sse4
>> +cglobal atadenoise_filter_row8, 8,12,13, src, dst, srcf, w, mid, size,
>> thra, thrb, i, j, srcfx, x
>
> You need to sign extend all the int arguments. And if you're not using
> thra and thrb, then don't load them and save two gprs.

Sorry but they are used.

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
James Almer Oct. 15, 2019, 3:58 p.m.
On 10/15/2019 12:55 PM, Paul B Mahol wrote:
> On 10/15/19, James Almer <jamrial@gmail.com> wrote:
>> On 10/15/2019 12:21 PM, Paul B Mahol wrote:
>>> +;------------------------------------------------------------------------------
>>> +; void ff_filter_row(const uint8_t *src, uint8_t *dst,
>>> +;                    const uint8_t **srcf,
>>> +;                    int w, int mid, int size,
>>> +;                    int thra, int thrb)
>>> +;------------------------------------------------------------------------------
>>> +
>>> +INIT_XMM sse4
>>> +cglobal atadenoise_filter_row8, 8,12,13, src, dst, srcf, w, mid, size,
>>> thra, thrb, i, j, srcfx, x
>>
>> You need to sign extend all the int arguments. And if you're not using
>> thra and thrb, then don't load them and save two gprs.
> 
> Sorry but they are used.

You're loading them from memory manually using r6m and r7m after having
the prologue code load them to gprs in the cglobal line. Since you don't
need them in r6q and r7q, just reuse them for i and j.

Patch hide | download patch | download mbox

diff --git a/libavfilter/atadenoise.h b/libavfilter/atadenoise.h
new file mode 100644
index 0000000000..c1fdc2f64e
--- /dev/null
+++ b/libavfilter/atadenoise.h
@@ -0,0 +1,36 @@ 
+ /*
+ * Copyright (c) 2019 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_ATADENOISE_H
+#define AVFILTER_ATADENOISE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/**
+ * Function table for the atadenoise row filter.
+ *
+ * The generic code fills in a C implementation and an architecture-specific
+ * init routine may replace the pointer with an optimized version.
+ */
+typedef struct ATADenoiseDSPContext {
+    /**
+     * Filter one row of pixels.
+     *
+     * @param src   row of the frame being filtered
+     * @param dst   output row
+     * @param srcf  per-frame pointers to the same row in each queued frame
+     * @param w     row width (presumably in pixels; confirm for >8-bit
+     *              implementations)
+     * @param mid   index into srcf of the frame being filtered; neighbours
+     *              are visited outwards from it
+     * @param size  number of entries in srcf
+     * @param thra  threshold applied to each per-frame absolute difference
+     * @param thrb  threshold applied to the accumulated absolute difference
+     */
+    void (*filter_row)(const uint8_t *src, uint8_t *dst,
+                       const uint8_t **srcf,
+                       int w, int mid, int size,
+                       int thra, int thrb);
+} ATADenoiseDSPContext;
+
+/**
+ * Install x86-optimized implementations into dsp where the running CPU
+ * supports them; entries are left untouched otherwise.
+ *
+ * @param dsp   function table to update
+ * @param depth bit depth of the pixel components
+ */
+void ff_atadenoise_init_x86(ATADenoiseDSPContext *dsp, int depth);
+
+#endif /* AVFILTER_ATADENOISE_H */
diff --git a/libavfilter/vf_atadenoise.c b/libavfilter/vf_atadenoise.c
index be7c4e2a34..d85fa79961 100644
--- a/libavfilter/vf_atadenoise.c
+++ b/libavfilter/vf_atadenoise.c
@@ -33,6 +33,7 @@ 
 #define FF_BUFQUEUE_SIZE 129
 #include "bufferqueue.h"
 
+#include "atadenoise.h"
 #include "formats.h"
 #include "internal.h"
 #include "video.h"
@@ -57,10 +58,8 @@  typedef struct ATADenoiseContext {
     int available;
 
     int (*filter_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
-    void (*filter_row)(const uint8_t *src, uint8_t *dst,
-                       const uint8_t *srcf[SIZE],
-                       int w, int mid, int size,
-                       int thra, int thrb);
+
+    ATADenoiseDSPContext dsp;
 } ATADenoiseContext;
 
 #define OFFSET(x) offsetof(ATADenoiseContext, x)
@@ -209,7 +208,7 @@  static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
             srcf[i] = data[i] + slice_start * linesize[i];
 
         for (y = slice_start; y < slice_end; y++) {
-            s->filter_row(src, dst, srcf, w, mid, size, thra, thrb);
+            s->dsp.filter_row(src, dst, srcf, w, mid, size, thra, thrb);
 
             dst += out->linesize[p];
             src += in->linesize[p];
@@ -239,9 +238,9 @@  static int config_input(AVFilterLink *inlink)
     depth = desc->comp[0].depth;
     s->filter_slice = filter_slice;
     if (depth == 8)
-        s->filter_row = filter_row8;
+        s->dsp.filter_row = filter_row8;
     else
-        s->filter_row = filter_row16;
+        s->dsp.filter_row = filter_row16;
 
     s->thra[0] = s->fthra[0] * (1 << depth) - 1;
     s->thra[1] = s->fthra[1] * (1 << depth) - 1;
@@ -250,6 +249,9 @@  static int config_input(AVFilterLink *inlink)
     s->thrb[1] = s->fthrb[1] * (1 << depth) - 1;
     s->thrb[2] = s->fthrb[2] * (1 << depth) - 1;
 
+    if (ARCH_X86)
+        ff_atadenoise_init_x86(&s->dsp, depth);
+
     return 0;
 }
 
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 4cd914366a..06f832e36c 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -2,6 +2,7 @@  OBJS-$(CONFIG_SCENE_SAD)                     += x86/scene_sad_init.o
 
 OBJS-$(CONFIG_AFIR_FILTER)                   += x86/af_afir_init.o
 OBJS-$(CONFIG_ANLMDN_FILTER)                 += x86/af_anlmdn_init.o
+OBJS-$(CONFIG_ATADENOISE_FILTER)             += x86/vf_atadenoise_init.o
 OBJS-$(CONFIG_BLEND_FILTER)                  += x86/vf_blend_init.o
 OBJS-$(CONFIG_BWDIF_FILTER)                  += x86/vf_bwdif_init.o
 OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
@@ -39,6 +40,7 @@  X86ASM-OBJS-$(CONFIG_SCENE_SAD)              += x86/scene_sad.o
 
 X86ASM-OBJS-$(CONFIG_AFIR_FILTER)            += x86/af_afir.o
 X86ASM-OBJS-$(CONFIG_ANLMDN_FILTER)          += x86/af_anlmdn.o
+X86ASM-OBJS-$(CONFIG_ATADENOISE_FILTER)      += x86/vf_atadenoise.o
 X86ASM-OBJS-$(CONFIG_BLEND_FILTER)           += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
diff --git a/libavfilter/x86/vf_atadenoise.asm b/libavfilter/x86/vf_atadenoise.asm
new file mode 100644
index 0000000000..cc3abe47bd
--- /dev/null
+++ b/libavfilter/x86/vf_atadenoise.asm
@@ -0,0 +1,147 @@ 
+;*****************************************************************************
+;* x86-optimized functions for atadenoise filter
+;*
+;* Copyright (C) 2019 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%if ARCH_X86_64
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_one:  times 8 dw 1
+pw_ones: times 8 dw 65535
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_atadenoise_filter_row8(const uint8_t *src, uint8_t *dst,
+;                                const uint8_t **srcf,
+;                                int w, int mid, int size,
+;                                int thra, int thrb)
+;
+; Filters one row of 8-bit pixels, 8 pixels per outer iteration.
+;
+; For every pixel the routine starts with the value from src and walks the
+; frame rows in srcf[] outwards from index mid (mid-1 and mid+1, then mid-2
+; and mid+2, ...). A neighbouring pixel is added to the running sum as long
+; as its absolute difference to the src pixel does not exceed thra and the
+; accumulated difference for its direction does not exceed thrb. The first
+; failure in either direction permanently stops accumulation for that pixel.
+; The output is the sum divided by the number of accumulated pixels,
+; truncated towards zero.
+;
+; thra and thrb are read straight from their argument slots (r6m/r7m) and
+; broadcast as 16-bit words; comparisons against them are signed (pcmpgtw).
+;
+; NOTE: each outer iteration loads 16 bytes from src/srcf rows (only the low
+; 8 are used) and stores 8 bytes to dst, so when w is not a multiple of 8 the
+; last iteration reads and writes past w. Callers must provide padded rows.
+;------------------------------------------------------------------------------
+
+INIT_XMM sse4
+cglobal atadenoise_filter_row8, 6,10,13, src, dst, srcf, w, mid, size, i, j, srcfx, x
+    ; int arguments are used as 64-bit offsets/indices below, so their upper
+    ; halves must be defined: sign extend them first.
+    movsxdifnidn    wq, wd
+    movsxdifnidn  midq, midd
+    movsxdifnidn sizeq, sized
+    add         srcq, wq            ; point past the row end; x runs from -w up to 0
+    add         dstq, wq
+    mov           xq, wq
+    dec        sizeq                ; inner loop continues while i < size - 1
+    neg           xq
+    movd          m4, r6m           ; thra, loaded from its argument slot
+    SPLATW        m4, m4
+    movd          m5, r7m           ; thrb, loaded from its argument slot
+    SPLATW        m5, m5
+    pxor          m2, m2            ; zero, used for unpacking
+    mova         m10, [pw_ones]     ; all-ones, used to invert masks
+
+    .loop:
+        mov         iq, midq
+        mov         jq, midq
+        pxor        m3, m3              ; accumulated difference, i direction
+        pxor       m11, m11             ; accumulated difference, j direction
+        movu        m0, [srcq + xq]
+        punpcklbw   m0, m2              ; 8 src pixels as words
+        mova        m7, m0              ; running sum, seeded with src
+        mova        m8, [pw_one]        ; running count, seeded with 1
+        mova       m12, [pw_ones]       ; per-pixel "still accumulating" mask
+
+        .loop0:
+            inc              iq
+            dec              jq
+
+            mov          srcfxq, [srcfq + jq * 8]
+            add          srcfxq, wq
+
+            movu             m1, [srcfxq + xq]
+            punpcklbw        m1, m2
+            mova             m9, m1
+            psubw            m1, m0
+            pabsw            m1, m1         ; |srcf[j] - src|
+            paddw           m11, m1
+            pcmpgtw          m1, m4         ; diff > thra
+            mova             m6, m11
+            pcmpgtw          m6, m5         ; accumulated diff > thrb
+            por              m6, m1
+            pxor             m6, m10        ; invert: lanes that passed both tests
+            pand            m12, m6         ; a failed lane stays masked off
+            pand             m9, m12
+            paddw            m7, m9
+            mova             m6, m12
+            psrlw            m6, 15         ; 0xFFFF -> 1 for active lanes
+            paddw            m8, m6
+
+            mov          srcfxq, [srcfq + iq * 8]
+            add          srcfxq, wq
+
+            movu             m1, [srcfxq + xq]
+            punpcklbw        m1, m2
+            mova             m9, m1
+            psubw            m1, m0
+            pabsw            m1, m1         ; |srcf[i] - src|
+            paddw            m3, m1
+            pcmpgtw          m1, m4
+            mova             m6, m3
+            pcmpgtw          m6, m5
+            por              m6, m1
+            pxor             m6, m10
+            pand            m12, m6
+            pand             m9, m12
+            paddw            m7, m9
+            mova             m6, m12
+            psrlw            m6, 15
+            paddw            m8, m6
+
+            ptest           m12, m12        ; stop early once no lane is active
+            jz .finish
+
+            cmp              iq, sizeq
+            jl .loop0
+
+    .finish:
+        mova                 m1, m7
+        mova                 m6, m8
+
+        ; sum / count for the low four pixels, done in single precision
+        punpcklwd            m7, m2
+        punpcklwd            m8, m2
+        cvtdq2ps             m7, m7
+        cvtdq2ps             m8, m8
+        divps                m7, m8
+        cvttps2dq            m7, m7
+        packssdw             m7, m7
+        packuswb             m7, m7
+
+        movd        [dstq + xq], m7
+
+        ; same for the high four pixels
+        punpckhwd            m1, m2
+        punpckhwd            m6, m2
+        cvtdq2ps             m1, m1
+        cvtdq2ps             m6, m6
+        divps                m1, m6
+        cvttps2dq            m1, m1
+        packssdw             m1, m1
+        packuswb             m1, m1
+
+        movd    [dstq + xq + 4], m1
+
+        add                  xq, mmsize/2
+    jl .loop
+    RET
+
+%endif
diff --git a/libavfilter/x86/vf_atadenoise_init.c b/libavfilter/x86/vf_atadenoise_init.c
new file mode 100644
index 0000000000..5ddfb243f2
--- /dev/null
+++ b/libavfilter/x86/vf_atadenoise_init.c
@@ -0,0 +1,40 @@ 
+/*
+ * Copyright (C) 2019 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/atadenoise.h"
+
+/* SSE4 row filter implemented in libavfilter/x86/vf_atadenoise.asm. */
+void ff_atadenoise_filter_row8_sse4(const uint8_t *src, uint8_t *dst,
+                                    const uint8_t **srcf,
+                                    int w, int mid, int size,
+                                    int thra, int thrb);
+
+/*
+ * Pick x86 SIMD row filters for the given bit depth. dsp is left untouched
+ * unless the build targets x86-64, the CPU reports external SSE4 support and
+ * depth is at most 8.
+ */
+av_cold void ff_atadenoise_init_x86(ATADenoiseDSPContext *dsp, int depth)
+{
+    const int cpu_flags = av_get_cpu_flags();
+
+    /* ARCH_X86_64 stays the leading operand: the asm is only assembled for
+     * x86-64, so the symbol reference must fold away on other targets. */
+    if (ARCH_X86_64 && depth <= 8 && EXTERNAL_SSE4(cpu_flags))
+        dsp->filter_row = ff_atadenoise_filter_row8_sse4;
+}