[FFmpeg-devel] avfilter/vf_adadenoise: add x86 SIMD

Submitted by Paul B Mahol on Oct. 15, 2019, 4:23 p.m.

Details

Message ID 20191015162336.16171-1-onemda@gmail.com
State New
Headers show

Commit Message

Paul B Mahol Oct. 15, 2019, 4:23 p.m.
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/atadenoise.h             |  36 +++++++
 libavfilter/vf_atadenoise.c          |  16 +--
 libavfilter/x86/Makefile             |   2 +
 libavfilter/x86/vf_atadenoise.asm    | 150 +++++++++++++++++++++++++++
 libavfilter/x86/vf_atadenoise_init.c |  40 +++++++
 5 files changed, 237 insertions(+), 7 deletions(-)
 create mode 100644 libavfilter/atadenoise.h
 create mode 100644 libavfilter/x86/vf_atadenoise.asm
 create mode 100644 libavfilter/x86/vf_atadenoise_init.c

Patch hide | download patch | download mbox

diff --git a/libavfilter/atadenoise.h b/libavfilter/atadenoise.h
new file mode 100644
index 0000000000..c1fdc2f64e
--- /dev/null
+++ b/libavfilter/atadenoise.h
@@ -0,0 +1,36 @@ 
+ /*
+ * Copyright (c) 2019 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_ATADENOISE_H
+#define AVFILTER_ATADENOISE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct ATADenoiseDSPContext {
+    void (*filter_row)(const uint8_t *src, uint8_t *dst,
+                       const uint8_t **srcf,
+                       int w, int mid, int size,
+                       int thra, int thrb);
+} ATADenoiseDSPContext;
+
+void ff_atadenoise_init_x86(ATADenoiseDSPContext *dsp, int depth);
+
+#endif /* AVFILTER_ATADENOISE_H */
diff --git a/libavfilter/vf_atadenoise.c b/libavfilter/vf_atadenoise.c
index be7c4e2a34..d85fa79961 100644
--- a/libavfilter/vf_atadenoise.c
+++ b/libavfilter/vf_atadenoise.c
@@ -33,6 +33,7 @@ 
 #define FF_BUFQUEUE_SIZE 129
 #include "bufferqueue.h"
 
+#include "atadenoise.h"
 #include "formats.h"
 #include "internal.h"
 #include "video.h"
@@ -57,10 +58,8 @@  typedef struct ATADenoiseContext {
     int available;
 
     int (*filter_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
-    void (*filter_row)(const uint8_t *src, uint8_t *dst,
-                       const uint8_t *srcf[SIZE],
-                       int w, int mid, int size,
-                       int thra, int thrb);
+
+    ATADenoiseDSPContext dsp;
 } ATADenoiseContext;
 
 #define OFFSET(x) offsetof(ATADenoiseContext, x)
@@ -209,7 +208,7 @@  static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
             srcf[i] = data[i] + slice_start * linesize[i];
 
         for (y = slice_start; y < slice_end; y++) {
-            s->filter_row(src, dst, srcf, w, mid, size, thra, thrb);
+            s->dsp.filter_row(src, dst, srcf, w, mid, size, thra, thrb);
 
             dst += out->linesize[p];
             src += in->linesize[p];
@@ -239,9 +238,9 @@  static int config_input(AVFilterLink *inlink)
     depth = desc->comp[0].depth;
     s->filter_slice = filter_slice;
     if (depth == 8)
-        s->filter_row = filter_row8;
+        s->dsp.filter_row = filter_row8;
     else
-        s->filter_row = filter_row16;
+        s->dsp.filter_row = filter_row16;
 
     s->thra[0] = s->fthra[0] * (1 << depth) - 1;
     s->thra[1] = s->fthra[1] * (1 << depth) - 1;
@@ -250,6 +249,9 @@  static int config_input(AVFilterLink *inlink)
     s->thrb[1] = s->fthrb[1] * (1 << depth) - 1;
     s->thrb[2] = s->fthrb[2] * (1 << depth) - 1;
 
+    if (ARCH_X86)
+        ff_atadenoise_init_x86(&s->dsp, depth);
+
     return 0;
 }
 
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 4cd914366a..06f832e36c 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -2,6 +2,7 @@  OBJS-$(CONFIG_SCENE_SAD)                     += x86/scene_sad_init.o
 
 OBJS-$(CONFIG_AFIR_FILTER)                   += x86/af_afir_init.o
 OBJS-$(CONFIG_ANLMDN_FILTER)                 += x86/af_anlmdn_init.o
+OBJS-$(CONFIG_ATADENOISE_FILTER)             += x86/vf_atadenoise_init.o
 OBJS-$(CONFIG_BLEND_FILTER)                  += x86/vf_blend_init.o
 OBJS-$(CONFIG_BWDIF_FILTER)                  += x86/vf_bwdif_init.o
 OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
@@ -39,6 +40,7 @@  X86ASM-OBJS-$(CONFIG_SCENE_SAD)              += x86/scene_sad.o
 
 X86ASM-OBJS-$(CONFIG_AFIR_FILTER)            += x86/af_afir.o
 X86ASM-OBJS-$(CONFIG_ANLMDN_FILTER)          += x86/af_anlmdn.o
+X86ASM-OBJS-$(CONFIG_ATADENOISE_FILTER)      += x86/vf_atadenoise.o
 X86ASM-OBJS-$(CONFIG_BLEND_FILTER)           += x86/vf_blend.o
 X86ASM-OBJS-$(CONFIG_BWDIF_FILTER)           += x86/vf_bwdif.o
 X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER)      += x86/colorspacedsp.o
diff --git a/libavfilter/x86/vf_atadenoise.asm b/libavfilter/x86/vf_atadenoise.asm
new file mode 100644
index 0000000000..54ea08a0b7
--- /dev/null
+++ b/libavfilter/x86/vf_atadenoise.asm
@@ -0,0 +1,150 @@ 
+;*****************************************************************************
+;* x86-optimized functions for blend filter
+;*
+;* Copyright (C) 2019 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%if ARCH_X86_64
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_one:  times 8 dw 1
+pw_ones: times 8 dw 65535
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_filter_row(const uint8_t *src, uint8_t *dst,
+;                    const uint8_t **srcf,
+;                    int w, int mid, int size,
+;                    int thra, int thrb)
+;------------------------------------------------------------------------------
+
+INIT_XMM sse4
+cglobal atadenoise_filter_row8, 8,10,13, src, dst, srcf, w, mid, size, i, j, srcfx, x
+    movsxdifnidn    wq, wd
+    movsxdifnidn  midq, midd
+    movsxdifnidn sizeq, sized
+    add           srcq, wq
+    add           dstq, wq
+    mov             xq, wq
+    dec          sizeq
+    neg             xq
+    movd            m4, r6m
+    SPLATW          m4, m4
+    movd            m5, r7m
+    SPLATW          m5, m5
+    pxor            m2, m2
+    mova           m10, [pw_ones]
+
+    .loop:
+        mov         iq, midq
+        mov         jq, midq
+        pxor        m3, m3
+        pxor       m11, m11
+        movu        m0, [srcq + xq]
+        punpcklbw   m0, m2
+        mova        m7, m0
+        mova        m8, [pw_one]
+        mova       m12, [pw_ones]
+
+        .loop0:
+            inc              iq
+            dec              jq
+
+            mov          srcfxq, [srcfq + jq * 8]
+            add          srcfxq, wq
+
+            movu             m1, [srcfxq + xq]
+            punpcklbw        m1, m2
+            mova             m9, m1
+            psubw            m1, m0
+            pabsw            m1, m1
+            paddw           m11, m1
+            pcmpgtw          m1, m4
+            mova             m6, m11
+            pcmpgtw          m6, m5
+            por              m6, m1
+            pxor             m6, m10
+            pand            m12, m6
+            pand             m9, m12
+            paddw            m7, m9
+            mova             m6, m12
+            psrlw            m6, 15
+            paddw            m8, m6
+
+            mov          srcfxq, [srcfq + iq * 8]
+            add          srcfxq, wq
+
+            movu             m1, [srcfxq + xq]
+            punpcklbw        m1, m2
+            mova             m9, m1
+            psubw            m1, m0
+            pabsw            m1, m1
+            paddw            m3, m1
+            pcmpgtw          m1, m4
+            mova             m6, m3
+            pcmpgtw          m6, m5
+            por              m6, m1
+            pxor             m6, m10
+            pand            m12, m6
+            pand             m9, m12
+            paddw            m7, m9
+            mova             m6, m12
+            psrlw            m6, 15
+            paddw            m8, m6
+
+            ptest           m12, m12
+            jz .finish
+
+            cmp              iq, sizeq
+            jl .loop0
+
+    .finish:
+        mova                 m1, m7
+        mova                 m6, m8
+
+        punpcklwd            m7, m2
+        punpcklwd            m8, m2
+        cvtdq2ps             m7, m7
+        cvtdq2ps             m8, m8
+        divps                m7, m8
+        cvttps2dq            m7, m7
+        packssdw             m7, m7
+        packuswb             m7, m7
+
+        movd        [dstq + xq], m7
+
+        punpckhwd            m1, m2
+        punpckhwd            m6, m2
+        cvtdq2ps             m1, m1
+        cvtdq2ps             m6, m6
+        divps                m1, m6
+        cvttps2dq            m1, m1
+        packssdw             m1, m1
+        packuswb             m1, m1
+
+        movd    [dstq + xq + 4], m1
+
+        add                  xq, mmsize/2
+    jl .loop
+    RET
+
+%endif
diff --git a/libavfilter/x86/vf_atadenoise_init.c b/libavfilter/x86/vf_atadenoise_init.c
new file mode 100644
index 0000000000..5ddfb243f2
--- /dev/null
+++ b/libavfilter/x86/vf_atadenoise_init.c
@@ -0,0 +1,40 @@ 
+/*
+ * Copyright (C) 2019 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/atadenoise.h"
+
+void ff_atadenoise_filter_row8_sse4(const uint8_t *src, uint8_t *dst,
+                                    const uint8_t **srcf,
+                                    int w, int mid, int size,
+                                    int thra, int thrb);
+
+av_cold void ff_atadenoise_init_x86(ATADenoiseDSPContext *dsp, int depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) && depth <= 8) {
+        dsp->filter_row = ff_atadenoise_filter_row8_sse4;
+    }
+}