diff mbox

[FFmpeg-devel] avfilter/vf_maskedclamp: add x86 SIMD

Message ID 20191023064011.14077-1-onemda@gmail.com
State Accepted
Commit ac0f5f4c1717470c0254879cb3ac164af0d47727
Headers show

Commit Message

Paul B Mahol Oct. 23, 2019, 6:40 a.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/maskedclamp.h             | 35 ++++++++++
 libavfilter/vf_maskedclamp.c          | 18 +++--
 libavfilter/x86/Makefile              |  2 +
 libavfilter/x86/vf_maskedclamp.asm    | 95 +++++++++++++++++++++++++++
 libavfilter/x86/vf_maskedclamp_init.c | 47 +++++++++++++
 5 files changed, 190 insertions(+), 7 deletions(-)
 create mode 100644 libavfilter/maskedclamp.h
 create mode 100644 libavfilter/x86/vf_maskedclamp.asm
 create mode 100644 libavfilter/x86/vf_maskedclamp_init.c

Comments

Paul B Mahol Oct. 23, 2019, 3:09 p.m. UTC | #1
Will apply.
diff mbox

Patch

diff --git a/libavfilter/maskedclamp.h b/libavfilter/maskedclamp.h
new file mode 100644
index 0000000000..6a1fd9c04b
--- /dev/null
+++ b/libavfilter/maskedclamp.h
@@ -0,0 +1,35 @@ 
+ /*
+ * Copyright (c) 2019 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_MASKEDCLAMP_H
+#define AVFILTER_MASKEDCLAMP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct MaskedClampDSPContext {                                     /* holds the per-row kernel; set in config_input, may be replaced by ff_maskedclamp_init_x86 */
+    void (*maskedclamp)(const uint8_t *bsrc, uint8_t *dst,                 /* row to clamp, output row (uint8_t pointers are used for the 16-bit kernel too) */
+                        const uint8_t *darksrc, const uint8_t *brightsrc,  /* rows supplying the lower / upper bound */
+                        int w, int undershoot, int overshoot);             /* w: row width in samples; undershoot is subtracted from dark, overshoot added to bright */
+} MaskedClampDSPContext;
+
+void ff_maskedclamp_init_x86(MaskedClampDSPContext *dsp, int depth);       /* installs an x86 asm kernel matching depth when the CPU flags allow it; otherwise dsp is left as is */
+
+#endif /* AVFILTER_MASKEDCLAMP_H */
diff --git a/libavfilter/vf_maskedclamp.c b/libavfilter/vf_maskedclamp.c
index 595c8f17fd..97000c7f9d 100644
--- a/libavfilter/vf_maskedclamp.c
+++ b/libavfilter/vf_maskedclamp.c
@@ -26,6 +26,7 @@ 
 #include "internal.h"
 #include "video.h"
 #include "framesync.h"
+#include "maskedclamp.h"
 
 #define OFFSET(x) offsetof(MaskedClampContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
@@ -47,9 +48,7 @@  typedef struct MaskedClampContext {
     int depth;
     FFFrameSync fs;
 
-    void (*maskedclamp)(const uint8_t *bsrc, uint8_t *dst,
-                        const uint8_t *darksrc, const uint8_t *brightsrc,
-                        int w, int undershoot, int overshoot);
+    MaskedClampDSPContext dsp;
 } MaskedClampContext;
 
 static const AVOption maskedclamp_options[] = {
@@ -117,7 +116,7 @@  static int maskedclamp_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_
         }
 
         for (y = slice_start; y < slice_end; y++) {
-            s->maskedclamp(bsrc, dst, darksrc, brightsrc, w, undershoot, overshoot);
+            s->dsp.maskedclamp(bsrc, dst, darksrc, brightsrc, w, undershoot, overshoot);
 
             dst  += dlinesize;
             bsrc += blinesize;
@@ -210,11 +209,16 @@  static int config_input(AVFilterLink *inlink)
     s->width[0]  = s->width[3]  = inlink->w;
 
     s->depth = desc->comp[0].depth;
+    s->undershoot = FFMIN(s->undershoot, (1 << s->depth) - 1);
+    s->overshoot = FFMIN(s->overshoot, (1 << s->depth) - 1);
 
-    if (desc->comp[0].depth == 8)
-        s->maskedclamp = maskedclamp8;
+    if (s->depth <= 8)
+        s->dsp.maskedclamp = maskedclamp8;
     else
-        s->maskedclamp = maskedclamp16;
+        s->dsp.maskedclamp = maskedclamp16;
+
+    if (ARCH_X86)
+        ff_maskedclamp_init_x86(&s->dsp, s->depth);
 
     return 0;
 }
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 8d97e46c3f..016a5b3511 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -17,6 +17,7 @@  OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
 OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o
+OBJS-$(CONFIG_MASKEDCLAMP_FILTER)            += x86/vf_maskedclamp_init.o
 OBJS-$(CONFIG_MASKEDMERGE_FILTER)            += x86/vf_maskedmerge_init.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
 OBJS-$(CONFIG_OVERLAY_FILTER)                += x86/vf_overlay_init.o
@@ -56,6 +57,7 @@  X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
 X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
 X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
 X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o
+X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER)     += x86/vf_maskedclamp.o
 X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o
 X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o
 X86ASM-OBJS-$(CONFIG_PP7_FILTER)             += x86/vf_pp7.o
diff --git a/libavfilter/x86/vf_maskedclamp.asm b/libavfilter/x86/vf_maskedclamp.asm
new file mode 100644
index 0000000000..d586610c92
--- /dev/null
+++ b/libavfilter/x86/vf_maskedclamp.asm
@@ -0,0 +1,95 @@ 
+;*****************************************************************************
+;* x86-optimized functions for maskedclamp filter
+;*
+;* Copyright (c) 2019 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_maskedclamp{8_sse2,16_sse4}(const uint8_t *src, uint8_t *dst,
+;                     const uint8_t *darksrc,
+;                     const uint8_t *brightsrc,
+;                     int w, int undershoot, int overshoot)
+;------------------------------------------------------------------------------
+
+INIT_XMM sse2                                  ; 8-bit kernel: one XMM register holds 16 samples
+cglobal maskedclamp8, 5,5,5, src, dst, dark, bright, w, undershoot, overshoot ; first 5 args loaded into GPRs, 5 XMM registers (m0-m4) used
+    movsxdifnidn wq, wd                        ; widen the int width to pointer size where wq and wd are different registers
+
+    add        srcq, wq                         ; move all four row pointers to the end of the row ...
+    add       darkq, wq
+    add     brightq, wq
+    add        dstq, wq
+    neg          wq                             ; ... so wq is a negative offset that counts up towards 0
+
+    movd         m3, r5m                        ; r5m = undershoot (argument index 5, not loaded into a GPR by cglobal)
+    punpcklbw    m3, m3                         ; duplicate its low byte into both bytes of word 0
+    SPLATW       m3, m3                         ; broadcast word 0: every byte lane of m3 = low byte of undershoot
+
+    movd         m4, r6m                        ; r6m = overshoot (argument index 6)
+    punpcklbw    m4, m4                         ; same byte duplication as for m3
+    SPLATW       m4, m4                         ; every byte lane of m4 = low byte of overshoot
+
+    .loop:
+        movu                  m0, [srcq + wq]   ; unaligned 16-byte loads from the three input rows
+        movu                  m1, [darkq + wq]
+        movu                  m2, [brightq + wq]
+
+        psubusb               m1, m3            ; lower bound = dark - undershoot, unsigned saturation (floors at 0)
+        paddusb               m2, m4            ; upper bound = bright + overshoot, unsigned saturation (caps at 255)
+        CLIPUB                m0, m1, m2        ; x86util macro: clamp src bytes into [lower, upper]
+        mova         [dstq + wq], m0            ; aligned store - NOTE(review): presumably dst rows are 16-byte aligned; confirm against caller
+
+        add                   wq, mmsize        ; advance by one register width (16 bytes)
+        jl .loop                                ; loop while wq < 0; no scalar tail, so the last pass may cover bytes past w - presumably rows are padded; confirm
+    RET
+
+INIT_XMM sse4                                  ; 16-bit kernel: one XMM register holds 8 samples; pmaxuw/pminuw below are SSE4.1 instructions
+cglobal maskedclamp16, 5,5,5, src, dst, dark, bright, w, undershoot, overshoot ; same argument layout as maskedclamp8: 5 args in GPRs, m0-m4 used
+    shl          wd, 1                          ; width in samples -> width in bytes (2 bytes per sample); NOTE(review): no movsxdifnidn here, relies on the 32-bit write zero-extending on x86-64 - confirm
+
+    add        srcq, wq                         ; move all four row pointers to the end of the row ...
+    add       darkq, wq
+    add     brightq, wq
+    add        dstq, wq
+    neg          wq                             ; ... so wq is a negative byte offset that counts up towards 0
+
+    movd         m3, r5m                        ; r5m = undershoot (argument index 5)
+    SPLATW       m3, m3                         ; broadcast its low 16 bits to all 8 word lanes
+
+    movd         m4, r6m                        ; r6m = overshoot (argument index 6)
+    SPLATW       m4, m4                         ; broadcast its low 16 bits to all 8 word lanes
+
+    .loop:
+        movu                  m0, [srcq + wq]   ; unaligned 16-byte loads from the three input rows
+        movu                  m1, [darkq + wq]
+        movu                  m2, [brightq + wq]
+
+        psubusw               m1, m3            ; lower bound = dark - undershoot, unsigned saturation (floors at 0)
+        paddusw               m2, m4            ; upper bound = bright + overshoot, unsigned saturation (caps at 65535)
+        pmaxuw                m0, m1            ; raise src to at least the lower bound
+        pminuw                m0, m2            ; then limit it to at most the upper bound
+        mova         [dstq + wq], m0            ; aligned store - NOTE(review): presumably dst rows are 16-byte aligned; confirm against caller
+
+        add                   wq, mmsize        ; advance by one register width (16 bytes = 8 samples)
+        jl .loop                                ; loop while wq < 0; no scalar tail, so the last pass may cover bytes past the row - presumably rows are padded; confirm
+    RET
diff --git a/libavfilter/x86/vf_maskedclamp_init.c b/libavfilter/x86/vf_maskedclamp_init.c
new file mode 100644
index 0000000000..53153f80ea
--- /dev/null
+++ b/libavfilter/x86/vf_maskedclamp_init.c
@@ -0,0 +1,47 @@ 
+/*
+ * Copyright (c) 2019 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/maskedclamp.h"
+
+void ff_maskedclamp8_sse2(const uint8_t *bsrc, uint8_t *dst,                  /* asm kernel: cglobal maskedclamp8 under INIT_XMM sse2 in x86/vf_maskedclamp.asm */
+                          const uint8_t *darksrc, const uint8_t *brightsrc,
+                          int w, int undershoot, int overshoot);              /* signature matches MaskedClampDSPContext.maskedclamp */
+
+void ff_maskedclamp16_sse4(const uint8_t *bsrc, uint8_t *dst,                 /* asm kernel: cglobal maskedclamp16 under INIT_XMM sse4 in x86/vf_maskedclamp.asm */
+                           const uint8_t *darksrc, const uint8_t *brightsrc,
+                           int w, int undershoot, int overshoot);             /* w is in samples; the asm doubles it to get bytes */
+
+av_cold void ff_maskedclamp_init_x86(MaskedClampDSPContext *dsp, int depth)
+{
+    /* Replace the kernel already in dsp only if the CPU has the extension the asm for this depth needs. */
+    const int cpu_caps = av_get_cpu_flags();
+
+    if (depth <= 8) {
+        if (EXTERNAL_SSE2(cpu_caps))
+            dsp->maskedclamp = ff_maskedclamp8_sse2;
+    } else if (EXTERNAL_SSE4(cpu_caps)) {
+        dsp->maskedclamp = ff_maskedclamp16_sse4;
+    }
+}