[FFmpeg-devel,3/3] avfilter/x86/vf_eq: add SSE2 version

Message ID	20190917133938.949-3-ting.fu@intel.com
State	Superseded
Headers	show Return-Path: <ffmpeg-devel-bounces@ffmpeg.org> From: Ting Fu <ting.fu@intel.com> To: ffmpeg-devel@ffmpeg.org Date: Tue, 17 Sep 2019 21:39:38 +0800 Message-Id: <20190917133938.949-3-ting.fu@intel.com> In-Reply-To: <20190917133938.949-1-ting.fu@intel.com> References: <20190917133938.949-1-ting.fu@intel.com> Subject: [FFmpeg-devel] [PATCH 3/3] avfilter/x86/vf_eq: add SSE2 version Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Message ID

20190917133938.949-3-ting.fu@intel.com

State

Superseded

Headers

From: Ting Fu <ting.fu@intel.com>
To: ffmpeg-devel@ffmpeg.org
Date: Tue, 17 Sep 2019 21:39:38 +0800
Message-Id: <20190917133938.949-3-ting.fu@intel.com>
In-Reply-To: <20190917133938.949-1-ting.fu@intel.com>
References: <20190917133938.949-1-ting.fu@intel.com>
Subject: [FFmpeg-devel] [PATCH 3/3] avfilter/x86/vf_eq: add SSE2 version
Precedence: list
Reply-To: FFmpeg development discussions and patches
	<ffmpeg-devel@ffmpeg.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Signed-off-by: Ting Fu <ting.fu@intel.com> --- libavfilter/x86/vf_eq.asm | 19 +++++++++++++++++-- libavfilter/x86/vf_eq_init.c | 20 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-)

Comments

James Almer Sept. 17, 2019, 1:56 p.m. UTC | #1

On 9/17/2019 10:39 AM, Ting Fu wrote:
> Signed-off-by: Ting Fu <ting.fu@intel.com>
> ---
>  libavfilter/x86/vf_eq.asm    | 19 +++++++++++++++++--
>  libavfilter/x86/vf_eq_init.c | 20 ++++++++++++++++++++
>  2 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/libavfilter/x86/vf_eq.asm b/libavfilter/x86/vf_eq.asm
> index bf28691297..d6b51cf6df 100644
> --- a/libavfilter/x86/vf_eq.asm
> +++ b/libavfilter/x86/vf_eq.asm
> @@ -24,14 +24,21 @@
>  
>  SECTION .text
>  
> -INIT_MMX mmx
> +%macro PROCESS_ONE_LINE 1
>  cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
>      movd m3, contrastd
>      movd m4, brightnessd
>      movsx r5d, contrastw
>      movsx r6d, brightnessw
> +%if mmsize == 8
>      pshufw m3, m3, 0
>      pshufw m4, m4, 0

pshufw isn't mmx, but mmxext (AKA, integer SSE). Hardly a problem since
i don't think anyone using a Pentium 2 will use this filter, but still
it's technically wrong.

The function should be changed into mmxext to reflect the above.

> +%elif mmsize == 16
> +    pshuflw m3, m3, 0
> +    movlhps m3, m3
> +    pshuflw m4, m4, 0
> +    movlhps m4, m4
> +%endif

You can use the SPLATW macro instead. It will expand to the correct
instructions based on what set is enabled.

>  
>      DEFINE_ARGS src, dst, tmp, scalar, w
>      xor tmpd, tmpd
> @@ -39,7 +46,7 @@ cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
>      pxor m1, m1
>      mov scalard, wd
>      and scalard, mmsize-1
> -    sar wd, 3
> +    sar wd, %1
>      cmp wd, 1
>      jl .loop1
>  
> @@ -80,3 +87,11 @@ cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
>  
>      .end:
>          RET
> +
> +%endmacro
> +
> +INIT_MMX mmx
> +PROCESS_ONE_LINE 3
> +
> +INIT_XMM sse2
> +PROCESS_ONE_LINE 4
> diff --git a/libavfilter/x86/vf_eq_init.c b/libavfilter/x86/vf_eq_init.c
> index 63c69078fb..cdd5272220 100644
> --- a/libavfilter/x86/vf_eq_init.c
> +++ b/libavfilter/x86/vf_eq_init.c
> @@ -28,6 +28,8 @@
>  
>  extern void ff_process_one_line_mmx(const uint8_t *src, uint8_t *dst, int contvec,
>                                      int brvec, int w);
> +extern void ff_process_one_line_sse2(const uint8_t *src, uint8_t *dst, int contvec,
> +                                    int brvec, int w);
>  
>  static void process_mmx(EQParameters *param, uint8_t *dst, int dst_stride,
>                          const uint8_t *src, int src_stride, int w, int h)
> @@ -44,6 +46,21 @@ static void process_mmx(EQParameters *param, uint8_t *dst, int dst_stride,
>      emms_c();
>  }
>  
> +static void process_sse2(EQParameters *param, uint8_t *dst, int dst_stride,
> +                        const uint8_t *src, int src_stride, int w, int h)
> +{
> +    short contrast = (short) (param->contrast * 256 * 16);
> +    short brightness = ((short) (100.0 * param->brightness + 100.0) * 511)
> +                       / 200 - 128 - contrast / 32;
> +
> +    while (h--) {
> +        ff_process_one_line_sse2(src, dst, contrast, brightness, w);
> +        src += src_stride;
> +        dst += dst_stride;
> +    }
> +    emms_c();

No need for this since the SSE2 version uses xmm regs.

> +}
> +
>  av_cold void ff_eq_init_x86(EQContext *eq)
>  {
>      int cpu_flags = av_get_cpu_flags();
> @@ -51,5 +68,8 @@ av_cold void ff_eq_init_x86(EQContext *eq)
>      if (cpu_flags & AV_CPU_FLAG_MMX) {
>          eq->process = process_mmx;
>      }
> +    if (cpu_flags & AV_CPU_FLAG_SSE2) {
> +        eq->process = process_sse2;
> +    }
>  }
>  
>

diff --git a/libavfilter/x86/vf_eq.asm b/libavfilter/x86/vf_eq.asm
index bf28691297..d6b51cf6df 100644
--- a/libavfilter/x86/vf_eq.asm
+++ b/libavfilter/x86/vf_eq.asm
@@ -24,14 +24,21 @@ 
 
 SECTION .text
 
-INIT_MMX mmx
+%macro PROCESS_ONE_LINE 1
 cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
     movd m3, contrastd
     movd m4, brightnessd
     movsx r5d, contrastw
     movsx r6d, brightnessw
+%if mmsize == 8
     pshufw m3, m3, 0
     pshufw m4, m4, 0
+%elif mmsize == 16
+    pshuflw m3, m3, 0
+    movlhps m3, m3
+    pshuflw m4, m4, 0
+    movlhps m4, m4
+%endif
 
     DEFINE_ARGS src, dst, tmp, scalar, w
     xor tmpd, tmpd
@@ -39,7 +46,7 @@  cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
     pxor m1, m1
     mov scalard, wd
     and scalard, mmsize-1
-    sar wd, 3
+    sar wd, %1
     cmp wd, 1
     jl .loop1
 
@@ -80,3 +87,11 @@  cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
 
     .end:
         RET
+
+%endmacro
+
+INIT_MMX mmx
+PROCESS_ONE_LINE 3
+
+INIT_XMM sse2
+PROCESS_ONE_LINE 4
diff --git a/libavfilter/x86/vf_eq_init.c b/libavfilter/x86/vf_eq_init.c
index 63c69078fb..cdd5272220 100644
--- a/libavfilter/x86/vf_eq_init.c
+++ b/libavfilter/x86/vf_eq_init.c
@@ -28,6 +28,8 @@ 
 
 extern void ff_process_one_line_mmx(const uint8_t *src, uint8_t *dst, int contvec,
                                     int brvec, int w);
+extern void ff_process_one_line_sse2(const uint8_t *src, uint8_t *dst, int contvec,
+                                    int brvec, int w);
 
 static void process_mmx(EQParameters *param, uint8_t *dst, int dst_stride,
                         const uint8_t *src, int src_stride, int w, int h)
@@ -44,6 +46,21 @@  static void process_mmx(EQParameters *param, uint8_t *dst, int dst_stride,
     emms_c();
 }
 
+static void process_sse2(EQParameters *param, uint8_t *dst, int dst_stride,
+                        const uint8_t *src, int src_stride, int w, int h)
+{
+    short contrast = (short) (param->contrast * 256 * 16);
+    short brightness = ((short) (100.0 * param->brightness + 100.0) * 511)
+                       / 200 - 128 - contrast / 32;
+
+    while (h--) {
+        ff_process_one_line_sse2(src, dst, contrast, brightness, w);
+        src += src_stride;
+        dst += dst_stride;
+    }
+    emms_c();
+}
+
 av_cold void ff_eq_init_x86(EQContext *eq)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -51,5 +68,8 @@  av_cold void ff_eq_init_x86(EQContext *eq)
     if (cpu_flags & AV_CPU_FLAG_MMX) {
         eq->process = process_mmx;
     }
+    if (cpu_flags & AV_CPU_FLAG_SSE2) {
+        eq->process = process_sse2;
+    }
 }

[FFmpeg-devel,3/3] avfilter/x86/vf_eq: add SSE2 version

Commit Message

Comments

Patch