diff mbox series

[FFmpeg-devel,2/2] avfilter/x86/vf_gblur: add postscale SIMD

Message ID 20210214143205.9320-2-onemda@gmail.com
State Accepted
Headers show
Series [FFmpeg-devel,1/2] avfilter/vf_gblur: factor out postscale function | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Paul B Mahol Feb. 14, 2021, 2:32 p.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/x86/vf_gblur.asm    | 49 +++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_gblur_init.c | 17 ++++++++++--
 2 files changed, 63 insertions(+), 3 deletions(-)

Comments

James Almer Feb. 15, 2021, 5:44 p.m. UTC | #1
On 2/14/2021 11:32 AM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>   libavfilter/x86/vf_gblur.asm    | 49 +++++++++++++++++++++++++++++++++
>   libavfilter/x86/vf_gblur_init.c | 17 ++++++++++--
>   2 files changed, 63 insertions(+), 3 deletions(-)
> 
> diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
> index a25b1659f5..8ccfbdc56b 100644
> --- a/libavfilter/x86/vf_gblur.asm
> +++ b/libavfilter/x86/vf_gblur.asm
> @@ -183,3 +183,52 @@ HORIZ_SLICE
>   INIT_XMM avx2
>   HORIZ_SLICE
>   %endif
> +
> +%macro POSTSCALE_SLICE 0
> +%if UNIX64
> +cglobal postscale_slice, 2, 3, 4, ptr, length, x

2, 2, 4, ptr, length

> +%else
> +cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x

5, 5, 4, ptr, length, postscale, min, max

> +%endif
> +    shl lengthd, 2

shl lengthd, 2
add ptrq, lengthq
neg lengthq

> +%if WIN64
> +    SWAP 0, 2
> +    SWAP 1, 3
> +    SWAP 2, 4
> +%endif
> +%if cpuflag(avx2)
> +    vbroadcastss  m0, xm0
> +    vbroadcastss  m1, xm1
> +    vbroadcastss  m2, xm2
> +%else
> +    shufps   xm0, xm0, 0
> +    shufps   xm1, xm1, 0
> +    shufps   xm2, xm2, 0
> +%endif
> +    xor      xq, xq

remove this instruction.

> +
> +    .loop:
> +%if cpuflag(avx2)
> +    mulps         m3, m0, [ptrq + xq]

Replace xq with lengthq here and everywhere else.

> +%else
> +    movu          m3, [ptrq + xq]
> +    mulps         m3, m0
> +%endif
> +    maxps         m3, m1
> +    minps         m3, m2
> +    movu   [ptrq+xq], m3
> +
> +    add xq, mmsize
> +    cmp xd, lengthd

remove this cmp.

> +    jl .loop
> +
> +    RET
> +%endmacro
> +
> +INIT_XMM sse
> +POSTSCALE_SLICE
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +POSTSCALE_SLICE
> +%endif
> diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
> index e63e59fe23..9223cb797d 100644
> --- a/libavfilter/x86/vf_gblur_init.c
> +++ b/libavfilter/x86/vf_gblur_init.c
> @@ -27,14 +27,25 @@
>   void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
>   void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
>   
> +void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
> +void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
> +
>   av_cold void ff_gblur_init_x86(GBlurContext *s)
>   {
> -#if ARCH_X86_64
>       int cpu_flags = av_get_cpu_flags();
>   
> -    if (EXTERNAL_SSE4(cpu_flags))
> +    if (EXTERNAL_SSE(cpu_flags)) {
> +        s->postscale_slice = ff_postscale_slice_sse;
> +    }
> +    if (EXTERNAL_AVX2(cpu_flags)) {

EXTERNAL_AVX2_FAST

> +        s->postscale_slice = ff_postscale_slice_avx2;
> +    }
> +#if ARCH_X86_64
> +    if (EXTERNAL_SSE4(cpu_flags)) {
>           s->horiz_slice = ff_horiz_slice_sse4;
> -    if (EXTERNAL_AVX2(cpu_flags))
> +    }
> +    if (EXTERNAL_AVX2(cpu_flags)) {
>           s->horiz_slice = ff_horiz_slice_avx2;
> +    }
>   #endif
>   }

LGTM with the above.
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index a25b1659f5..8ccfbdc56b 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -183,3 +183,73 @@  HORIZ_SLICE
 INIT_XMM avx2
 HORIZ_SLICE
 %endif
+
+;------------------------------------------------------------------------------
+; void ff_postscale_slice(float *ptr, int length,
+;                         float postscale, float min, float max)
+;
+; In place: ptr[i] = min(max(ptr[i] * postscale, min), max).
+; Whole vectors of mmsize bytes are processed and the loop body runs at least
+; once, so the buffer must be accessible up to the next multiple of mmsize.
+;------------------------------------------------------------------------------
+%macro POSTSCALE_SLICE 0
+cglobal postscale_slice, 2, 2, 4, ptr, length, postscale, min, max
+    shl      lengthd, 2                 ; float count -> byte count
+    add         ptrq, lengthq           ; ptr = end of the slice
+    neg      lengthq                    ; index counts up from -bytes to 0
+
+    ; Broadcast the scalars: m0 = postscale, m1 = min, m2 = max.
+%if WIN64
+    SWAP 0, 2                           ; postscale is passed in xmm2
+    SWAP 1, 3                           ; min is passed in xmm3
+%endif
+%if cpuflag(avx2)
+  %if UNIX64 || WIN64
+    vbroadcastss  m0, xm0
+    vbroadcastss  m1, xm1
+  %else                                 ; x86-32: arguments live on the stack
+    vbroadcastss  m0, postscalem
+    vbroadcastss  m1, minm
+  %endif
+  %if UNIX64
+    vbroadcastss  m2, xm2
+  %else                                 ; WIN64 and x86-32: max is on the stack
+    vbroadcastss  m2, maxm
+  %endif
+%else
+  %if UNIX64 == 0 && WIN64 == 0         ; x86-32: arguments live on the stack
+    movss    xm0, postscalem
+    movss    xm1, minm
+  %endif
+  %if UNIX64 == 0                       ; WIN64 and x86-32: max is on the stack
+    movss    xm2, maxm
+  %endif
+    shufps   xm0, xm0, 0
+    shufps   xm1, xm1, 0
+    shufps   xm2, xm2, 0
+%endif
+
+    .loop:
+%if cpuflag(avx2)
+    mulps         m3, m0, [ptrq + lengthq]
+%else
+    movu          m3, [ptrq + lengthq]
+    mulps         m3, m0
+%endif
+    maxps         m3, m1
+    minps         m3, m2
+    movu   [ptrq + lengthq], m3
+
+    add      lengthq, mmsize            ; sets SF while the index is still < 0
+    jl .loop
+
+    RET
+%endmacro
+
+INIT_XMM sse
+POSTSCALE_SLICE
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+POSTSCALE_SLICE
+%endif
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
index e63e59fe23..9223cb797d 100644
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -27,14 +27,27 @@ 
 void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
 void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
 
+void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
+void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
+
 av_cold void ff_gblur_init_x86(GBlurContext *s)
 {
-#if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_SSE4(cpu_flags))
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->postscale_slice = ff_postscale_slice_sse;
+    }
+    /* The AVX2 version operates on ymm registers, so skip CPUs flagged as
+     * slow with 256-bit AVX. */
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        s->postscale_slice = ff_postscale_slice_avx2;
+    }
+#if ARCH_X86_64
+    if (EXTERNAL_SSE4(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_sse4;
-    if (EXTERNAL_AVX2(cpu_flags))
+    }
+    if (EXTERNAL_AVX2(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_avx2;
+    }
 #endif
 }