diff mbox series

[FFmpeg-devel,2/2] avfilter/x86/vf_gblur: add postscale SIMD

Message ID 20210213111038.14840-2-onemda@gmail.com
State Superseded
Headers show
Series [FFmpeg-devel,1/2] avfilter/vf_gblur: factor out postscale function | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Paul B Mahol Feb. 13, 2021, 11:10 a.m. UTC
Signed-off-by: Paul B Mahol <onemda@gmail.com>
---
 libavfilter/x86/vf_gblur.asm    | 46 +++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_gblur_init.c | 11 ++++++--
 2 files changed, 55 insertions(+), 2 deletions(-)

Comments

James Almer Feb. 14, 2021, 1:36 a.m. UTC | #1
On 2/13/2021 8:10 AM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda@gmail.com>
> ---
>   libavfilter/x86/vf_gblur.asm    | 46 +++++++++++++++++++++++++++++++++
>   libavfilter/x86/vf_gblur_init.c | 11 ++++++--
>   2 files changed, 55 insertions(+), 2 deletions(-)
> 
> diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
> index a25b1659f5..8fea6d2a61 100644
> --- a/libavfilter/x86/vf_gblur.asm
> +++ b/libavfilter/x86/vf_gblur.asm
> @@ -183,3 +183,49 @@ HORIZ_SLICE
>   INIT_XMM avx2
>   HORIZ_SLICE
>   %endif
> +
> +%macro POSTSCALE_SLICE 0
> +%if UNIX64
> +cglobal postscale_slice, 2, 6, 4, ptr, length, postscale, min, max, x

cglobal postscale_slice, 2, 3, 4, ptr, length, x

> +%else
> +cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
> +%endif
> +    shl lengthd, 2
> +%if WIN64
> +    SWAP 0, 2
> +    SWAP 1, 3
> +    SWAP 2, 4
> +%endif
> +    shufps   xm0, xm0, 0
> +    shufps   xm1, xm1, 0
> +    shufps   xm2, xm2, 0
> +%if cpuflag(avx2)
> +    vinsertf128  m0, m0, xm0, 1
> +    vinsertf128  m1, m1, xm1, 1
> +    vinsertf128  m2, m2, xm2, 1

You can use vbroadcastss ymm, xmm with AVX2, which combines both the 
shufps and vinsertf128 into one instruction.

As is, this function is base AVX. So if you can't measure any 
performance gain with vbroadcastss, then just mark the function as AVX.

> +%endif
> +    xor      xq, xq
> +
> +    .loop:
> +    movu          m3, [ptrq + xq]
> +    mulps         m3, m0

AVX can use unaligned memory operands, so just do

mulps m3, m0, [ptrq + xq]

But keep the explicit movu + mulps for the SSE version, otherwise x86inc 
will expand it into a mova.

> +    maxps         m3, m1
> +    minps         m3, m2
> +    movu   [ptrq+xq], m3
> +
> +    add xq, mmsize
> +    cmp xd, lengthd

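Can't you use the neg trick (add length to ptr, negate length, then index with [ptrq + lengthq] and count length up towards zero, so the add sets the flags for jl and no cmp is needed)? It should let you reuse length instead of x.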

> +    jl .loop
> +
> +    RET
> +%endmacro
> +
> +%if ARCH_X86_64

Nothing in this function seems to require x86_64.

> +INIT_XMM sse4

No instruction is SSE4 here. It's all base SSE.

> +POSTSCALE_SLICE
> +
> +%if HAVE_AVX_EXTERNAL

Wrong check. The function is instantiated with INIT_YMM avx2, so this should be HAVE_AVX2_EXTERNAL.

> +INIT_YMM avx2
> +POSTSCALE_SLICE
> +%endif
> +%endif
> diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
> index e63e59fe23..7a9b40b0ad 100644
> --- a/libavfilter/x86/vf_gblur_init.c
> +++ b/libavfilter/x86/vf_gblur_init.c
> @@ -27,14 +27,21 @@
>   void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
>   void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
>   
> +void ff_postscale_slice_sse4(float *ptr, int length, float postscale, float min, float max);
> +void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
> +
>   av_cold void ff_gblur_init_x86(GBlurContext *s)
>   {
>   #if ARCH_X86_64
>       int cpu_flags = av_get_cpu_flags();
>   
> -    if (EXTERNAL_SSE4(cpu_flags))
> +    if (EXTERNAL_SSE4(cpu_flags)) {
>           s->horiz_slice = ff_horiz_slice_sse4;
> -    if (EXTERNAL_AVX2(cpu_flags))
> +        s->postscale_slice = ff_postscale_slice_sse4;
> +    }
> +    if (EXTERNAL_AVX2(cpu_flags)) {
>           s->horiz_slice = ff_horiz_slice_avx2;
> +        s->postscale_slice = ff_postscale_slice_avx2;

Needs to be EXTERNAL_AVX2_FAST. You're using ymm regs, unlike in 
ff_horiz_slice_avx2.

> +    }
>   #endif
>   }
>
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index a25b1659f5..8fea6d2a61 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -183,3 +183,49 @@  HORIZ_SLICE
 INIT_XMM avx2
 HORIZ_SLICE
 %endif
+
+%macro POSTSCALE_SLICE 0 ; ptr[i] = min(max(ptr[i] * postscale, min), max), one mmsize vector per iteration
+%if UNIX64
+cglobal postscale_slice, 2, 6, 4, ptr, length, postscale, min, max, x
+%else
+cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
+%endif
+    shl lengthd, 2 ; length in floats -> length in bytes
+%if WIN64 ; args are slot-positional: postscale in xmm2, min in xmm3, max (5th arg) on the stack
+    SWAP 0, 2
+    SWAP 1, 3
+    movss    xm2, r4m ; was SWAP 2, 4, but xmm4 never carries an argument on Win64
+%endif
+    shufps   xm0, xm0, 0 ; broadcast each scalar across the low 128 bits
+    shufps   xm1, xm1, 0
+    shufps   xm2, xm2, 0
+%if cpuflag(avx2)
+    vinsertf128  m0, m0, xm0, 1 ; duplicate the low lane into the high lane of the ymm register
+    vinsertf128  m1, m1, xm1, 1
+    vinsertf128  m2, m2, xm2, 1
+%endif
+    xor      xq, xq ; x = byte offset into ptr
+
+    .loop:
+    movu          m3, [ptrq + xq]
+    mulps         m3, m0 ; scale by postscale
+    maxps         m3, m1 ; clamp from below with min
+    minps         m3, m2 ; clamp from above with max
+    movu   [ptrq+xq], m3
+
+    add xq, mmsize
+    cmp xd, lengthd
+    jl .loop
+
+    RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM sse4
+POSTSCALE_SLICE
+
+%if HAVE_AVX2_EXTERNAL ; guard must match the avx2 instantiation below (was HAVE_AVX_EXTERNAL)
+INIT_YMM avx2
+POSTSCALE_SLICE
+%endif
+%endif
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
index e63e59fe23..7a9b40b0ad 100644
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -27,14 +27,21 @@ 
 void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
 void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
 
+void ff_postscale_slice_sse4(float *ptr, int length, float postscale, float min, float max);
+void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
+
 av_cold void ff_gblur_init_x86(GBlurContext *s)
 {
 #if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_SSE4(cpu_flags))
+    if (EXTERNAL_SSE4(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_sse4;
+        s->postscale_slice = ff_postscale_slice_sse4;
+    }
     if (EXTERNAL_AVX2(cpu_flags))
         s->horiz_slice = ff_horiz_slice_avx2;
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) /* ymm code, unlike horiz_slice_avx2: honour the slow-AVX flag */
+        s->postscale_slice = ff_postscale_slice_avx2;
 #endif
 }