diff mbox series

[FFmpeg-devel] x86/float_dsp: add SSE2 and AVX versions of scalarproduct_double

Message ID 20240531194708.6146-1-jamrial@gmail.com
State New
Headers show
Series [FFmpeg-devel] x86/float_dsp: add SSE2 and AVX versions of scalarproduct_double | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

James Almer May 31, 2024, 7:47 p.m. UTC
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavutil/x86/float_dsp.asm    | 52 ++++++++++++++++++++++++++++++++++
 libavutil/x86/float_dsp_init.c |  5 ++++
 2 files changed, 57 insertions(+)

Comments

James Almer June 3, 2024, 2:39 a.m. UTC | #1
On 5/31/2024 4:47 PM, James Almer wrote:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
>   libavutil/x86/float_dsp.asm    | 52 ++++++++++++++++++++++++++++++++++
>   libavutil/x86/float_dsp_init.c |  5 ++++
>   2 files changed, 57 insertions(+)
> 
> diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
> index e84ba52566..e9816cdf02 100644
> --- a/libavutil/x86/float_dsp.asm
> +++ b/libavutil/x86/float_dsp.asm
> @@ -567,6 +567,58 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
>   %endif
>       RET
>   
> +;---------------------------------------------------------------------------------
> +; double scalarproduct_double(const double *v1, const double *v2, size_t len)
> +;---------------------------------------------------------------------------------
> +%macro SCALARPRODUCT_DOUBLE 0
> +cglobal scalarproduct_double, 3,3,8, v1, v2, offset
> +    shl offsetq, 3
> +    add     v1q, offsetq
> +    add     v2q, offsetq
> +    neg offsetq
> +    xorpd    m0, m0
> +    xorpd    m1, m1
> +    xorpd    m2, m2
> +    xorpd    m3, m3
> +align 16
> +.loop:
> +    movapd   m4, [v1q+offsetq+mmsize*0]
> +    movapd   m5, [v1q+offsetq+mmsize*1]
> +    movapd   m6, [v1q+offsetq+mmsize*2]
> +    movapd   m7, [v1q+offsetq+mmsize*3]
> +    mulpd    m4, [v2q+offsetq+mmsize*0]
> +    mulpd    m5, [v2q+offsetq+mmsize*1]
> +    mulpd    m6, [v2q+offsetq+mmsize*2]
> +    mulpd    m7, [v2q+offsetq+mmsize*3]
> +    addpd    m0, m4
> +    addpd    m1, m5
> +    addpd    m2, m6
> +    addpd    m3, m7
> +    add offsetq, mmsize*4
> +    jl .loop
> +    addpd    m0, m1
> +    addpd    m2, m3
> +    addpd    m0, m2
> +%if mmsize == 32
> +    vextractf128 xm1, m0, 1
> +    addpd   xm0, xm1
> +%endif
> +    movhlps xm1, xm0
> +    addpd   xm0, xm1
> +%if ARCH_X86_64 == 0
> +    movsd   r0m, xm0
> +    fld qword r0m
> +%endif
> +    RET
> +%endmacro
> +
> +INIT_XMM sse2
> +SCALARPRODUCT_DOUBLE
> +%if HAVE_AVX_EXTERNAL
> +INIT_YMM avx
> +SCALARPRODUCT_DOUBLE
> +%endif
> +
>   ;-----------------------------------------------------------------------------
>   ; void ff_butterflies_float(float *src0, float *src1, int len);
>   ;-----------------------------------------------------------------------------
> diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
> index 093bce9b94..6cf0b4a277 100644
> --- a/libavutil/x86/float_dsp_init.c
> +++ b/libavutil/x86/float_dsp_init.c
> @@ -73,6 +73,9 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
>   float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
>   float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
>   
> +double ff_scalarproduct_double_sse2(const double *v1, const double *v2, size_t order);
> +double ff_scalarproduct_double_avx(const double *v1, const double *v2, size_t order);
> +
>   void ff_butterflies_float_sse(float *restrict src0, float *restrict src1, int len);
>   
>   av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
> @@ -93,6 +96,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
>           fdsp->vector_dmul = ff_vector_dmul_sse2;
>           fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
>           fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
> +        fdsp->scalarproduct_double = ff_scalarproduct_double_sse2;
>       }
>       if (EXTERNAL_AVX_FAST(cpu_flags)) {
>           fdsp->vector_fmul = ff_vector_fmul_avx;
> @@ -102,6 +106,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
>           fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
>           fdsp->vector_fmul_add    = ff_vector_fmul_add_avx;
>           fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
> +        fdsp->scalarproduct_double = ff_scalarproduct_double_avx;
>       }
>       if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>           fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;

Will apply.
diff mbox series

Patch

diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index e84ba52566..e9816cdf02 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -567,6 +567,58 @@  cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
 %endif
     RET
 
+;---------------------------------------------------------------------------------
+; double scalarproduct_double(const double *v1, const double *v2, size_t len)
+;---------------------------------------------------------------------------------
+%macro SCALARPRODUCT_DOUBLE 0 ; emits one scalarproduct_double function for the currently selected vector width (mmsize)
+cglobal scalarproduct_double, 3,3,8, v1, v2, offset ; 3 args, 3 GPRs, 8 vector regs (x86inc); the third argument (len) arrives in "offset"
+    shl offsetq, 3 ; element count -> byte count (8 bytes per double)
+    add     v1q, offsetq ; v1 now points one past the end of the first vector
+    add     v2q, offsetq ; v2 now points one past the end of the second vector
+    neg offsetq ; negative byte offset from the end; the loop counts it up towards zero
+    xorpd    m0, m0 ; m0-m3: four independent partial-sum accumulators, zeroed
+    xorpd    m1, m1
+    xorpd    m2, m2
+    xorpd    m3, m3
+align 16 ; align the loop entry point
+.loop: ; each iteration consumes 4*mmsize bytes of each input; runs at least once and has no tail handling
+    movapd   m4, [v1q+offsetq+mmsize*0] ; aligned loads: v1 data must be mmsize-aligned
+    movapd   m5, [v1q+offsetq+mmsize*1]
+    movapd   m6, [v1q+offsetq+mmsize*2]
+    movapd   m7, [v1q+offsetq+mmsize*3]
+    mulpd    m4, [v2q+offsetq+mmsize*0] ; element-wise v1*v2; v2 is a memory operand (must be 16-byte aligned in the non-VEX SSE2 encoding)
+    mulpd    m5, [v2q+offsetq+mmsize*1]
+    mulpd    m6, [v2q+offsetq+mmsize*2]
+    mulpd    m7, [v2q+offsetq+mmsize*3]
+    addpd    m0, m4 ; accumulate each product vector into its own accumulator
+    addpd    m1, m5
+    addpd    m2, m6
+    addpd    m3, m7
+    add offsetq, mmsize*4 ; advance by the four vectors just processed; sets the flags tested by jl
+    jl .loop ; keep going while the offset is still negative (end of input not reached)
+    addpd    m0, m1 ; fold the four accumulators into m0
+    addpd    m2, m3
+    addpd    m0, m2
+%if mmsize == 32 ; 256-bit build only: fold the upper 128-bit lane into the lower one
+    vextractf128 xm1, m0, 1 ; xm1 = upper 128 bits of m0
+    addpd   xm0, xm1
+%endif
+    movhlps xm1, xm0 ; low double of xm1 = high double of xm0
+    addpd   xm0, xm1 ; low double of xm0 = final sum (the high double is not meaningful)
+%if ARCH_X86_64 == 0 ; x86-32 only: hand the result back on the x87 stack instead of in xmm0
+    movsd   r0m, xm0 ; spill the low double to r0m (argument 0's memory slot in x86inc), reused here as scratch
+    fld qword r0m ; push it onto the x87 stack (st0), where 32-bit C callers expect a returned double
+%endif
+    RET ; x86inc epilogue macro, not the bare ret instruction
+%endmacro
+
+INIT_XMM sse2 ; select 128-bit XMM registers and the "sse2" function-name suffix
+SCALARPRODUCT_DOUBLE ; instantiates the SSE2 version (declared in C as ff_scalarproduct_double_sse2)
+%if HAVE_AVX_EXTERNAL ; build the AVX version only when the assembler supports AVX
+INIT_YMM avx ; select 256-bit YMM registers and the "avx" function-name suffix
+SCALARPRODUCT_DOUBLE ; instantiates the AVX version (declared in C as ff_scalarproduct_double_avx)
+%endif
+
 ;-----------------------------------------------------------------------------
 ; void ff_butterflies_float(float *src0, float *src1, int len);
 ;-----------------------------------------------------------------------------
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 093bce9b94..6cf0b4a277 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -73,6 +73,9 @@  void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
 
+double ff_scalarproduct_double_sse2(const double *v1, const double *v2, size_t order);
+double ff_scalarproduct_double_avx(const double *v1, const double *v2, size_t order);
+
 void ff_butterflies_float_sse(float *restrict src0, float *restrict src1, int len);
 
 av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
@@ -93,6 +96,7 @@  av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->vector_dmul = ff_vector_dmul_sse2;
         fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
+        fdsp->scalarproduct_double = ff_scalarproduct_double_sse2;
     }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         fdsp->vector_fmul = ff_vector_fmul_avx;
@@ -102,6 +106,7 @@  av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
         fdsp->vector_fmul_add    = ff_vector_fmul_add_avx;
         fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
+        fdsp->scalarproduct_double = ff_scalarproduct_double_avx;
     }
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;