diff mbox

[FFmpeg-devel,2/2] avutil/float_dsp: add ff_vector_dmul_{sse2, avx}

Message ID 20180913130825.11236-2-jamrial@gmail.com
State New
Headers show

Commit Message

James Almer Sept. 13, 2018, 1:08 p.m. UTC
~3x to 5x faster.

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavutil/x86/float_dsp.asm    | 33 +++++++++++++++++++++++++++++++++
 libavutil/x86/float_dsp_init.c |  7 +++++++
 2 files changed, 40 insertions(+)

Comments

Henrik Gramner Sept. 14, 2018, 12:57 p.m. UTC | #1
On Thu, Sep 13, 2018 at 3:08 PM, James Almer <jamrial@gmail.com> wrote:
> +    lea       lenq, [lend*8 - mmsize*4]

Is len guaranteed to be a multiple of mmsize/8? Otherwise this would
cause misalignment. It will also break if len < mmsize/2.

Also if you want a 32-bit result from lea it should be written as "lea
lend, [lenq*8 - mmsize*4]" which is equivalent but has a shorter
opcode (e.g. always use native sizes within brackets).
James Almer Sept. 14, 2018, 1:26 p.m. UTC | #2
On 9/14/2018 9:57 AM, Henrik Gramner wrote:
> On Thu, Sep 13, 2018 at 3:08 PM, James Almer <jamrial@gmail.com> wrote:
>> +    lea       lenq, [lend*8 - mmsize*4]
> 
> Is len guaranteed to be a multiple of mmsize/8? Otherwise this would
> cause misalignment. It will also break if len < mmsize/2.

len must be a multiple of 16 as per the doxy, so yes.
The only way for len to be < mmsize/2 is if we add an avx512 version.

> 
> Also if you want a 32-bit result from lea it should be written as "lea
> lend, [lenq*8 - mmsize*4]" which is equivalent but has a shorter
> opcode (e.g. always use native sizes within brackets).

len is an int, so I assume this is only possible here because it's an
argument passed in a reg and not stack? Otherwise, the upper 32bits
would probably make a mess with the multiplication. See for example
vector_fmul_add where len is the fifth argument.
Henrik Gramner Sept. 14, 2018, 2:51 p.m. UTC | #3
On Fri, Sep 14, 2018 at 3:26 PM, James Almer <jamrial@gmail.com> wrote:
> On 9/14/2018 9:57 AM, Henrik Gramner wrote:
>> Also if you want a 32-bit result from lea it should be written as "lea
>> lend, [lenq*8 - mmsize*4]" which is equivalent but has a shorter
>> opcode (e.g. always use native sizes within brackets).
>
> len is an int, so I assume this is only possible here because it's an
> argument passed in a reg and not stack? Otherwise, the upper 32bits
> would probably make a mess with the multiplication.

As long as the destination register is 32-bit the upper half of the
input is irrelevant. Always use native sizes for registers within
brackets when using LEA, and select the size of the destination
register to pick 32-bit vs 64-bit.

I can't really think of any scenario where using a 32-bit register
address operand with a 64-bit destination for LEA is not a mistake.
The fact that doing so is even valid is probably just an artifact of
16-bit memory operands having some usefulness on x86-32 and that
behavior was just straight up copied over to AMD64, either because
nobody thought about it or that doing so made some implementation
detail easier.
Henrik Gramner Sept. 14, 2018, 2:57 p.m. UTC | #4
On Fri, Sep 14, 2018 at 4:51 PM, Henrik Gramner <henrik@gramner.com> wrote:
> I can't really think of any scenario where using a 32-bit register
> address operand with a 64-bit destination for LEA is not a mistake.

To clarify on this, using a 32-bit memory operand means the calculated
effective address will be 32-bit, not 64-bit, so if the result exceeds
0xFFFFFFFF it will be truncated and zero-extended (not even
sign-extended like most other x86 things!). It's essentially a big
trap because it really doesn't do what you'd expect it to do.
James Almer Sept. 14, 2018, 3:59 p.m. UTC | #5
On 9/14/2018 11:57 AM, Henrik Gramner wrote:
> On Fri, Sep 14, 2018 at 4:51 PM, Henrik Gramner <henrik@gramner.com> wrote:
>> I can't really think of any scenario where using a 32-bit register
>> address operand with a 64-bit destination for LEA is not a mistake.
> 
> To clarify on this, using a 32-bit memory operand means the calculated
> effective address will be 32-bit, not 64-bit, so if the result exceeds
> 0xFFFFFFFF it will be truncated and zero-extended (not even
> sign-extended like most other x86 things!). It's essentially a big
> trap because it really doesn't do what you'd expect it to do.

Alright, I'll change it and push. Thanks.
diff mbox

Patch

diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 06d2d2cfd1..d77d8e9e9c 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -58,6 +58,39 @@  INIT_YMM avx
 VECTOR_FMUL
 %endif
 
+;-----------------------------------------------------------------------------
+; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
+;-----------------------------------------------------------------------------
+%macro VECTOR_DMUL 0
+cglobal vector_dmul, 4,4,4, dst, src0, src1, len
+    lea       lenq, [lend*8 - mmsize*4]
+ALIGN 16
+.loop:
+    movaps    m0,     [src0q + lenq + 0*mmsize]
+    movaps    m1,     [src0q + lenq + 1*mmsize]
+    movaps    m2,     [src0q + lenq + 2*mmsize]
+    movaps    m3,     [src0q + lenq + 3*mmsize]
+    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
+    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
+    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
+    mulpd     m3, m3, [src1q + lenq + 3*mmsize]
+    movaps    [dstq + lenq + 0*mmsize], m0
+    movaps    [dstq + lenq + 1*mmsize], m1
+    movaps    [dstq + lenq + 2*mmsize], m2
+    movaps    [dstq + lenq + 3*mmsize], m3
+
+    sub       lenq, mmsize*4
+    jge       .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+VECTOR_DMUL
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+VECTOR_DMUL
+%endif
+
 ;------------------------------------------------------------------------------
 ; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
 ;------------------------------------------------------------------------------
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 122087a196..8826e4e2c9 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -29,6 +29,11 @@  void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
 void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
                         int len);
 
+void ff_vector_dmul_sse2(double *dst, const double *src0, const double *src1,
+                         int len);
+void ff_vector_dmul_avx(double *dst, const double *src0, const double *src1,
+                        int len);
+
 void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
                                int len);
 void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
@@ -92,11 +97,13 @@  av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
         fdsp->butterflies_float   = ff_butterflies_float_sse;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        fdsp->vector_dmul = ff_vector_dmul_sse2;
         fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
     }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         fdsp->vector_fmul = ff_vector_fmul_avx;
+        fdsp->vector_dmul = ff_vector_dmul_avx;
         fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
         fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;