diff mbox

[FFmpeg-devel,4/5] x86/opusdsp: implement FMA3 accelerated postfilter and deemphasis

Message ID La6LTzI--3-1@lynne.ee
State Superseded
Headers show

Commit Message

Lynne March 16, 2019, 4:33 p.m. UTC
58893 decicycles in deemphasis_c,  130548 runs,    524 skips
9475 decicycles in deemphasis_fma3,  130686 runs,    386 skips -> 6.21x speedup

24866 decicycles in postfilter_c,   65386 runs,    150 skips
5268 decicycles in postfilter_fma3,   65505 runs,     31 skips -> 4.72x speedup

Total decoder speedup: ~14%

Deemphasis SIMD based on the following unrolling:
const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
float state = coeff;

for (int i = 0; i < len; i += 4) {
    y[0] = x[0] + c1*state;
    y[1] = x[1] + c2*state + c1*x[0];
    y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
    y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];

    state = y[3];
    y += 4;
    x += 4;
}

Comments

Lynne March 29, 2019, 6:05 p.m. UTC | #1
Mar 16, 2019, 4:33 PM by dev@lynne.ee <mailto:dev@lynne.ee>:

> 58893 decicycles in deemphasis_c,  130548 runs,    524 skips
> 9475 decicycles in deemphasis_fma3,  130686 runs,    386 skips -> 6.21x speedup
>
> 24866 decicycles in postfilter_c,   65386 runs,    150 skips
> 5268 decicycles in postfilter_fma3,   65505 runs,     31 skips -> 4.72x speedup
>
> Total decoder speedup: ~14%
>
> Deemphasis SIMD based on the following unrolling:
> const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
> float state = coeff;
>
> for (int i = 0; i < len; i += 4) {
>     y[0] = x[0] + c1*state;
>     y[1] = x[1] + c2*state + c1*x[0];
>     y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
>     y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];
>
>     state = y[3];
>     y += 4;
>     x += 4;
> }
>

Ping to this part of the patchset while I fix android?
Carl Eugen Hoyos March 29, 2019, 10:35 p.m. UTC | #2
2019-03-16 17:33 GMT+01:00, Lynne <dev@lynne.ee>:
> 58893 decicycles in deemphasis_c,  130548 runs,    524 skips
> 9475 decicycles in deemphasis_fma3,  130686 runs,    386 skips -> 6.21x
> speedup
>
> 24866 decicycles in postfilter_c,   65386 runs,    150 skips
> 5268 decicycles in postfilter_fma3,   65505 runs,     31 skips -> 4.72x
> speedup
>
> Total decoder speedup: ~14%

I can reproduce approximately half of this speedup and
will push over the weekend if there are no objections.

Carl Eugen
Carl Eugen Hoyos March 31, 2019, 10:26 p.m. UTC | #3
2019-03-29 19:05 GMT+01:00, Lynne <dev@lynne.ee>:
> Mar 16, 2019, 4:33 PM by dev@lynne.ee <mailto:dev@lynne.ee>:
>
>> 58893 decicycles in deemphasis_c,  130548 runs,    524 skips
>> 9475 decicycles in deemphasis_fma3,  130686 runs,    386 skips -> 6.21x
>> speedup
>>
>> 24866 decicycles in postfilter_c,   65386 runs,    150 skips
>> 5268 decicycles in postfilter_fma3,   65505 runs,     31 skips -> 4.72x
>> speedup
>>
>> Total decoder speedup: ~14%
>>
>> Deemphasis SIMD based on the following unrolling:
>> const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
>> float state = coeff;
>>
>> for (int i = 0; i < len; i += 4) {
>>     y[0] = x[0] + c1*state;
>>     y[1] = x[1] + c2*state + c1*x[0];
>>     y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
>>     y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];
>>
>>     state = y[3];
>>     y += 4;
>>     x += 4;
>> }
>>
>
> Ping to this part of the patchset while I fix android?

Done.

Please run fate on future patches (it did not pass).

Carl Eugen
James Almer March 31, 2019, 10:35 p.m. UTC | #4
On 3/31/2019 7:26 PM, Carl Eugen Hoyos wrote:
> 2019-03-29 19:05 GMT+01:00, Lynne <dev@lynne.ee>:
>> Mar 16, 2019, 4:33 PM by dev@lynne.ee <mailto:dev@lynne.ee>:
>>
>>> 58893 decicycles in deemphasis_c,  130548 runs,    524 skips
>>> 9475 decicycles in deemphasis_fma3,  130686 runs,    386 skips -> 6.21x
>>> speedup
>>>
>>> 24866 decicycles in postfilter_c,   65386 runs,    150 skips
>>> 5268 decicycles in postfilter_fma3,   65505 runs,     31 skips -> 4.72x
>>> speedup
>>>
>>> Total decoder speedup: ~14%
>>>
>>> Deemphasis SIMD based on the following unrolling:
>>> const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
>>> float state = coeff;
>>>
>>> for (int i = 0; i < len; i += 4) {
>>>     y[0] = x[0] + c1*state;
>>>     y[1] = x[1] + c2*state + c1*x[0];
>>>     y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
>>>     y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];
>>>
>>>     state = y[3];
>>>     y += 4;
>>>     x += 4;
>>> }
>>>
>>
>> Ping to this part of the patchset while I fix android?
> 
> Done.
> 
> Please run fate on future patches (it did not pass).
> 
> Carl Eugen

This fails on mingw-w64:

TEST    opus-testvector01
stddev:  183.65 PSNR: 51.05 MAXDIFF:14569 bytes:  5660160/  5660160
stddev: |183.65 - 0| >= 3
James Almer March 31, 2019, 10:49 p.m. UTC | #5
On 3/16/2019 1:33 PM, Lynne wrote:
> diff --git a/libavcodec/x86/opusdsp.asm b/libavcodec/x86/opusdsp.asm
> new file mode 100644
> index 0000000000..ed65614e06
> --- /dev/null
> +++ b/libavcodec/x86/opusdsp.asm
> @@ -0,0 +1,114 @@
> +;******************************************************************************
> +;* Opus SIMD functions
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +         ; 0.85..^1    0.85..^2    0.85..^3    0.85..^4
> +tab_st: dd 0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f
> +tab_x0: dd 0x0,        0x3f599a00, 0x3f599a00, 0x3f599a00
> +tab_x1: dd 0x0,        0x0,        0x3f38f671, 0x3f38f671
> +tab_x2: dd 0x0,        0x0,        0x0,        0x3f1d382a
> +
> +SECTION .text
> +
> +INIT_XMM fma3
> +%if UNIX64
> +cglobal opus_deemphasis, 3, 3, 8, out, in, len
> +%else
> +cglobal opus_deemphasis, 4, 4, 8, out, in, coeff, len
> +%endif
> +%if ARCH_X86_32
> +    VBROADCASTSS m0, coeffm
> +%else
> +%if WIN64
> +    SWAP 0, 2
> +%endif
> +    shufps m0, m0, 0
> +%endif
> +
> +    movaps m4, [tab_st]
> +    movaps m5, [tab_x0]
> +    movaps m6, [tab_x1]
> +    movaps m7, [tab_x2]
> +
> +.loop:
> +    movaps  m1, [inq]                ; x0, x1, x2, x3
> +
> +    pslldq  m2, m1, 4                ;  0, x0, x1, x2
> +    pslldq  m3, m1, 8                ;  0,  0, x0, x1
> +
> +    fmaddps m2, m2, m5, m1           ; x + c1*x[0-2]
> +    pslldq  m1, 12                   ;  0,  0,  0, x0
> +
> +    fmaddps m2, m3, m6, m2           ; x + c1*x[0-2] + c2*x[0-1]
> +    fmaddps m1, m1, m7, m2           ; x + c1*x[0-2] + c2*x[0-1] + c3*x[0]
> +    fmaddps m0, m0, m4, m1           ; x + c1*x[0-2] + c2*x[0-1] + c3*x[0] + c*s
> +
> +    movaps [outq], m0
> +    shufps m0, m0, q3333             ; new state
> +
> +    add inq,  mmsize
> +    add outq, mmsize
> +    sub lenq, mmsize >> 2
> +    jg .loop
> +
> +%if ARCH_X86_64 == 0
> +    movss r0m, m0
> +    fld dword r0m
> +%endif
> +    RET

A float return value needs to be in xmm0, but on Win64 you swapped m0 with m2,
so the final state is left in xmm2 instead. This is the source of the FATE failure.
diff mbox

Patch

From b4be0e7019f16ec567f39da50d1ea35ce5ddf45a Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Fri, 15 Mar 2019 14:43:04 +0000
Subject: [PATCH 4/5] x86/opusdsp: implement FMA3 accelerated postfilter and
 deemphasis

58893 decicycles in deemphasis_c,  130548 runs,    524 skips
9475 decicycles in deemphasis_fma3,  130686 runs,    386 skips -> 6.21x speedup

24866 decicycles in postfilter_c,   65386 runs,    150 skips
5268 decicycles in postfilter_fma3,   65505 runs,     31 skips -> 4.72x speedup

Total decoder speedup: ~14%

Deemphasis SIMD based on the following unrolling:
const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
float state = coeff;

for (int i = 0; i < len; i += 4) {
    y[0] = x[0] + c1*state;
    y[1] = x[1] + c2*state + c1*x[0];
    y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
    y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];

    state = y[3];
    y += 4;
    x += 4;
}
---
 libavcodec/opusdsp.c          |   3 +
 libavcodec/opusdsp.h          |   2 +
 libavcodec/x86/Makefile       |   2 +
 libavcodec/x86/opusdsp.asm    | 114 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/opusdsp_init.c |  35 +++++++++++
 5 files changed, 156 insertions(+)
 create mode 100644 libavcodec/x86/opusdsp.asm
 create mode 100644 libavcodec/x86/opusdsp_init.c

diff --git a/libavcodec/opusdsp.c b/libavcodec/opusdsp.c
index 615e7d6816..17e819f977 100644
--- a/libavcodec/opusdsp.c
+++ b/libavcodec/opusdsp.c
@@ -58,4 +58,7 @@  av_cold void ff_opus_dsp_init(OpusDSP *ctx)
 {
     ctx->postfilter = postfilter_c;
     ctx->deemphasis = deemphasis_c;
+
+    if (ARCH_X86)
+        ff_opus_dsp_init_x86(ctx);
 }
diff --git a/libavcodec/opusdsp.h b/libavcodec/opusdsp.h
index 74adfe6859..e8b8cf40a9 100644
--- a/libavcodec/opusdsp.h
+++ b/libavcodec/opusdsp.h
@@ -30,4 +30,6 @@  typedef struct OpusDSP {
 
 void ff_opus_dsp_init(OpusDSP *ctx);
 
+void ff_opus_dsp_init_x86(OpusDSP *ctx);
+
 #endif /* AVCODEC_OPUS_DSP_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 2697431781..f63f7cfed3 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -53,6 +53,7 @@  OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
+OBJS-$(CONFIG_OPUS_DECODER)            += x86/opusdsp_init.o
 OBJS-$(CONFIG_OPUS_ENCODER)            += x86/celt_pvq_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
 OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
@@ -126,6 +127,7 @@  X86ASM-OBJS-$(CONFIG_MDCT15)           += x86/mdct15.o
 X86ASM-OBJS-$(CONFIG_ME_CMP)           += x86/me_cmp.o
 X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP)     += x86/imdct36.o
 X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC)     += x86/mpegvideoencdsp.o
+X86ASM-OBJS-$(CONFIG_OPUS_DECODER)     += x86/opusdsp.o
 X86ASM-OBJS-$(CONFIG_OPUS_ENCODER)     += x86/celt_pvq_search.o
 X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP)      += x86/pixblockdsp.o
 X86ASM-OBJS-$(CONFIG_QPELDSP)          += x86/qpeldsp.o                 \
diff --git a/libavcodec/x86/opusdsp.asm b/libavcodec/x86/opusdsp.asm
new file mode 100644
index 0000000000..ed65614e06
--- /dev/null
+++ b/libavcodec/x86/opusdsp.asm
@@ -0,0 +1,114 @@ 
+;******************************************************************************
+;* Opus SIMD functions
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+         ; 0.85..^1    0.85..^2    0.85..^3    0.85..^4
+tab_st: dd 0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f ; c1..c4: weights multiplied with the broadcast state
+tab_x0: dd 0x0,        0x3f599a00, 0x3f599a00, 0x3f599a00 ; c1 weights multiplied with (0, x0, x1, x2)
+tab_x1: dd 0x0,        0x0,        0x3f38f671, 0x3f38f671 ; c2 weights multiplied with (0, 0, x0, x1)
+tab_x2: dd 0x0,        0x0,        0x0,        0x3f1d382a ; c3 weight multiplied with (0, 0, 0, x0)
+
+SECTION .text
+
+; float opus_deemphasis(float *out, float *in, float coeff, int len): returns the last output sample
+INIT_XMM fma3
+%if UNIX64
+cglobal opus_deemphasis, 3, 3, 8, out, in, len
+%else
+cglobal opus_deemphasis, 4, 4, 8, out, in, coeff, len
+%endif
+%if ARCH_X86_32
+    VBROADCASTSS m0, coeffm
+%elif WIN64
+    shufps m0, m2, m2, 0             ; coeff arrives in xmm2; m0 must stay xmm0 for the return value
+%else
+    shufps m0, m0, 0                 ; coeff arrives in xmm0
+%endif
+
+    movaps m4, [tab_st]              ; c1, c2, c3, c4 (weights of the carried-in state)
+    movaps m5, [tab_x0]              ;  0, c1, c1, c1
+    movaps m6, [tab_x1]              ;  0,  0, c2, c2
+    movaps m7, [tab_x2]              ;  0,  0,  0, c3
+
+.loop:
+    movaps  m1, [inq]                ; x0, x1, x2, x3
+
+    pslldq  m2, m1, 4                ;  0, x0, x1, x2
+    pslldq  m3, m1, 8                ;  0,  0, x0, x1
+
+    fmaddps m2, m2, m5, m1           ; x + c1*x[0-2]
+    pslldq  m1, 12                   ;  0,  0,  0, x0
+
+    fmaddps m2, m3, m6, m2           ; x + c1*x[0-2] + c2*x[0-1]
+    fmaddps m1, m1, m7, m2           ; x + c1*x[0-2] + c2*x[0-1] + c3*x[0]
+    fmaddps m0, m0, m4, m1           ; x + c1*x[0-2] + c2*x[0-1] + c3*x[0] + c*s
+
+    movaps [outq], m0
+    shufps m0, m0, q3333             ; new state = last output sample, in every lane
+
+    add inq,  mmsize
+    add outq, mmsize
+    sub lend, mmsize >> 2            ; len is an int: count in 32 bits, upper half of lenq is undefined
+    jg .loop
+
+%if ARCH_X86_64 == 0
+    movss r0m, m0                    ; x86-32 returns the float on the x87 stack:
+    fld dword r0m                    ; bounce it through the stack slot of argument 0
+%endif
+    RET
+
+
+; void opus_postfilter(float *data, int period, float *gains, int len): in place, data[i] += 5 weighted taps around data[i - period]
+INIT_XMM fma3
+cglobal opus_postfilter, 4, 4, 8, data, period, gains, len
+    VBROADCASTSS m0, [gainsq + 0]    ; g0, weight of data[i - period]
+    VBROADCASTSS m1, [gainsq + 4]    ; g1, weight of data[i - period -+ 1]
+    VBROADCASTSS m2, [gainsq + 8]    ; g2, weight of data[i - period -+ 2]
+
+    shl periodd, 2                   ; period is an int: the 32-bit shift clears the undefined upper half
+    add periodq, 8
+    neg periodq                      ; periodq = byte offset of data[-period - 2]
+
+    movups  m3, [dataq + periodq]
+    mulps   m3, m2                   ; g2*data[i - period - 2] for the first iteration
+
+.loop:
+    movups  m4, [dataq + periodq +  4] ; data[i - period - 1]
+    movups  m5, [dataq + periodq +  8] ; data[i - period]
+    movups  m6, [dataq + periodq + 12] ; data[i - period + 1]
+    movups  m7, [dataq + periodq + 16] ; data[i - period + 2]
+
+    fmaddps m3, m7, m2, m3           ; g2*(data[i - period + 2] + data[i - period - 2])
+    addps   m6, m4                   ; data[i - period + 1] + data[i - period - 1]
+
+    fmaddps m5, m5, m0, [dataq]      ; data[i] + g0*data[i - period]
+    fmaddps m6, m6, m1, m3           ; g1 term + g2 term
+
+    addps   m5, m6
+    mulps   m3, m7, m2               ; these samples are the "- 2" tap of the next iteration
+    movaps  [dataq], m5
+
+    add dataq, mmsize
+    sub lend,  mmsize >> 2           ; len is an int: count in 32 bits, upper half of lenq is undefined
+    jg .loop
+    RET
diff --git a/libavcodec/x86/opusdsp_init.c b/libavcodec/x86/opusdsp_init.c
new file mode 100644
index 0000000000..6834c4e6a4
--- /dev/null
+++ b/libavcodec/x86/opusdsp_init.c
@@ -0,0 +1,35 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/opusdsp.h"
+
+void ff_opus_postfilter_fma3(float *data, int period, float *gains, int len);
+float ff_opus_deemphasis_fma3(float *out, float *in, float coeff, int len);
+
+av_cold void ff_opus_dsp_init_x86(OpusDSP *ctx)
+{
+    const int flags = av_get_cpu_flags();
+
+    if (!(EXTERNAL_FMA3_FAST(flags)))
+        return; /* leave the function pointers already stored in ctx untouched */
+    ctx->postfilter = ff_opus_postfilter_fma3;
+    ctx->deemphasis = ff_opus_deemphasis_fma3;
+}
-- 
2.21.0