diff mbox

[FFmpeg-devel,7/9] sbcenc: add MMX optimizations

Message ID 20180221223718.20789-8-aurel@gnuage.org
State Superseded
Headers show

Commit Message

Aurelien Jacobs Feb. 21, 2018, 10:37 p.m. UTC
This was originally based on libsbc, and was fully integrated into ffmpeg.

Rough speed test:
C version:    speed= 592x
MMX version:  speed= 785x
---
 libavcodec/sbcdsp.c          |   3 +
 libavcodec/sbcdsp.h          |   2 +
 libavcodec/x86/Makefile      |   2 +
 libavcodec/x86/sbcdsp.asm    | 285 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/sbcdsp_init.c |  51 ++++++++
 5 files changed, 343 insertions(+)
 create mode 100644 libavcodec/x86/sbcdsp.asm
 create mode 100644 libavcodec/x86/sbcdsp_init.c

Comments

Rostislav Pehlivanov Feb. 22, 2018, 5:21 p.m. UTC | #1
On 21 February 2018 at 22:37, Aurelien Jacobs <aurel@gnuage.org> wrote:

> This was originally based on libsbc, and was fully integrated into ffmpeg.
>
> Rough speed test:
> C version:    speed= 592x
> MMX version:  speed= 785x
> ---
>  libavcodec/sbcdsp.c          |   3 +
>  libavcodec/sbcdsp.h          |   2 +
>  libavcodec/x86/Makefile      |   2 +
>  libavcodec/x86/sbcdsp.asm    | 285 ++++++++++++++++++++++++++++++
> +++++++++++++
>  libavcodec/x86/sbcdsp_init.c |  51 ++++++++
>  5 files changed, 343 insertions(+)
>  create mode 100644 libavcodec/x86/sbcdsp.asm
>  create mode 100644 libavcodec/x86/sbcdsp_init.c
>
> diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c
> index e155387f0d..2d0addcf28 100644
> --- a/libavcodec/sbcdsp.c
> +++ b/libavcodec/sbcdsp.c
> @@ -379,4 +379,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s)
>      /* Default implementation for scale factors calculation */
>      s->sbc_calc_scalefactors = sbc_calc_scalefactors;
>      s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
> +
> +    if (ARCH_X86)
> +        ff_sbcdsp_init_x86(s);
>  }
> diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h
> index 66ed7d324e..127e6a8a11 100644
> --- a/libavcodec/sbcdsp.h
> +++ b/libavcodec/sbcdsp.h
> @@ -80,4 +80,6 @@ struct sbc_dsp_context {
>   */
>  void ff_sbcdsp_init(SBCDSPContext *s);
>
> +void ff_sbcdsp_init_x86(SBCDSPContext *s);
> +
>  #endif /* AVCODEC_SBCDSP_H */
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index a805cd37b4..2350c8bbee 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER)             +=
> x86/pngdsp_init.o
>  OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
>  OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
>  OBJS-$(CONFIG_RV40_DECODER)            += x86/rv40dsp_init.o
> +OBJS-$(CONFIG_SBC_ENCODER)             += x86/sbcdsp_init.o
>  OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
>  OBJS-$(CONFIG_TAK_DECODER)             += x86/takdsp_init.o
>  OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
> @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER)      += x86/pngdsp.o
>  X86ASM-OBJS-$(CONFIG_PRORES_DECODER)   += x86/proresdsp.o
>  X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
>  X86ASM-OBJS-$(CONFIG_RV40_DECODER)     += x86/rv40dsp.o
> +X86ASM-OBJS-$(CONFIG_SBC_ENCODER)      += x86/sbcdsp.o
>  X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER)     += x86/svq1enc.o
>  X86ASM-OBJS-$(CONFIG_TAK_DECODER)      += x86/takdsp.o
>  X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER)   += x86/mlpdsp.o
> diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
> new file mode 100644
> index 0000000000..4e02263a63
> --- /dev/null
> +++ b/libavcodec/x86/sbcdsp.asm
> @@ -0,0 +1,285 @@
> +;**********************************************************
> ********************
> +;* SIMD optimized SBC encoder DSP functions
> +;*
> +;* Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
> +;* Copyright (C) 2008-2010  Nokia Corporation
> +;* Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
> +;* Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
> +;* Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> +;**********************************************************
> ********************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +scale_mask: times 2 dd 0x8000    ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
> +
> +SECTION .text
> +
> +;*******************************************************************
> +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t
> *consts);
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
> +    movq          m0, [inq]
> +    movq          m1, [inq+8]
> +    pmaddwd       m0, [constsq]
> +    pmaddwd       m1, [constsq+8]
> +    paddd         m0, [scale_mask]
> +    paddd         m1, [scale_mask]
> +
> +    movq          m2, [inq+16]
> +    movq          m3, [inq+24]
> +    pmaddwd       m2, [constsq+16]
> +    pmaddwd       m3, [constsq+24]
> +    paddd         m0, m2
> +    paddd         m1, m3
> +
> +    movq          m2, [inq+32]
> +    movq          m3, [inq+40]
> +    pmaddwd       m2, [constsq+32]
> +    pmaddwd       m3, [constsq+40]
> +    paddd         m0, m2
> +    paddd         m1, m3
> +
> +    movq          m2, [inq+48]
> +    movq          m3, [inq+56]
> +    pmaddwd       m2, [constsq+48]
> +    pmaddwd       m3, [constsq+56]
> +    paddd         m0, m2
> +    paddd         m1, m3
> +
> +    movq          m2, [inq+64]
> +    movq          m3, [inq+72]
> +    pmaddwd       m2, [constsq+64]
> +    pmaddwd       m3, [constsq+72]
> +    paddd         m0, m2
> +    paddd         m1, m3
>

You can turn the top 3 blocks into a macro.


> +
> +    psrad         m0, 16    ; SBC_PROTO_FIXED_SCALE
> +    psrad         m1, 16    ; SBC_PROTO_FIXED_SCALE
> +    packssdw      m0, m0
> +    packssdw      m1, m1
> +
> +    movq          m2, m0
> +    pmaddwd       m0, [constsq+80]
> +    pmaddwd       m2, [constsq+88]
> +
> +    movq          m3, m1
> +    pmaddwd       m1, [constsq+96]
> +    pmaddwd       m3, [constsq+104]
> +    paddd         m0, m1
> +    paddd         m2, m3
> +
> +    movq          [outq  ], m0
> +    movq          [outq+8], m2
> +
> +    RET
> +
> +
> +
> +;*******************************************************************
> +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t
> *consts);
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
> +    movq          m0, [inq]
> +    movq          m1, [inq+8]
> +    movq          m2, [inq+16]
> +    movq          m3, [inq+24]
> +    pmaddwd       m0, [constsq]
> +    pmaddwd       m1, [constsq+8]
> +    pmaddwd       m2, [constsq+16]
> +    pmaddwd       m3, [constsq+24]
> +    paddd         m0, [scale_mask]
> +    paddd         m1, [scale_mask]
> +    paddd         m2, [scale_mask]
> +    paddd         m3, [scale_mask]
> +
> +    movq          m4, [inq+32]
> +    movq          m5, [inq+40]
> +    movq          m6, [inq+48]
> +    movq          m7, [inq+56]
> +    pmaddwd       m4, [constsq+32]
> +    pmaddwd       m5, [constsq+40]
> +    pmaddwd       m6, [constsq+48]
> +    pmaddwd       m7, [constsq+56]
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    paddd         m2, m6
> +    paddd         m3, m7
> +
> +    movq          m4, [inq+64]
> +    movq          m5, [inq+72]
> +    movq          m6, [inq+80]
> +    movq          m7, [inq+88]
> +    pmaddwd       m4, [constsq+64]
> +    pmaddwd       m5, [constsq+72]
> +    pmaddwd       m6, [constsq+80]
> +    pmaddwd       m7, [constsq+88]
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    paddd         m2, m6
> +    paddd         m3, m7
> +
> +    movq          m4, [inq+96]
> +    movq          m5, [inq+104]
> +    movq          m6, [inq+112]
> +    movq          m7, [inq+120]
> +    pmaddwd       m4, [constsq+96]
> +    pmaddwd       m5, [constsq+104]
> +    pmaddwd       m6, [constsq+112]
> +    pmaddwd       m7, [constsq+120]
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    paddd         m2, m6
> +    paddd         m3, m7
> +
> +    movq          m4, [inq+128]
> +    movq          m5, [inq+136]
> +    movq          m6, [inq+144]
> +    movq          m7, [inq+152]
> +    pmaddwd       m4, [constsq+128]
> +    pmaddwd       m5, [constsq+136]
> +    pmaddwd       m6, [constsq+144]
> +    pmaddwd       m7, [constsq+152]
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    paddd         m2, m6
> +    paddd         m3, m7
>

And those 5 blocks as well.


> +
> +    psrad         m0, 16    ; SBC_PROTO_FIXED_SCALE
> +    psrad         m1, 16    ; SBC_PROTO_FIXED_SCALE
> +    psrad         m2, 16    ; SBC_PROTO_FIXED_SCALE
> +    psrad         m3, 16    ; SBC_PROTO_FIXED_SCALE
> +
> +    packssdw      m0, m0
> +    packssdw      m1, m1
> +    packssdw      m2, m2
> +    packssdw      m3, m3
> +
> +    movq          m4, m0
> +    movq          m5, m0
> +    pmaddwd       m4, [constsq+160]
> +    pmaddwd       m5, [constsq+168]
> +
> +    movq          m6, m1
> +    movq          m7, m1
> +    pmaddwd       m6, [constsq+192]
> +    pmaddwd       m7, [constsq+200]
> +    paddd         m4, m6
> +    paddd         m5, m7
> +
> +    movq          m6, m2
> +    movq          m7, m2
> +    pmaddwd       m6, [constsq+224]
> +    pmaddwd       m7, [constsq+232]
> +    paddd         m4, m6
> +    paddd         m5, m7
> +
> +    movq          m6, m3
> +    movq          m7, m3
> +    pmaddwd       m6, [constsq+256]
> +    pmaddwd       m7, [constsq+264]
> +    paddd         m4, m6
> +    paddd         m5, m7
>

Reuse the first macro here

Should save quite a bit of code
diff mbox

Patch

diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c
index e155387f0d..2d0addcf28 100644
--- a/libavcodec/sbcdsp.c
+++ b/libavcodec/sbcdsp.c
@@ -379,4 +379,7 @@  av_cold void ff_sbcdsp_init(SBCDSPContext *s)
     /* Default implementation for scale factors calculation */
     s->sbc_calc_scalefactors = sbc_calc_scalefactors;
     s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
+
+    if (ARCH_X86)
+        ff_sbcdsp_init_x86(s);
 }
diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h
index 66ed7d324e..127e6a8a11 100644
--- a/libavcodec/sbcdsp.h
+++ b/libavcodec/sbcdsp.h
@@ -80,4 +80,6 @@  struct sbc_dsp_context {
  */
 void ff_sbcdsp_init(SBCDSPContext *s);
 
+void ff_sbcdsp_init_x86(SBCDSPContext *s);
+
 #endif /* AVCODEC_SBCDSP_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a805cd37b4..2350c8bbee 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -63,6 +63,7 @@  OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
 OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
 OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)            += x86/rv40dsp_init.o
+OBJS-$(CONFIG_SBC_ENCODER)             += x86/sbcdsp_init.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
 OBJS-$(CONFIG_TAK_DECODER)             += x86/takdsp_init.o
 OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
@@ -172,6 +173,7 @@  X86ASM-OBJS-$(CONFIG_PNG_DECODER)      += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_PRORES_DECODER)   += x86/proresdsp.o
 X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
 X86ASM-OBJS-$(CONFIG_RV40_DECODER)     += x86/rv40dsp.o
+X86ASM-OBJS-$(CONFIG_SBC_ENCODER)      += x86/sbcdsp.o
 X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER)     += x86/svq1enc.o
 X86ASM-OBJS-$(CONFIG_TAK_DECODER)      += x86/takdsp.o
 X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER)   += x86/mlpdsp.o
diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
new file mode 100644
index 0000000000..4e02263a63
--- /dev/null
+++ b/libavcodec/x86/sbcdsp.asm
@@ -0,0 +1,285 @@ 
+;******************************************************************************
+;* SIMD optimized SBC encoder DSP functions
+;*
+;* Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+;* Copyright (C) 2008-2010  Nokia Corporation
+;* Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+;* Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+;* Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+scale_mask: times 2 dd 0x8000    ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
+
+SECTION .text
+
+;**** two pmaddwd stages: 40 int16 read from in, 4 int32 written to out ****
+;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
+    movq          m0, [inq]             ; in[0..3]
+    movq          m1, [inq+8]           ; in[4..7]
+    pmaddwd       m0, [constsq]         ; multiply int16 pairs, add adjacent products -> 2 x int32
+    pmaddwd       m1, [constsq+8]
+    paddd         m0, [scale_mask]      ; add 0x8000 to every lane: rounding bias for the >> 16 below
+    paddd         m1, [scale_mask]
+    ; stage 1, continued: add the products of bytes 16..79 of in and consts to m0/m1
+    movq          m2, [inq+16]
+    movq          m3, [inq+24]
+    pmaddwd       m2, [constsq+16]
+    pmaddwd       m3, [constsq+24]
+    paddd         m0, m2
+    paddd         m1, m3
+
+    movq          m2, [inq+32]
+    movq          m3, [inq+40]
+    pmaddwd       m2, [constsq+32]
+    pmaddwd       m3, [constsq+40]
+    paddd         m0, m2
+    paddd         m1, m3
+
+    movq          m2, [inq+48]
+    movq          m3, [inq+56]
+    pmaddwd       m2, [constsq+48]
+    pmaddwd       m3, [constsq+56]
+    paddd         m0, m2
+    paddd         m1, m3
+
+    movq          m2, [inq+64]
+    movq          m3, [inq+72]
+    pmaddwd       m2, [constsq+64]
+    pmaddwd       m3, [constsq+72]
+    paddd         m0, m2
+    paddd         m1, m3
+    ; arithmetic shift right by 16, then pack to int16 with signed saturation
+    psrad         m0, 16    ; SBC_PROTO_FIXED_SCALE
+    psrad         m1, 16    ; SBC_PROTO_FIXED_SCALE
+    packssdw      m0, m0                ; both halves hold the same two words: [a, b, a, b]
+    packssdw      m1, m1
+    ; stage 2: multiply the packed sums with consts bytes 80..111 and add both halves
+    movq          m2, m0
+    pmaddwd       m0, [constsq+80]
+    pmaddwd       m2, [constsq+88]
+
+    movq          m3, m1
+    pmaddwd       m1, [constsq+96]
+    pmaddwd       m3, [constsq+104]
+    paddd         m0, m1
+    paddd         m2, m3
+    ; m0 -> out[0..1], m2 -> out[2..3]
+    movq          [outq  ], m0
+    movq          [outq+8], m2
+    ; NOTE(review): returns without emms - presumably left to the caller, confirm
+    RET
+
+
+
+;**** two pmaddwd stages: 80 int16 read from in, 8 int32 written to out ****
+;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
+    movq          m0, [inq]             ; in[0..3]
+    movq          m1, [inq+8]           ; in[4..7]
+    movq          m2, [inq+16]          ; in[8..11]
+    movq          m3, [inq+24]          ; in[12..15]
+    pmaddwd       m0, [constsq]         ; multiply int16 pairs, add adjacent products -> 2 x int32
+    pmaddwd       m1, [constsq+8]
+    pmaddwd       m2, [constsq+16]
+    pmaddwd       m3, [constsq+24]
+    paddd         m0, [scale_mask]      ; add 0x8000 to every lane: rounding bias for the >> 16 below
+    paddd         m1, [scale_mask]
+    paddd         m2, [scale_mask]
+    paddd         m3, [scale_mask]
+    ; NOTE(review): cglobal above declares 4 vector registers, but m4-m7 are used from here on
+    movq          m4, [inq+32]
+    movq          m5, [inq+40]
+    movq          m6, [inq+48]
+    movq          m7, [inq+56]
+    pmaddwd       m4, [constsq+32]
+    pmaddwd       m5, [constsq+40]
+    pmaddwd       m6, [constsq+48]
+    pmaddwd       m7, [constsq+56]
+    paddd         m0, m4
+    paddd         m1, m5
+    paddd         m2, m6
+    paddd         m3, m7
+    ; stage 1, continued: add the products of bytes 64..159 of in and consts to m0-m3
+    movq          m4, [inq+64]
+    movq          m5, [inq+72]
+    movq          m6, [inq+80]
+    movq          m7, [inq+88]
+    pmaddwd       m4, [constsq+64]
+    pmaddwd       m5, [constsq+72]
+    pmaddwd       m6, [constsq+80]
+    pmaddwd       m7, [constsq+88]
+    paddd         m0, m4
+    paddd         m1, m5
+    paddd         m2, m6
+    paddd         m3, m7
+
+    movq          m4, [inq+96]
+    movq          m5, [inq+104]
+    movq          m6, [inq+112]
+    movq          m7, [inq+120]
+    pmaddwd       m4, [constsq+96]
+    pmaddwd       m5, [constsq+104]
+    pmaddwd       m6, [constsq+112]
+    pmaddwd       m7, [constsq+120]
+    paddd         m0, m4
+    paddd         m1, m5
+    paddd         m2, m6
+    paddd         m3, m7
+
+    movq          m4, [inq+128]
+    movq          m5, [inq+136]
+    movq          m6, [inq+144]
+    movq          m7, [inq+152]
+    pmaddwd       m4, [constsq+128]
+    pmaddwd       m5, [constsq+136]
+    pmaddwd       m6, [constsq+144]
+    pmaddwd       m7, [constsq+152]
+    paddd         m0, m4
+    paddd         m1, m5
+    paddd         m2, m6
+    paddd         m3, m7
+    ; arithmetic shift right by 16, then pack to int16 with signed saturation
+    psrad         m0, 16    ; SBC_PROTO_FIXED_SCALE
+    psrad         m1, 16    ; SBC_PROTO_FIXED_SCALE
+    psrad         m2, 16    ; SBC_PROTO_FIXED_SCALE
+    psrad         m3, 16    ; SBC_PROTO_FIXED_SCALE
+
+    packssdw      m0, m0                ; both halves hold the same two words: [a, b, a, b]
+    packssdw      m1, m1
+    packssdw      m2, m2
+    packssdw      m3, m3
+    ; stage 2, first half: copies of m0-m3 times consts bytes 160.., 192.., 224.., 256..
+    movq          m4, m0
+    movq          m5, m0
+    pmaddwd       m4, [constsq+160]
+    pmaddwd       m5, [constsq+168]
+
+    movq          m6, m1
+    movq          m7, m1
+    pmaddwd       m6, [constsq+192]
+    pmaddwd       m7, [constsq+200]
+    paddd         m4, m6
+    paddd         m5, m7
+
+    movq          m6, m2
+    movq          m7, m2
+    pmaddwd       m6, [constsq+224]
+    pmaddwd       m7, [constsq+232]
+    paddd         m4, m6
+    paddd         m5, m7
+
+    movq          m6, m3
+    movq          m7, m3
+    pmaddwd       m6, [constsq+256]
+    pmaddwd       m7, [constsq+264]
+    paddd         m4, m6
+    paddd         m5, m7
+    ; m4 -> out[0..1], m5 -> out[2..3]
+    movq          [outq  ], m4
+    movq          [outq+8], m5
+    ; stage 2, second half: m0-m3 are consumed in place, consts bytes 176.., 208.., 240.., 272..
+    movq          m5, m0
+    pmaddwd       m0, [constsq+176]
+    pmaddwd       m5, [constsq+184]
+
+    movq          m7, m1
+    pmaddwd       m1, [constsq+208]
+    pmaddwd       m7, [constsq+216]
+    paddd         m0, m1
+    paddd         m5, m7
+
+    movq          m7, m2
+    pmaddwd       m2, [constsq+240]
+    pmaddwd       m7, [constsq+248]
+    paddd         m0, m2
+    paddd         m5, m7
+
+    movq          m7, m3
+    pmaddwd       m3, [constsq+272]
+    pmaddwd       m7, [constsq+280]
+    paddd         m0, m3
+    paddd         m5, m7
+    ; m0 -> out[4..5], m5 -> out[6..7]
+    movq          [outq+16], m0
+    movq          [outq+24], m5
+    ; NOTE(review): returns without emms - presumably left to the caller, confirm
+    RET
+
+
+;*******************************************************************
+;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
+;                              uint32_t scale_factor[2][8],
+;                              int blocks, int channels, int subbands)
+; scale_factor[ch][sb] = bsr(0x8000 | OR over blocks of (|x| - 1, 0 if x == 0)) - 15,
+; two subbands per iteration. A channel row is 8 entries (32 bytes) wide in
+; both arrays, so rows are walked one by one: with 4 subbands the rows of
+; the two channels are not contiguous.
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
+    movq          m3, [scale_mask]    ; 1 << SCALE_OUT_BITS in both lanes
+    shl           subbandsd, 2        ; bytes of active subbands per channel row
+    cmp           channelsd, 2
+    mov           channelsd, subbandsd ; mov leaves the flags of the cmp intact
+    jl            .loop_1             ; fewer than 2 channels: only row 0
+    add           channelsd, 32       ; start past the active subbands of row 1
+    DEFINE_ARGS sb_sample_f, scale_factor, blocks, pos, rowsz, ptr, blk
+.loop_1:                              ; pos = byte offset just past the next subband pair
+    sub           posq, 8
+    lea           ptrq, [sb_sample_fq + posq]
+    lea           blkq, [blocksq - 1]
+    shl           blkd, 6             ; blk = (blocks - 1) * 64, one block is 64 bytes
+    movq          m0, m3
+.loop_2:
+    movq          m1, [ptrq+blkq]
+    pxor          m2, m2
+    pcmpgtd       m1, m2              ; -1 in lanes where x > 0
+    paddd         m1, [ptrq+blkq]     ; x - 1 for positive x, x otherwise
+    pcmpgtd       m2, m1              ; -1 in lanes where that value is negative
+    pxor          m1, m2              ; invert the negative lanes: |x| - 1, 0 for x == 0
+    por           m0, m1
+    sub           blkq, 64
+    jns           .loop_2
+
+    movd          blkd, m0
+    psrlq         m0,   32
+    bsr           blkd, blkd          ; never zero: bit 15 is always set
+    sub           blkd, 15    ; SCALE_OUT_BITS
+    mov           [scale_factorq + posq], blkd
+    movd          blkd, m0
+    bsr           blkd, blkd
+    sub           blkd, 15    ; SCALE_OUT_BITS
+    mov           [scale_factorq + posq + 4], blkd
+
+    test          posd, 31            ; not yet at the start of the current row?
+    jnz           .loop_1
+    lea           posq, [posq + rowszq - 32] ; past the active subbands of the previous row
+    cmp           posq, 0
+    jg            .loop_1             ; <= 0 once row 0 has been processed
+    emms
+    RET
diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c
new file mode 100644
index 0000000000..86effecfdf
--- /dev/null
+++ b/libavcodec/x86/sbcdsp_init.c
@@ -0,0 +1,51 @@ 
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
+ * Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
+ * Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC MMX optimization for some basic "building bricks"
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/sbcdsp.h"
+
+void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8],
+                                  uint32_t scale_factor[2][8],
+                                  int blocks, int channels, int subbands);
+
+/* Point the SBC DSP hooks at the MMX routines when EXTERNAL_MMX() accepts the CPU flags. */
+av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
+{
+    const int flags = av_get_cpu_flags();
+    if (!EXTERNAL_MMX(flags))
+        return; /* s is left exactly as the caller set it up */
+    s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
+    s->sbc_analyze_8         = ff_sbc_analyze_8_mmx;
+    s->sbc_analyze_4         = ff_sbc_analyze_4_mmx;
+}