Message ID | 20171217214720.3753-6-aurel@gnuage.org |
---|---|
State | Superseded |
Headers | show |
2017-12-17 22:47 GMT+01:00 Aurelien Jacobs <aurel@gnuage.org>:
> This was originally based on libsbc, and was fully integrated into ffmpeg.
Very rough numbers are useful in the commit message.
Carl Eugen
On 12/17/2017 6:47 PM, Aurelien Jacobs wrote: > This was originally based on libsbc, and was fully integrated into ffmpeg. > --- > libavcodec/sbcdsp.c | 3 + > libavcodec/sbcdsp.h | 2 + > libavcodec/x86/Makefile | 2 + > libavcodec/x86/sbcdsp.asm | 284 +++++++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/sbcdsp_init.c | 51 ++++++++ > 5 files changed, 342 insertions(+) > create mode 100644 libavcodec/x86/sbcdsp.asm > create mode 100644 libavcodec/x86/sbcdsp_init.c > > diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c > index 16faf5ba9b..9bb60cdd5e 100644 > --- a/libavcodec/sbcdsp.c > +++ b/libavcodec/sbcdsp.c > @@ -387,4 +387,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s) > /* Default implementation for scale factors calculation */ > s->sbc_calc_scalefactors = sbc_calc_scalefactors; > s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j; > + > + if (ARCH_X86) > + ff_sbcdsp_init_x86(s); > } > diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h > index 66ed7d324e..127e6a8a11 100644 > --- a/libavcodec/sbcdsp.h > +++ b/libavcodec/sbcdsp.h > @@ -80,4 +80,6 @@ struct sbc_dsp_context { > */ > void ff_sbcdsp_init(SBCDSPContext *s); > > +void ff_sbcdsp_init_x86(SBCDSPContext *s); > + > #endif /* AVCODEC_SBCDSP_H */ > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index a805cd37b4..2350c8bbee 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o > OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o > OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o > OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o > +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o > OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o > OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o > OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o > @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o > X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o > 
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o > X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o > +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o > X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o > X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o > X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o > diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm > new file mode 100644 > index 0000000000..00b48a821b > --- /dev/null > +++ b/libavcodec/x86/sbcdsp.asm > @@ -0,0 +1,284 @@ > +;****************************************************************************** > +;* SIMD optimized SBC encoder DSP functions > +;* > +;* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> > +;* Copyright (C) 2008-2010 Nokia Corporation > +;* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> > +;* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> > +;* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. 
> +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) > + > +SECTION .text > + > +;******************************************************************* > +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts); > +;******************************************************************* > +INIT_MMX mmx > +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts > + movq m0, [inq] > + movq m1, [inq+8] > + pmaddwd m0, [constsq] > + pmaddwd m1, [constsq+8] > + paddd m0, [scale_mask] > + paddd m1, [scale_mask] > + > + movq m2, [inq+16] > + movq m3, [inq+24] > + pmaddwd m2, [constsq+16] > + pmaddwd m3, [constsq+24] > + paddd m0, m2 > + paddd m1, m3 > + > + movq m2, [inq+32] > + movq m3, [inq+40] > + pmaddwd m2, [constsq+32] > + pmaddwd m3, [constsq+40] > + paddd m0, m2 > + paddd m1, m3 > + > + movq m2, [inq+48] > + movq m3, [inq+56] > + pmaddwd m2, [constsq+48] > + pmaddwd m3, [constsq+56] > + paddd m0, m2 > + paddd m1, m3 > + > + movq m2, [inq+64] > + movq m3, [inq+72] > + pmaddwd m2, [constsq+64] > + pmaddwd m3, [constsq+72] > + paddd m0, m2 > + paddd m1, m3 > + > + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE > + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE > + packssdw m0, m0 > + packssdw m1, m1 > + > + movq m2, m0 > + pmaddwd m0, [constsq+80] > + pmaddwd m2, [constsq+88] > + > + movq m3, m1 > + pmaddwd m1, [constsq+96] > + pmaddwd m3, [constsq+104] > + paddd m0, m1 > + paddd m2, m3 > + > + movq [outq ], m0 > + movq [outq+8], m2 > + > + RET > + > + > + > +;******************************************************************* > +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, 
const int16_t *consts); > +;******************************************************************* > +INIT_MMX mmx > +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts > + movq m0, [inq] > + movq m1, [inq+8] > + movq m2, [inq+16] > + movq m3, [inq+24] > + pmaddwd m0, [constsq] > + pmaddwd m1, [constsq+8] > + pmaddwd m2, [constsq+16] > + pmaddwd m3, [constsq+24] > + paddd m0, [scale_mask] > + paddd m1, [scale_mask] > + paddd m2, [scale_mask] > + paddd m3, [scale_mask] > + > + movq m4, [inq+32] > + movq m5, [inq+40] > + movq m6, [inq+48] > + movq m7, [inq+56] > + pmaddwd m4, [constsq+32] > + pmaddwd m5, [constsq+40] > + pmaddwd m6, [constsq+48] > + pmaddwd m7, [constsq+56] > + paddd m0, m4 > + paddd m1, m5 > + paddd m2, m6 > + paddd m3, m7 > + > + movq m4, [inq+64] > + movq m5, [inq+72] > + movq m6, [inq+80] > + movq m7, [inq+88] > + pmaddwd m4, [constsq+64] > + pmaddwd m5, [constsq+72] > + pmaddwd m6, [constsq+80] > + pmaddwd m7, [constsq+88] > + paddd m0, m4 > + paddd m1, m5 > + paddd m2, m6 > + paddd m3, m7 > + > + movq m4, [inq+96] > + movq m5, [inq+104] > + movq m6, [inq+112] > + movq m7, [inq+120] > + pmaddwd m4, [constsq+96] > + pmaddwd m5, [constsq+104] > + pmaddwd m6, [constsq+112] > + pmaddwd m7, [constsq+120] > + paddd m0, m4 > + paddd m1, m5 > + paddd m2, m6 > + paddd m3, m7 > + > + movq m4, [inq+128] > + movq m5, [inq+136] > + movq m6, [inq+144] > + movq m7, [inq+152] > + pmaddwd m4, [constsq+128] > + pmaddwd m5, [constsq+136] > + pmaddwd m6, [constsq+144] > + pmaddwd m7, [constsq+152] > + paddd m0, m4 > + paddd m1, m5 > + paddd m2, m6 > + paddd m3, m7 > + > + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE > + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE > + psrad m2, 16 ; SBC_PROTO_FIXED_SCALE > + psrad m3, 16 ; SBC_PROTO_FIXED_SCALE > + > + packssdw m0, m0 > + packssdw m1, m1 > + packssdw m2, m2 > + packssdw m3, m3 > + > + movq m4, m0 > + movq m5, m0 > + pmaddwd m4, [constsq+160] > + pmaddwd m5, [constsq+168] > + > + movq m6, m1 > + movq m7, m1 > + pmaddwd m6, [constsq+192] 
> + pmaddwd m7, [constsq+200] > + paddd m4, m6 > + paddd m5, m7 > + > + movq m6, m2 > + movq m7, m2 > + pmaddwd m6, [constsq+224] > + pmaddwd m7, [constsq+232] > + paddd m4, m6 > + paddd m5, m7 > + > + movq m6, m3 > + movq m7, m3 > + pmaddwd m6, [constsq+256] > + pmaddwd m7, [constsq+264] > + paddd m4, m6 > + paddd m5, m7 > + > + movq [outq ], m4 > + movq [outq+8], m5 > + > + movq m5, m0 > + pmaddwd m0, [constsq+176] > + pmaddwd m5, [constsq+184] > + > + movq m7, m1 > + pmaddwd m1, [constsq+208] > + pmaddwd m7, [constsq+216] > + paddd m0, m1 > + paddd m5, m7 > + > + movq m7, m2 > + pmaddwd m2, [constsq+240] > + pmaddwd m7, [constsq+248] > + paddd m0, m2 > + paddd m5, m7 > + > + movq m7, m3 > + pmaddwd m3, [constsq+272] > + pmaddwd m7, [constsq+280] > + paddd m0, m3 > + paddd m5, m7 > + > + movq [outq+16], m0 > + movq [outq+24], m5 > + > + RET > + > + > +;******************************************************************* > +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], > +; uint32_t scale_factor[2][8], > +; int blocks, int channels, int subbands) > +;******************************************************************* > +INIT_MMX mmx > +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk > + ; subbands = 4 * subbands * channels > + shl subbandsq, 2 > + cmp channelsq, 2 > + jl .loop_1 > + shl subbandsq, 1 > + > +.loop_1: > + sub subbandsq, 8 > + lea ptrq, [sb_sample_fq + subbandsq] > + > + ; blk = (blocks - 1) * 64; > + lea blkq, [blocksq - 1] > + shl blkq, 6 > + > + movq m0, [scale_mask] > +.loop_2: > + movq m1, [ptrq+blkq] > + pxor m2, m2 > + pcmpgtd m1, m2 > + paddd m1, [ptrq+blkq] > + pcmpgtd m2, m1 > + pxor m1, m2 > + > + por m0, m1 > + > + sub blkq, 64 > + jns .loop_2 > + > + movd blkd, m0 > + psrlq m0, 32 > + bsr blkd, blkd > + sub blkd, 15 ; SCALE_OUT_BITS > + mov [scale_factorq + subbandsq], blkd > + > + movd blkd, m0 > + bsr blkd, blkd > + sub blkd, 15 ; SCALE_OUT_BITS > + mov 
[scale_factorq + subbandsq + 4], blkd > + > + cmp subbandsq, 0 > + jg .loop_1 > + > + emms > + RET These should be done in SSE2. There's no reason for them to be MMX. Especially with analyze_4 and analyze_8, you're running twice the amount of instructions you'd need for SSE2. > diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c > new file mode 100644 > index 0000000000..86effecfdf > --- /dev/null > +++ b/libavcodec/x86/sbcdsp_init.c > @@ -0,0 +1,51 @@ > +/* > + * Bluetooth low-complexity, subband codec (SBC) > + * > + * Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> > + * Copyright (C) 2008-2010 Nokia Corporation > + * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> > + * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> > + * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +/** > + * @file > + * SBC MMX optimization for some basic "building bricks" > + */ > + > +#include "libavutil/cpu.h" > +#include "libavutil/x86/cpu.h" > +#include "libavcodec/sbcdsp.h" > + > +void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts); > +void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts); > +void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8], > + uint32_t scale_factor[2][8], > + int blocks, int channels, int subbands); > + > +av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (EXTERNAL_MMX(cpu_flags)) { > + s->sbc_analyze_4 = ff_sbc_analyze_4_mmx; > + s->sbc_analyze_8 = ff_sbc_analyze_8_mmx; > + s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx; > + } > +} >
On Wed, Dec 20, 2017 at 03:47:35PM -0300, James Almer wrote: > On 12/17/2017 6:47 PM, Aurelien Jacobs wrote: > > This was originally based on libsbc, and was fully integrated into ffmpeg. > > --- > > libavcodec/sbcdsp.c | 3 + > > libavcodec/sbcdsp.h | 2 + > > libavcodec/x86/Makefile | 2 + > > libavcodec/x86/sbcdsp.asm | 284 +++++++++++++++++++++++++++++++++++++++++++ > > libavcodec/x86/sbcdsp_init.c | 51 ++++++++ > > 5 files changed, 342 insertions(+) > > create mode 100644 libavcodec/x86/sbcdsp.asm > > create mode 100644 libavcodec/x86/sbcdsp_init.c > > > > diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c > > index 16faf5ba9b..9bb60cdd5e 100644 > > --- a/libavcodec/sbcdsp.c > > +++ b/libavcodec/sbcdsp.c > > @@ -387,4 +387,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s) > > /* Default implementation for scale factors calculation */ > > s->sbc_calc_scalefactors = sbc_calc_scalefactors; > > s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j; > > + > > + if (ARCH_X86) > > + ff_sbcdsp_init_x86(s); > > } > > diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h > > index 66ed7d324e..127e6a8a11 100644 > > --- a/libavcodec/sbcdsp.h > > +++ b/libavcodec/sbcdsp.h > > @@ -80,4 +80,6 @@ struct sbc_dsp_context { > > */ > > void ff_sbcdsp_init(SBCDSPContext *s); > > > > +void ff_sbcdsp_init_x86(SBCDSPContext *s); > > + > > #endif /* AVCODEC_SBCDSP_H */ > > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > > index a805cd37b4..2350c8bbee 100644 > > --- a/libavcodec/x86/Makefile > > +++ b/libavcodec/x86/Makefile > > @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o > > OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o > > OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o > > OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o > > +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o > > OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o > > OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o > > OBJS-$(CONFIG_TRUEHD_DECODER) += 
x86/mlpdsp_init.o > > @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o > > X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o > > X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o > > X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o > > +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o > > X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o > > X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o > > X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o > > diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm > > new file mode 100644 > > index 0000000000..00b48a821b > > --- /dev/null > > +++ b/libavcodec/x86/sbcdsp.asm > > @@ -0,0 +1,284 @@ > > +;****************************************************************************** > > +;* SIMD optimized SBC encoder DSP functions > > +;* > > +;* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> > > +;* Copyright (C) 2008-2010 Nokia Corporation > > +;* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> > > +;* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> > > +;* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> > > +;* > > +;* This file is part of FFmpeg. > > +;* > > +;* FFmpeg is free software; you can redistribute it and/or > > +;* modify it under the terms of the GNU Lesser General Public > > +;* License as published by the Free Software Foundation; either > > +;* version 2.1 of the License, or (at your option) any later version. > > +;* > > +;* FFmpeg is distributed in the hope that it will be useful, > > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > +;* Lesser General Public License for more details. 
> > +;* > > +;* You should have received a copy of the GNU Lesser General Public > > +;* License along with FFmpeg; if not, write to the Free Software > > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > > +;****************************************************************************** > > + > > +%include "libavutil/x86/x86util.asm" > > + > > +SECTION_RODATA > > + > > +scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) > > + > > +SECTION .text > > + > > +;******************************************************************* > > +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts); > > +;******************************************************************* > > +INIT_MMX mmx > > +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts > > + movq m0, [inq] > > + movq m1, [inq+8] > > + pmaddwd m0, [constsq] > > + pmaddwd m1, [constsq+8] > > + paddd m0, [scale_mask] > > + paddd m1, [scale_mask] > > + > > + movq m2, [inq+16] > > + movq m3, [inq+24] > > + pmaddwd m2, [constsq+16] > > + pmaddwd m3, [constsq+24] > > + paddd m0, m2 > > + paddd m1, m3 > > + > > + movq m2, [inq+32] > > + movq m3, [inq+40] > > + pmaddwd m2, [constsq+32] > > + pmaddwd m3, [constsq+40] > > + paddd m0, m2 > > + paddd m1, m3 > > + > > + movq m2, [inq+48] > > + movq m3, [inq+56] > > + pmaddwd m2, [constsq+48] > > + pmaddwd m3, [constsq+56] > > + paddd m0, m2 > > + paddd m1, m3 > > + > > + movq m2, [inq+64] > > + movq m3, [inq+72] > > + pmaddwd m2, [constsq+64] > > + pmaddwd m3, [constsq+72] > > + paddd m0, m2 > > + paddd m1, m3 > > + > > + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE > > + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE > > + packssdw m0, m0 > > + packssdw m1, m1 > > + > > + movq m2, m0 > > + pmaddwd m0, [constsq+80] > > + pmaddwd m2, [constsq+88] > > + > > + movq m3, m1 > > + pmaddwd m1, [constsq+96] > > + pmaddwd m3, [constsq+104] > > + paddd m0, m1 > > + paddd m2, m3 > > + > > + movq [outq ], m0 > > + movq [outq+8], m2 > > + > > + RET > 
> + > > + > > + > > +;******************************************************************* > > +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts); > > +;******************************************************************* > > +INIT_MMX mmx > > +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts > > + movq m0, [inq] > > + movq m1, [inq+8] > > + movq m2, [inq+16] > > + movq m3, [inq+24] > > + pmaddwd m0, [constsq] > > + pmaddwd m1, [constsq+8] > > + pmaddwd m2, [constsq+16] > > + pmaddwd m3, [constsq+24] > > + paddd m0, [scale_mask] > > + paddd m1, [scale_mask] > > + paddd m2, [scale_mask] > > + paddd m3, [scale_mask] > > + > > + movq m4, [inq+32] > > + movq m5, [inq+40] > > + movq m6, [inq+48] > > + movq m7, [inq+56] > > + pmaddwd m4, [constsq+32] > > + pmaddwd m5, [constsq+40] > > + pmaddwd m6, [constsq+48] > > + pmaddwd m7, [constsq+56] > > + paddd m0, m4 > > + paddd m1, m5 > > + paddd m2, m6 > > + paddd m3, m7 > > + > > + movq m4, [inq+64] > > + movq m5, [inq+72] > > + movq m6, [inq+80] > > + movq m7, [inq+88] > > + pmaddwd m4, [constsq+64] > > + pmaddwd m5, [constsq+72] > > + pmaddwd m6, [constsq+80] > > + pmaddwd m7, [constsq+88] > > + paddd m0, m4 > > + paddd m1, m5 > > + paddd m2, m6 > > + paddd m3, m7 > > + > > + movq m4, [inq+96] > > + movq m5, [inq+104] > > + movq m6, [inq+112] > > + movq m7, [inq+120] > > + pmaddwd m4, [constsq+96] > > + pmaddwd m5, [constsq+104] > > + pmaddwd m6, [constsq+112] > > + pmaddwd m7, [constsq+120] > > + paddd m0, m4 > > + paddd m1, m5 > > + paddd m2, m6 > > + paddd m3, m7 > > + > > + movq m4, [inq+128] > > + movq m5, [inq+136] > > + movq m6, [inq+144] > > + movq m7, [inq+152] > > + pmaddwd m4, [constsq+128] > > + pmaddwd m5, [constsq+136] > > + pmaddwd m6, [constsq+144] > > + pmaddwd m7, [constsq+152] > > + paddd m0, m4 > > + paddd m1, m5 > > + paddd m2, m6 > > + paddd m3, m7 > > + > > + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE > > + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE > > + psrad m2, 16 ; 
SBC_PROTO_FIXED_SCALE > > + psrad m3, 16 ; SBC_PROTO_FIXED_SCALE > > + > > + packssdw m0, m0 > > + packssdw m1, m1 > > + packssdw m2, m2 > > + packssdw m3, m3 > > + > > + movq m4, m0 > > + movq m5, m0 > > + pmaddwd m4, [constsq+160] > > + pmaddwd m5, [constsq+168] > > + > > + movq m6, m1 > > + movq m7, m1 > > + pmaddwd m6, [constsq+192] > > + pmaddwd m7, [constsq+200] > > + paddd m4, m6 > > + paddd m5, m7 > > + > > + movq m6, m2 > > + movq m7, m2 > > + pmaddwd m6, [constsq+224] > > + pmaddwd m7, [constsq+232] > > + paddd m4, m6 > > + paddd m5, m7 > > + > > + movq m6, m3 > > + movq m7, m3 > > + pmaddwd m6, [constsq+256] > > + pmaddwd m7, [constsq+264] > > + paddd m4, m6 > > + paddd m5, m7 > > + > > + movq [outq ], m4 > > + movq [outq+8], m5 > > + > > + movq m5, m0 > > + pmaddwd m0, [constsq+176] > > + pmaddwd m5, [constsq+184] > > + > > + movq m7, m1 > > + pmaddwd m1, [constsq+208] > > + pmaddwd m7, [constsq+216] > > + paddd m0, m1 > > + paddd m5, m7 > > + > > + movq m7, m2 > > + pmaddwd m2, [constsq+240] > > + pmaddwd m7, [constsq+248] > > + paddd m0, m2 > > + paddd m5, m7 > > + > > + movq m7, m3 > > + pmaddwd m3, [constsq+272] > > + pmaddwd m7, [constsq+280] > > + paddd m0, m3 > > + paddd m5, m7 > > + > > + movq [outq+16], m0 > > + movq [outq+24], m5 > > + > > + RET > > + > > + > > +;******************************************************************* > > +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], > > +; uint32_t scale_factor[2][8], > > +; int blocks, int channels, int subbands) > > +;******************************************************************* > > +INIT_MMX mmx > > +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk > > + ; subbands = 4 * subbands * channels > > + shl subbandsq, 2 > > + cmp channelsq, 2 > > + jl .loop_1 > > + shl subbandsq, 1 > > + > > +.loop_1: > > + sub subbandsq, 8 > > + lea ptrq, [sb_sample_fq + subbandsq] > > + > > + ; blk = (blocks - 1) * 64; > > + lea blkq, 
[blocksq - 1] > > + shl blkq, 6 > > + > > + movq m0, [scale_mask] > > +.loop_2: > > + movq m1, [ptrq+blkq] > > + pxor m2, m2 > > + pcmpgtd m1, m2 > > + paddd m1, [ptrq+blkq] > > + pcmpgtd m2, m1 > > + pxor m1, m2 > > + > > + por m0, m1 > > + > > + sub blkq, 64 > > + jns .loop_2 > > + > > + movd blkd, m0 > > + psrlq m0, 32 > > + bsr blkd, blkd > > + sub blkd, 15 ; SCALE_OUT_BITS > > + mov [scale_factorq + subbandsq], blkd > > + > > + movd blkd, m0 > > + bsr blkd, blkd > > + sub blkd, 15 ; SCALE_OUT_BITS > > + mov [scale_factorq + subbandsq + 4], blkd > > + > > + cmp subbandsq, 0 > > + jg .loop_1 > > + > > + emms > > + RET > > These should be done in SSE2. There's no reason for them to be MMX. There is at least one reason for it to be MMX. It is existing legacy code that is ported to ffmpeg. So the MMX code is here and working. Of course writing an SSE2 version would certainly be a good idea. Do you volunteer?
On 12/20/2017 5:06 PM, Aurelien Jacobs wrote: > On Wed, Dec 20, 2017 at 03:47:35PM -0300, James Almer wrote: >> On 12/17/2017 6:47 PM, Aurelien Jacobs wrote: >>> This was originally based on libsbc, and was fully integrated into ffmpeg. >>> --- >>> libavcodec/sbcdsp.c | 3 + >>> libavcodec/sbcdsp.h | 2 + >>> libavcodec/x86/Makefile | 2 + >>> libavcodec/x86/sbcdsp.asm | 284 +++++++++++++++++++++++++++++++++++++++++++ >>> libavcodec/x86/sbcdsp_init.c | 51 ++++++++ >>> 5 files changed, 342 insertions(+) >>> create mode 100644 libavcodec/x86/sbcdsp.asm >>> create mode 100644 libavcodec/x86/sbcdsp_init.c >>> >>> diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c >>> index 16faf5ba9b..9bb60cdd5e 100644 >>> --- a/libavcodec/sbcdsp.c >>> +++ b/libavcodec/sbcdsp.c >>> @@ -387,4 +387,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s) >>> /* Default implementation for scale factors calculation */ >>> s->sbc_calc_scalefactors = sbc_calc_scalefactors; >>> s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j; >>> + >>> + if (ARCH_X86) >>> + ff_sbcdsp_init_x86(s); >>> } >>> diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h >>> index 66ed7d324e..127e6a8a11 100644 >>> --- a/libavcodec/sbcdsp.h >>> +++ b/libavcodec/sbcdsp.h >>> @@ -80,4 +80,6 @@ struct sbc_dsp_context { >>> */ >>> void ff_sbcdsp_init(SBCDSPContext *s); >>> >>> +void ff_sbcdsp_init_x86(SBCDSPContext *s); >>> + >>> #endif /* AVCODEC_SBCDSP_H */ >>> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile >>> index a805cd37b4..2350c8bbee 100644 >>> --- a/libavcodec/x86/Makefile >>> +++ b/libavcodec/x86/Makefile >>> @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o >>> OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o >>> OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o >>> OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o >>> +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o >>> OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o >>> OBJS-$(CONFIG_TAK_DECODER) += 
x86/takdsp_init.o >>> OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o >>> @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o >>> X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o >>> X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o >>> X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o >>> +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o >>> X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o >>> X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o >>> X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o >>> diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm >>> new file mode 100644 >>> index 0000000000..00b48a821b >>> --- /dev/null >>> +++ b/libavcodec/x86/sbcdsp.asm >>> @@ -0,0 +1,284 @@ >>> +;****************************************************************************** >>> +;* SIMD optimized SBC encoder DSP functions >>> +;* >>> +;* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> >>> +;* Copyright (C) 2008-2010 Nokia Corporation >>> +;* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> >>> +;* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> >>> +;* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> >>> +;* >>> +;* This file is part of FFmpeg. >>> +;* >>> +;* FFmpeg is free software; you can redistribute it and/or >>> +;* modify it under the terms of the GNU Lesser General Public >>> +;* License as published by the Free Software Foundation; either >>> +;* version 2.1 of the License, or (at your option) any later version. >>> +;* >>> +;* FFmpeg is distributed in the hope that it will be useful, >>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of >>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> +;* Lesser General Public License for more details. 
>>> +;* >>> +;* You should have received a copy of the GNU Lesser General Public >>> +;* License along with FFmpeg; if not, write to the Free Software >>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >>> +;****************************************************************************** >>> + >>> +%include "libavutil/x86/x86util.asm" >>> + >>> +SECTION_RODATA >>> + >>> +scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) >>> + >>> +SECTION .text >>> + >>> +;******************************************************************* >>> +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts); >>> +;******************************************************************* >>> +INIT_MMX mmx >>> +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts >>> + movq m0, [inq] >>> + movq m1, [inq+8] >>> + pmaddwd m0, [constsq] >>> + pmaddwd m1, [constsq+8] >>> + paddd m0, [scale_mask] >>> + paddd m1, [scale_mask] >>> + >>> + movq m2, [inq+16] >>> + movq m3, [inq+24] >>> + pmaddwd m2, [constsq+16] >>> + pmaddwd m3, [constsq+24] >>> + paddd m0, m2 >>> + paddd m1, m3 >>> + >>> + movq m2, [inq+32] >>> + movq m3, [inq+40] >>> + pmaddwd m2, [constsq+32] >>> + pmaddwd m3, [constsq+40] >>> + paddd m0, m2 >>> + paddd m1, m3 >>> + >>> + movq m2, [inq+48] >>> + movq m3, [inq+56] >>> + pmaddwd m2, [constsq+48] >>> + pmaddwd m3, [constsq+56] >>> + paddd m0, m2 >>> + paddd m1, m3 >>> + >>> + movq m2, [inq+64] >>> + movq m3, [inq+72] >>> + pmaddwd m2, [constsq+64] >>> + pmaddwd m3, [constsq+72] >>> + paddd m0, m2 >>> + paddd m1, m3 >>> + >>> + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE >>> + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE >>> + packssdw m0, m0 >>> + packssdw m1, m1 >>> + >>> + movq m2, m0 >>> + pmaddwd m0, [constsq+80] >>> + pmaddwd m2, [constsq+88] >>> + >>> + movq m3, m1 >>> + pmaddwd m1, [constsq+96] >>> + pmaddwd m3, [constsq+104] >>> + paddd m0, m1 >>> + paddd m2, m3 >>> + >>> + movq [outq ], m0 >>> + movq [outq+8], m2 >>> + >>> + RET 
>>> + >>> + >>> + >>> +;******************************************************************* >>> +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts); >>> +;******************************************************************* >>> +INIT_MMX mmx >>> +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts >>> + movq m0, [inq] >>> + movq m1, [inq+8] >>> + movq m2, [inq+16] >>> + movq m3, [inq+24] >>> + pmaddwd m0, [constsq] >>> + pmaddwd m1, [constsq+8] >>> + pmaddwd m2, [constsq+16] >>> + pmaddwd m3, [constsq+24] >>> + paddd m0, [scale_mask] >>> + paddd m1, [scale_mask] >>> + paddd m2, [scale_mask] >>> + paddd m3, [scale_mask] >>> + >>> + movq m4, [inq+32] >>> + movq m5, [inq+40] >>> + movq m6, [inq+48] >>> + movq m7, [inq+56] >>> + pmaddwd m4, [constsq+32] >>> + pmaddwd m5, [constsq+40] >>> + pmaddwd m6, [constsq+48] >>> + pmaddwd m7, [constsq+56] >>> + paddd m0, m4 >>> + paddd m1, m5 >>> + paddd m2, m6 >>> + paddd m3, m7 >>> + >>> + movq m4, [inq+64] >>> + movq m5, [inq+72] >>> + movq m6, [inq+80] >>> + movq m7, [inq+88] >>> + pmaddwd m4, [constsq+64] >>> + pmaddwd m5, [constsq+72] >>> + pmaddwd m6, [constsq+80] >>> + pmaddwd m7, [constsq+88] >>> + paddd m0, m4 >>> + paddd m1, m5 >>> + paddd m2, m6 >>> + paddd m3, m7 >>> + >>> + movq m4, [inq+96] >>> + movq m5, [inq+104] >>> + movq m6, [inq+112] >>> + movq m7, [inq+120] >>> + pmaddwd m4, [constsq+96] >>> + pmaddwd m5, [constsq+104] >>> + pmaddwd m6, [constsq+112] >>> + pmaddwd m7, [constsq+120] >>> + paddd m0, m4 >>> + paddd m1, m5 >>> + paddd m2, m6 >>> + paddd m3, m7 >>> + >>> + movq m4, [inq+128] >>> + movq m5, [inq+136] >>> + movq m6, [inq+144] >>> + movq m7, [inq+152] >>> + pmaddwd m4, [constsq+128] >>> + pmaddwd m5, [constsq+136] >>> + pmaddwd m6, [constsq+144] >>> + pmaddwd m7, [constsq+152] >>> + paddd m0, m4 >>> + paddd m1, m5 >>> + paddd m2, m6 >>> + paddd m3, m7 >>> + >>> + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE >>> + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE >>> + psrad m2, 16 ; 
SBC_PROTO_FIXED_SCALE >>> + psrad m3, 16 ; SBC_PROTO_FIXED_SCALE >>> + >>> + packssdw m0, m0 >>> + packssdw m1, m1 >>> + packssdw m2, m2 >>> + packssdw m3, m3 >>> + >>> + movq m4, m0 >>> + movq m5, m0 >>> + pmaddwd m4, [constsq+160] >>> + pmaddwd m5, [constsq+168] >>> + >>> + movq m6, m1 >>> + movq m7, m1 >>> + pmaddwd m6, [constsq+192] >>> + pmaddwd m7, [constsq+200] >>> + paddd m4, m6 >>> + paddd m5, m7 >>> + >>> + movq m6, m2 >>> + movq m7, m2 >>> + pmaddwd m6, [constsq+224] >>> + pmaddwd m7, [constsq+232] >>> + paddd m4, m6 >>> + paddd m5, m7 >>> + >>> + movq m6, m3 >>> + movq m7, m3 >>> + pmaddwd m6, [constsq+256] >>> + pmaddwd m7, [constsq+264] >>> + paddd m4, m6 >>> + paddd m5, m7 >>> + >>> + movq [outq ], m4 >>> + movq [outq+8], m5 >>> + >>> + movq m5, m0 >>> + pmaddwd m0, [constsq+176] >>> + pmaddwd m5, [constsq+184] >>> + >>> + movq m7, m1 >>> + pmaddwd m1, [constsq+208] >>> + pmaddwd m7, [constsq+216] >>> + paddd m0, m1 >>> + paddd m5, m7 >>> + >>> + movq m7, m2 >>> + pmaddwd m2, [constsq+240] >>> + pmaddwd m7, [constsq+248] >>> + paddd m0, m2 >>> + paddd m5, m7 >>> + >>> + movq m7, m3 >>> + pmaddwd m3, [constsq+272] >>> + pmaddwd m7, [constsq+280] >>> + paddd m0, m3 >>> + paddd m5, m7 >>> + >>> + movq [outq+16], m0 >>> + movq [outq+24], m5 >>> + >>> + RET >>> + >>> + >>> +;******************************************************************* >>> +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], >>> +; uint32_t scale_factor[2][8], >>> +; int blocks, int channels, int subbands) >>> +;******************************************************************* >>> +INIT_MMX mmx >>> +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk >>> + ; subbands = 4 * subbands * channels >>> + shl subbandsq, 2 >>> + cmp channelsq, 2 >>> + jl .loop_1 >>> + shl subbandsq, 1 >>> + >>> +.loop_1: >>> + sub subbandsq, 8 >>> + lea ptrq, [sb_sample_fq + subbandsq] >>> + >>> + ; blk = (blocks - 1) * 64; >>> + lea blkq, 
[blocksq - 1] >>> + shl blkq, 6 >>> + >>> + movq m0, [scale_mask] >>> +.loop_2: >>> + movq m1, [ptrq+blkq] >>> + pxor m2, m2 >>> + pcmpgtd m1, m2 >>> + paddd m1, [ptrq+blkq] >>> + pcmpgtd m2, m1 >>> + pxor m1, m2 >>> + >>> + por m0, m1 >>> + >>> + sub blkq, 64 >>> + jns .loop_2 >>> + >>> + movd blkd, m0 >>> + psrlq m0, 32 >>> + bsr blkd, blkd >>> + sub blkd, 15 ; SCALE_OUT_BITS >>> + mov [scale_factorq + subbandsq], blkd >>> + >>> + movd blkd, m0 >>> + bsr blkd, blkd >>> + sub blkd, 15 ; SCALE_OUT_BITS >>> + mov [scale_factorq + subbandsq + 4], blkd >>> + >>> + cmp subbandsq, 0 >>> + jg .loop_1 >>> + >>> + emms >>> + RET >> >> These should be done in SSE2. There's no reason for them to be MMX. > > There is at least one reason for it to be MMX. It is existing legacy > code that is ported to ffmpeg. So the MMX code is here and working. The code had to be adapted to x86inc syntax, so it's not a copy-paste done without effort. Said effort could have also been used to turn them into SSE2, as it's trivial at least for the first two. I'm not discrediting your work in porting these, I'm saying that said work should have gone into making them up to date as well. I'm also not blocking this patch if you decide to not make them SSE2. I'll point out a few things about the existing asm in a separate email.
On 12/17/2017 6:47 PM, Aurelien Jacobs wrote: > This was originally based on libsbc, and was fully integrated into ffmpeg. > --- > libavcodec/sbcdsp.c | 3 + > libavcodec/sbcdsp.h | 2 + > libavcodec/x86/Makefile | 2 + > libavcodec/x86/sbcdsp.asm | 284 +++++++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/sbcdsp_init.c | 51 ++++++++ > 5 files changed, 342 insertions(+) > create mode 100644 libavcodec/x86/sbcdsp.asm > create mode 100644 libavcodec/x86/sbcdsp_init.c > > diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c > index 16faf5ba9b..9bb60cdd5e 100644 > --- a/libavcodec/sbcdsp.c > +++ b/libavcodec/sbcdsp.c > @@ -387,4 +387,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s) > /* Default implementation for scale factors calculation */ > s->sbc_calc_scalefactors = sbc_calc_scalefactors; > s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j; > + > + if (ARCH_X86) > + ff_sbcdsp_init_x86(s); > } > diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h > index 66ed7d324e..127e6a8a11 100644 > --- a/libavcodec/sbcdsp.h > +++ b/libavcodec/sbcdsp.h > @@ -80,4 +80,6 @@ struct sbc_dsp_context { > */ > void ff_sbcdsp_init(SBCDSPContext *s); > > +void ff_sbcdsp_init_x86(SBCDSPContext *s); > + > #endif /* AVCODEC_SBCDSP_H */ > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index a805cd37b4..2350c8bbee 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o > OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o > OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o > OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o > +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o > OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o > OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o > OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o > @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o > X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o > 
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o > X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o > +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o > X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o > X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o > X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o > diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm > new file mode 100644 > index 0000000000..00b48a821b > --- /dev/null > +++ b/libavcodec/x86/sbcdsp.asm > @@ -0,0 +1,284 @@ > +;****************************************************************************** > +;* SIMD optimized SBC encoder DSP functions > +;* > +;* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> > +;* Copyright (C) 2008-2010 Nokia Corporation > +;* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> > +;* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> > +;* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. 
> +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) > + > +SECTION .text > + > +;******************************************************************* > +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts); > +;******************************************************************* > +INIT_MMX mmx > +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts > + movq m0, [inq] > + movq m1, [inq+8] > + pmaddwd m0, [constsq] > + pmaddwd m1, [constsq+8] > + paddd m0, [scale_mask] > + paddd m1, [scale_mask] > + > + movq m2, [inq+16] > + movq m3, [inq+24] > + pmaddwd m2, [constsq+16] > + pmaddwd m3, [constsq+24] > + paddd m0, m2 > + paddd m1, m3 > + > + movq m2, [inq+32] > + movq m3, [inq+40] > + pmaddwd m2, [constsq+32] > + pmaddwd m3, [constsq+40] > + paddd m0, m2 > + paddd m1, m3 > + > + movq m2, [inq+48] > + movq m3, [inq+56] > + pmaddwd m2, [constsq+48] > + pmaddwd m3, [constsq+56] > + paddd m0, m2 > + paddd m1, m3 > + > + movq m2, [inq+64] > + movq m3, [inq+72] > + pmaddwd m2, [constsq+64] > + pmaddwd m3, [constsq+72] > + paddd m0, m2 > + paddd m1, m3 > + > + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE > + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE > + packssdw m0, m0 > + packssdw m1, m1 > + > + movq m2, m0 > + pmaddwd m0, [constsq+80] > + pmaddwd m2, [constsq+88] > + > + movq m3, m1 > + pmaddwd m1, [constsq+96] > + pmaddwd m3, [constsq+104] > + paddd m0, m1 > + paddd m2, m3 > + > + movq [outq ], m0 > + movq [outq+8], m2 > + > + RET > + > + > + > +;******************************************************************* > +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, 
const int16_t *consts); > +;******************************************************************* > +INIT_MMX mmx > +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts > + movq m0, [inq] > + movq m1, [inq+8] > + movq m2, [inq+16] > + movq m3, [inq+24] > + pmaddwd m0, [constsq] > + pmaddwd m1, [constsq+8] > + pmaddwd m2, [constsq+16] > + pmaddwd m3, [constsq+24] > + paddd m0, [scale_mask] > + paddd m1, [scale_mask] > + paddd m2, [scale_mask] > + paddd m3, [scale_mask] > + > + movq m4, [inq+32] > + movq m5, [inq+40] > + movq m6, [inq+48] > + movq m7, [inq+56] > + pmaddwd m4, [constsq+32] > + pmaddwd m5, [constsq+40] > + pmaddwd m6, [constsq+48] > + pmaddwd m7, [constsq+56] > + paddd m0, m4 > + paddd m1, m5 > + paddd m2, m6 > + paddd m3, m7 > + > + movq m4, [inq+64] > + movq m5, [inq+72] > + movq m6, [inq+80] > + movq m7, [inq+88] > + pmaddwd m4, [constsq+64] > + pmaddwd m5, [constsq+72] > + pmaddwd m6, [constsq+80] > + pmaddwd m7, [constsq+88] > + paddd m0, m4 > + paddd m1, m5 > + paddd m2, m6 > + paddd m3, m7 > + > + movq m4, [inq+96] > + movq m5, [inq+104] > + movq m6, [inq+112] > + movq m7, [inq+120] > + pmaddwd m4, [constsq+96] > + pmaddwd m5, [constsq+104] > + pmaddwd m6, [constsq+112] > + pmaddwd m7, [constsq+120] > + paddd m0, m4 > + paddd m1, m5 > + paddd m2, m6 > + paddd m3, m7 > + > + movq m4, [inq+128] > + movq m5, [inq+136] > + movq m6, [inq+144] > + movq m7, [inq+152] > + pmaddwd m4, [constsq+128] > + pmaddwd m5, [constsq+136] > + pmaddwd m6, [constsq+144] > + pmaddwd m7, [constsq+152] > + paddd m0, m4 > + paddd m1, m5 > + paddd m2, m6 > + paddd m3, m7 > + > + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE > + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE > + psrad m2, 16 ; SBC_PROTO_FIXED_SCALE > + psrad m3, 16 ; SBC_PROTO_FIXED_SCALE > + > + packssdw m0, m0 > + packssdw m1, m1 > + packssdw m2, m2 > + packssdw m3, m3 > + > + movq m4, m0 > + movq m5, m0 > + pmaddwd m4, [constsq+160] > + pmaddwd m5, [constsq+168] > + > + movq m6, m1 > + movq m7, m1 > + pmaddwd m6, [constsq+192] 
> + pmaddwd m7, [constsq+200] > + paddd m4, m6 > + paddd m5, m7 > + > + movq m6, m2 > + movq m7, m2 > + pmaddwd m6, [constsq+224] > + pmaddwd m7, [constsq+232] > + paddd m4, m6 > + paddd m5, m7 > + > + movq m6, m3 > + movq m7, m3 > + pmaddwd m6, [constsq+256] > + pmaddwd m7, [constsq+264] > + paddd m4, m6 > + paddd m5, m7 > + > + movq [outq ], m4 > + movq [outq+8], m5 > + > + movq m5, m0 > + pmaddwd m0, [constsq+176] > + pmaddwd m5, [constsq+184] > + > + movq m7, m1 > + pmaddwd m1, [constsq+208] > + pmaddwd m7, [constsq+216] > + paddd m0, m1 > + paddd m5, m7 > + > + movq m7, m2 > + pmaddwd m2, [constsq+240] > + pmaddwd m7, [constsq+248] > + paddd m0, m2 > + paddd m5, m7 > + > + movq m7, m3 > + pmaddwd m3, [constsq+272] > + pmaddwd m7, [constsq+280] > + paddd m0, m3 > + paddd m5, m7 > + > + movq [outq+16], m0 > + movq [outq+24], m5 > + > + RET > + > + > +;******************************************************************* > +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], > +; uint32_t scale_factor[2][8], > +; int blocks, int channels, int subbands) > +;******************************************************************* > +INIT_MMX mmx > +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk On x86_64 (Windows at least), the high 32 bits of registers storing int arguments may contain garbage, so you need to work around it. For blocks, add a "movsxdifnidn blocksq, blocksd" instruction line at the beginning of the function. For the other two see below. > + ; subbands = 4 * subbands * channels > + shl subbandsq, 2 shl subbandsd, 2 This implicitly zeroes the high bits. > + cmp channelsq, 2 cmp channelsd, 2 If any of the two above can have negative values, then you'll have to also use movsxdifnidn with them and keep the q suffix. 
> + jl .loop_1 > + shl subbandsq, 1 > + > +.loop_1: > + sub subbandsq, 8 > + lea ptrq, [sb_sample_fq + subbandsq] > + > + ; blk = (blocks - 1) * 64; > + lea blkq, [blocksq - 1] > + shl blkq, 6 > + > + movq m0, [scale_mask] You could keep the scale_mask constant in a separate register, to avoid having to load it repeatedly inside the loop. > +.loop_2: > + movq m1, [ptrq+blkq] > + pxor m2, m2 > + pcmpgtd m1, m2 > + paddd m1, [ptrq+blkq] Similarly, you could load this once. Although not sure it will make much difference, to be honest. > + pcmpgtd m2, m1 > + pxor m1, m2 > + > + por m0, m1 > + > + sub blkq, 64 > + jns .loop_2 > + > + movd blkd, m0 > + psrlq m0, 32 > + bsr blkd, blkd > + sub blkd, 15 ; SCALE_OUT_BITS > + mov [scale_factorq + subbandsq], blkd > + > + movd blkd, m0 > + bsr blkd, blkd > + sub blkd, 15 ; SCALE_OUT_BITS > + mov [scale_factorq + subbandsq + 4], blkd > + > + cmp subbandsq, 0 > + jg .loop_1 > + > + emms > + RET > diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c > new file mode 100644 > index 0000000000..86effecfdf > --- /dev/null > +++ b/libavcodec/x86/sbcdsp_init.c > @@ -0,0 +1,51 @@ > +/* > + * Bluetooth low-complexity, subband codec (SBC) > + * > + * Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> > + * Copyright (C) 2008-2010 Nokia Corporation > + * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> > + * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> > + * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. 
> + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +/** > + * @file > + * SBC MMX optimization for some basic "building bricks" > + */ > + > +#include "libavutil/cpu.h" > +#include "libavutil/x86/cpu.h" > +#include "libavcodec/sbcdsp.h" > + > +void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts); > +void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts); > +void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8], > + uint32_t scale_factor[2][8], > + int blocks, int channels, int subbands); > + > +av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (EXTERNAL_MMX(cpu_flags)) { > + s->sbc_analyze_4 = ff_sbc_analyze_4_mmx; > + s->sbc_analyze_8 = ff_sbc_analyze_8_mmx; > + s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx; > + } > +} >
Hi, On Wed, Dec 20, 2017 at 4:58 PM, James Almer <jamrial@gmail.com> wrote: > On 12/17/2017 6:47 PM, Aurelien Jacobs wrote: > > +;******************************************************************* > > +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], > > +; uint32_t scale_factor[2][8], > > +; int blocks, int channels, int subbands) > > +;******************************************************************* > > +INIT_MMX mmx > > +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, > blocks, channels, subbands, ptr, blk > > On x86_64 (Windows at least), the high 32 bits of registers storing int > arguments may contain garbage, so you need to work around it. I think that's only for stack arguments, i.e. only subbands (not blocks or channels). Ronald
diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c index 16faf5ba9b..9bb60cdd5e 100644 --- a/libavcodec/sbcdsp.c +++ b/libavcodec/sbcdsp.c @@ -387,4 +387,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s) /* Default implementation for scale factors calculation */ s->sbc_calc_scalefactors = sbc_calc_scalefactors; s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j; + + if (ARCH_X86) + ff_sbcdsp_init_x86(s); } diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h index 66ed7d324e..127e6a8a11 100644 --- a/libavcodec/sbcdsp.h +++ b/libavcodec/sbcdsp.h @@ -80,4 +80,6 @@ struct sbc_dsp_context { */ void ff_sbcdsp_init(SBCDSPContext *s); +void ff_sbcdsp_init_x86(SBCDSPContext *s); + #endif /* AVCODEC_SBCDSP_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a805cd37b4..2350c8bbee 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm new file mode 100644 index 0000000000..00b48a821b --- /dev/null +++ b/libavcodec/x86/sbcdsp.asm @@ -0,0 +1,284 @@ 
+;****************************************************************************** +;* SIMD optimized SBC encoder DSP functions +;* +;* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> +;* Copyright (C) 2008-2010 Nokia Corporation +;* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> +;* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> +;* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) + +SECTION .text + +;******************************************************************* +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts); +;******************************************************************* +INIT_MMX mmx +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts + movq m0, [inq] + movq m1, [inq+8] + pmaddwd m0, [constsq] + pmaddwd m1, [constsq+8] + paddd m0, [scale_mask] + paddd m1, [scale_mask] + + movq m2, [inq+16] + movq m3, [inq+24] + pmaddwd m2, [constsq+16] + pmaddwd m3, [constsq+24] + paddd m0, m2 + paddd m1, m3 + + movq m2, [inq+32] + 
movq m3, [inq+40] + pmaddwd m2, [constsq+32] + pmaddwd m3, [constsq+40] + paddd m0, m2 + paddd m1, m3 + + movq m2, [inq+48] + movq m3, [inq+56] + pmaddwd m2, [constsq+48] + pmaddwd m3, [constsq+56] + paddd m0, m2 + paddd m1, m3 + + movq m2, [inq+64] + movq m3, [inq+72] + pmaddwd m2, [constsq+64] + pmaddwd m3, [constsq+72] + paddd m0, m2 + paddd m1, m3 + + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE + packssdw m0, m0 + packssdw m1, m1 + + movq m2, m0 + pmaddwd m0, [constsq+80] + pmaddwd m2, [constsq+88] + + movq m3, m1 + pmaddwd m1, [constsq+96] + pmaddwd m3, [constsq+104] + paddd m0, m1 + paddd m2, m3 + + movq [outq ], m0 + movq [outq+8], m2 + + RET + + + +;******************************************************************* +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts); +;******************************************************************* +INIT_MMX mmx +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts + movq m0, [inq] + movq m1, [inq+8] + movq m2, [inq+16] + movq m3, [inq+24] + pmaddwd m0, [constsq] + pmaddwd m1, [constsq+8] + pmaddwd m2, [constsq+16] + pmaddwd m3, [constsq+24] + paddd m0, [scale_mask] + paddd m1, [scale_mask] + paddd m2, [scale_mask] + paddd m3, [scale_mask] + + movq m4, [inq+32] + movq m5, [inq+40] + movq m6, [inq+48] + movq m7, [inq+56] + pmaddwd m4, [constsq+32] + pmaddwd m5, [constsq+40] + pmaddwd m6, [constsq+48] + pmaddwd m7, [constsq+56] + paddd m0, m4 + paddd m1, m5 + paddd m2, m6 + paddd m3, m7 + + movq m4, [inq+64] + movq m5, [inq+72] + movq m6, [inq+80] + movq m7, [inq+88] + pmaddwd m4, [constsq+64] + pmaddwd m5, [constsq+72] + pmaddwd m6, [constsq+80] + pmaddwd m7, [constsq+88] + paddd m0, m4 + paddd m1, m5 + paddd m2, m6 + paddd m3, m7 + + movq m4, [inq+96] + movq m5, [inq+104] + movq m6, [inq+112] + movq m7, [inq+120] + pmaddwd m4, [constsq+96] + pmaddwd m5, [constsq+104] + pmaddwd m6, [constsq+112] + pmaddwd m7, [constsq+120] + paddd m0, m4 + paddd m1, m5 + paddd m2, 
m6 + paddd m3, m7 + + movq m4, [inq+128] + movq m5, [inq+136] + movq m6, [inq+144] + movq m7, [inq+152] + pmaddwd m4, [constsq+128] + pmaddwd m5, [constsq+136] + pmaddwd m6, [constsq+144] + pmaddwd m7, [constsq+152] + paddd m0, m4 + paddd m1, m5 + paddd m2, m6 + paddd m3, m7 + + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE + psrad m2, 16 ; SBC_PROTO_FIXED_SCALE + psrad m3, 16 ; SBC_PROTO_FIXED_SCALE + + packssdw m0, m0 + packssdw m1, m1 + packssdw m2, m2 + packssdw m3, m3 + + movq m4, m0 + movq m5, m0 + pmaddwd m4, [constsq+160] + pmaddwd m5, [constsq+168] + + movq m6, m1 + movq m7, m1 + pmaddwd m6, [constsq+192] + pmaddwd m7, [constsq+200] + paddd m4, m6 + paddd m5, m7 + + movq m6, m2 + movq m7, m2 + pmaddwd m6, [constsq+224] + pmaddwd m7, [constsq+232] + paddd m4, m6 + paddd m5, m7 + + movq m6, m3 + movq m7, m3 + pmaddwd m6, [constsq+256] + pmaddwd m7, [constsq+264] + paddd m4, m6 + paddd m5, m7 + + movq [outq ], m4 + movq [outq+8], m5 + + movq m5, m0 + pmaddwd m0, [constsq+176] + pmaddwd m5, [constsq+184] + + movq m7, m1 + pmaddwd m1, [constsq+208] + pmaddwd m7, [constsq+216] + paddd m0, m1 + paddd m5, m7 + + movq m7, m2 + pmaddwd m2, [constsq+240] + pmaddwd m7, [constsq+248] + paddd m0, m2 + paddd m5, m7 + + movq m7, m3 + pmaddwd m3, [constsq+272] + pmaddwd m7, [constsq+280] + paddd m0, m3 + paddd m5, m7 + + movq [outq+16], m0 + movq [outq+24], m5 + + RET + + +;******************************************************************* +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], +; uint32_t scale_factor[2][8], +; int blocks, int channels, int subbands) +;******************************************************************* +INIT_MMX mmx +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk + ; subbands = 4 * subbands * channels + shl subbandsq, 2 + cmp channelsq, 2 + jl .loop_1 + shl subbandsq, 1 + +.loop_1: + sub subbandsq, 8 + lea ptrq, [sb_sample_fq + subbandsq] + + ; 
blk = (blocks - 1) * 64; + lea blkq, [blocksq - 1] + shl blkq, 6 + + movq m0, [scale_mask] +.loop_2: + movq m1, [ptrq+blkq] + pxor m2, m2 + pcmpgtd m1, m2 + paddd m1, [ptrq+blkq] + pcmpgtd m2, m1 + pxor m1, m2 + + por m0, m1 + + sub blkq, 64 + jns .loop_2 + + movd blkd, m0 + psrlq m0, 32 + bsr blkd, blkd + sub blkd, 15 ; SCALE_OUT_BITS + mov [scale_factorq + subbandsq], blkd + + movd blkd, m0 + bsr blkd, blkd + sub blkd, 15 ; SCALE_OUT_BITS + mov [scale_factorq + subbandsq + 4], blkd + + cmp subbandsq, 0 + jg .loop_1 + + emms + RET diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c new file mode 100644 index 0000000000..86effecfdf --- /dev/null +++ b/libavcodec/x86/sbcdsp_init.c @@ -0,0 +1,51 @@ +/* + * Bluetooth low-complexity, subband codec (SBC) + * + * Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> + * Copyright (C) 2008-2010 Nokia Corporation + * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> + * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> + * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * SBC MMX optimization for some basic "building bricks" + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/sbcdsp.h" + +void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8], + uint32_t scale_factor[2][8], + int blocks, int channels, int subbands); + +av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + s->sbc_analyze_4 = ff_sbc_analyze_4_mmx; + s->sbc_analyze_8 = ff_sbc_analyze_8_mmx; + s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx; + } +}