Message ID | CAE9qxYA58p_2YLswP6ZkQq2yo0m8P9EujBi_0pMw6tgoWCTnxg@mail.gmail.com |
---|---|
State | Superseded |
Headers | show |
On Sun, Oct 9, 2016 at 2:15 PM, Rostislav Pehlivanov
<atomnuker@gmail.com> wrote:
> +cglobal aac_quantize_bands, 6, 6, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding
Now that this function is SSE2, you should explicitly use
floating-point instructions to avoid bypass delays from transitioning
between the integer and floating-point domains.
For example, use movaps instead of mova, shufps instead of SPLATD, and
andps instead of pand.
On Sun, Oct 09, 2016 at 01:15:44PM +0100, Rostislav Pehlivanov wrote: > On 9 October 2016 at 03:18, Michael Niedermayer <michael@niedermayer.cc> > wrote: > > > On Sat, Oct 08, 2016 at 06:42:28PM +0100, Rostislav Pehlivanov wrote: > > > Performance improvements: > > > > > > quant_bands: > > > with: 681 decicycles in quant_bands, 8388453 runs, 155 skips > > > without: 1190 decicycles in quant_bands, 8388386 runs, 222 skips > > > Around 42% for the function > > > > > > Twoloop coder: > > > > > > abs_pow34: > > > with/without: 7.82s/8.17s > > > Around 4% for the entire encoder > > > > > > Both: > > > with/without: 7.15s/8.17s > > > Around 12% for the entire encoder > > > > > > Fast coder: > > > > > > abs_pow34: > > > with/without: 3.40s/3.77s > > > Around 10% for the entire encoder > > > > > > Both: > > > with/without: 3.02s/3.77s > > > Around 20% faster for the entire encoder > > > > > > Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com> > > > --- > > > libavcodec/aaccoder.c | 22 ++++---- > > > libavcodec/aaccoder_trellis.h | 2 +- > > > libavcodec/aaccoder_twoloop.h | 2 +- > > > libavcodec/aacenc.c | 4 ++ > > > libavcodec/aacenc.h | 6 +++ > > > libavcodec/aacenc_is.c | 6 +-- > > > libavcodec/aacenc_ltp.c | 4 +- > > > libavcodec/aacenc_pred.c | 6 +-- > > > libavcodec/aacenc_quantization.h | 4 +- > > > libavcodec/aacenc_utils.h | 4 +- > > > libavcodec/x86/Makefile | 2 + > > > libavcodec/x86/aacencdsp.asm | 108 ++++++++++++++++++++++++++++++ > > +++++++++ > > > libavcodec/x86/aacencdsp_init.c | 42 +++++++++++++++ > > > 13 files changed, 187 insertions(+), 25 deletions(-) > > > create mode 100644 libavcodec/x86/aacencdsp.asm > > > create mode 100644 libavcodec/x86/aacencdsp_init.c > > > > libavcodec/x86/aacencdsp.asm:67: error: expression syntax error > > libavcodec/x86/aacencdsp.asm:79: warning: (RUN_AVX_INSTR:22) use of > > ``movd'' sse2 instruction in sse function: ff_aac_quantize_bands_sse > > libavcodec/x86/aacencdsp.asm:99: warning: (RUN_AVX_INSTR:22) use of 
> > ``pand'' sse2 instruction in sse function: ff_aac_quantize_bands_sse > > libavcodec/x86/aacencdsp.asm:103: warning: (RUN_AVX_INSTR:20) use of > > ``cvttps2dq'' sse2 instruction in sse function: ff_aac_quantize_bands_sse > > > > yasm 1.2.0 > > > > [...] > > -- > > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > > > Breaking DRM is a little like attempting to break through a door even > > though the window is wide open and the only thing in the house is a bunch > > of things you dont want and which you would get tomorrow for free anyway > > > > _______________________________________________ > > ffmpeg-devel mailing list > > ffmpeg-devel@ffmpeg.org > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > > Yes, discussed on IRC, fixed that yesterday. Attached the patch if you want > to test. > I'll push it tonight unless confirmed to not work on Windows (haven't > tested but it should work according to what nevcairiel said). > aaccoder.c | 22 ++++++------ > aaccoder_trellis.h | 2 - > aaccoder_twoloop.h | 2 - > aacenc.c | 4 ++ > aacenc.h | 6 +++ > aacenc_is.c | 6 +-- > aacenc_ltp.c | 4 +- > aacenc_pred.c | 6 +-- > aacenc_quantization.h | 4 +- > aacenc_utils.h | 4 +- > x86/Makefile | 2 + > x86/aacencdsp.asm | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++ > x86/aacencdsp_init.c | 43 ++++++++++++++++++++++++ > 13 files changed, 167 insertions(+), 25 deletions(-) > 8f8dd9c1cca110b682dbb73cbae6643798336aec 0001-aacenc-add-SIMD-optimizations-for-abs_pow34-and-quan.patch > From 3bc5622e5be67698d099a191ebfd297bf1eda7cd Mon Sep 17 00:00:00 2001 > From: Rostislav Pehlivanov <atomnuker@gmail.com> > Date: Sat, 8 Oct 2016 15:59:14 +0100 > Subject: [PATCH] aacenc: add SIMD optimizations for abs_pow34 and quantization this segfaults on x86-32 make fate-gaplessenc-itunes-to-ipod-aac V=2 ... 
Stream mapping: Stream #0:0 -> #0:0 (aac (native) -> aac (native)) Segmentation fault make: *** [fate-gaplessenc-itunes-to-ipod-aac] Error 1 Program received signal SIGSEGV, Segmentation fault. ff_abs_pow34_sse () at src/libavcodec/x86/aacencdsp.asm:42 42 mova m0, [inq+sizeq] (gdb) bt Python Exception <type 'exceptions.ImportError'> No module named gdb.frames: #0 ff_abs_pow34_sse () at src/libavcodec/x86/aacencdsp.asm:42 #1 0x08bfe132 in search_for_ms (s=0xf7c02020, cpe=0xf12e9020) at src/libavcodec/aaccoder.c:794 #2 0x089fc754 in aac_encode_frame (avctx=0x98ac900, avpkt=0x98cc000, frame=0x98e55a0, got_packet_ptr=0xffffcedc) at src/libavcodec/aacenc.c:735 #3 0x0877182d in avcodec_encode_audio2 (avctx=0x98ac900, avpkt=0x98cc000, frame=<optimized out>, got_packet_ptr=0xffffcedc) at src/libavcodec/utils.c:1886 #4 0x0877226f in do_encode (avctx=0x98ac900, frame=0x98e55a0, got_packet=0xffffcedc) at src/libavcodec/utils.c:2939 #5 0x08774287 in avcodec_send_frame (avctx=0x98ac900, frame=0x98e55a0) at src/libavcodec/utils.c:2985 #6 0x080ebbbb in do_audio_out (frame=0x98e55a0, ost=0x98ac760, of=0x98a8500) at src/ffmpeg.c:888 #7 reap_filters (flush=0) at src/ffmpeg.c:1460 #8 0x080f1ee5 in transcode_step () at src/ffmpeg.c:4343 #9 transcode () at src/ffmpeg.c:4387 #10 0x080ce57e in main (argc=<optimized out>, argv=<optimized out>) at src/ffmpeg.c:4592 disassemble $pc-32,$pc+32 Dump of assembler code from 0x8c42bcc to 0x8c42c0c: 0x08c42bcc: nop 0x08c42bcd: nop 0x08c42bce: nop 0x08c42bcf: nop 0x08c42bd0 <ff_abs_pow34_sse+0>: mov 0x4(%esp),%eax 0x08c42bd4 <ff_abs_pow34_sse+4>: mov 0x8(%esp),%ecx 0x08c42bd8 <ff_abs_pow34_sse+8>: mov 0xc(%esp),%edx 0x08c42bdc <ff_abs_pow34_sse+12>: movaps 0x9031ef0,%xmm2 0x08c42be3 <ff_abs_pow34_sse+19>: shl $0x2,%edx 0x08c42be6 <ff_abs_pow34_sse+22>: add %edx,%ecx 0x08c42be8 <ff_abs_pow34_sse+24>: add %edx,%eax 0x08c42bea <ff_abs_pow34_sse+26>: neg %edx => 0x08c42bec <ff_abs_pow34_sse+28>: movaps (%ecx,%edx,1),%xmm0 0x08c42bf0 
<ff_abs_pow34_sse+32>: andps %xmm2,%xmm0 0x08c42bf3 <ff_abs_pow34_sse+35>: sqrtps %xmm0,%xmm1 0x08c42bf6 <ff_abs_pow34_sse+38>: mulps %xmm1,%xmm0 0x08c42bf9 <ff_abs_pow34_sse+41>: sqrtps %xmm0,%xmm0 0x08c42bfc <ff_abs_pow34_sse+44>: movaps %xmm0,(%eax,%edx,1) 0x08c42c00 <ff_abs_pow34_sse+48>: add $0x10,%edx 0x08c42c03 <ff_abs_pow34_sse+51>: jl 0x8c42bec <ff_abs_pow34_sse+28> 0x08c42c05 <ff_abs_pow34_sse+53>: repz ret 0x08c42c07 <ff_abs_pow34_sse.loop+27>: nopw 0x0(%eax,%eax,1) eax 0xf7c0bbb0 -138363984 ecx 0xffffc4dc -15140 edx 0xfffffff0 -16 ebx 0xffffc6cc -14644 esp 0xffffc36c 0xffffc36c ebp 0xf7c02020 0xf7c02020 esi 0xffffc4cc -15156 edi 0xf12fb320 -248532192 eip 0x8c42bec 0x8c42bec <ff_abs_pow34_sse+28> eflags 0x10287 [ CF PF SF IF RF ] cs 0x23 35 ss 0x2b 43 ds 0x2b 43 es 0x2b 43 fs 0x0 0 gs 0x63 99 st0 457.1953125 (raw 0x4007e499000000000000) st1 13.3905181884765625 (raw 0x4002d63f900000000000) st2 0 (raw 0x00000000000000000000) st3 0 (raw 0x00000000000000000000) st4 0 (raw 0x00000000000000000000) st5 0.5 (raw 0x3ffe8000000000000000) st6 0 (raw 0x00000000000000000000) st7 0 (raw 0x00000000000000000000) fctrl 0x37f 895 fstat 0x36 54 ftag 0xffff 65535 fiseg 0x0 0 fioff 0x8bfe115 146792725 foseg 0x0 0 fooff 0xffffc3e0 -15392 fop 0x0 0 mxcsr 0x1fa0 [ PE IM DM ZM OM UM PM ] ymm0 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0, 0x0, 0x0, 0x3b, 0x0, 0x0, 0x0, 0x3b, 0x0, 0x0, 0x0, 0x3b, 0x0, 0x0, 0x0, 0x3b, 0x0 <repeats 16 times>}, v16_int16 = {0x0, 0x3b00, 0x0, 0x3b00, 0x0, 0x3b00, 0x0, 0x3b00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x3b000000, 0x3b000000, 0x3b000000, 0x3b000000, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x3b0000003b000000, 0x3b0000003b000000, 0x0, 0x0}, v2_int128 = {0x3b0000003b0000003b0000003b000000, 0x00000000000000000000000000000000}} ymm1 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x97, 0x90, 0xcf, 0x3e, 0x97, 0x90, 0xcf, 0x3e, 
0x97, 0x90, 0xcf, 0x3e, 0x97, 0x90, 0xcf, 0x3e, 0x0 <repeats 16 times>}, v16_int16 = {0x9097, 0x3ecf, 0x9097, 0x3ecf, 0x9097, 0x3ecf, 0x9097, 0x3ecf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x3ecf9097, 0x3ecf9097, 0x3ecf9097, 0x3ecf9097, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x3ecf90973ecf9097, 0x3ecf90973ecf9097, 0x0, 0x0}, v2_int128 = {0x3ecf90973ecf90973ecf90973ecf9097, 0x00000000000000000000000000000000}} ymm2 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x8000000000000000, 0x8000000000000000, 0x0, 0x0}, v32_int8 = {0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0x0 <repeats 16 times>}, v16_int16 = {0xffff, 0x7fff, 0xffff, 0x7fff, 0xffff, 0x7fff, 0xffff, 0x7fff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x7fffffff7fffffff, 0x7fffffff7fffffff, 0x0, 0x0}, v2_int128 = {0x7fffffff7fffffff7fffffff7fffffff, 0x00000000000000000000000000000000}} ymm3 {v8_float = {0xc, 0xc, 0xc, 0xc, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x200000, 0x200000, 0x0, 0x0}, v32_int8 = {0x0, 0x0, 0x40, 0x41, 0x0, 0x0, 0x40, 0x41, 0x0, 0x0, 0x40, 0x41, 0x0, 0x0, 0x40, 0x41, 0x0 <repeats 16 times>}, v16_int16 = {0x0, 0x4140, 0x0, 0x4140, 0x0, 0x4140, 0x0, 0x4140, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x41400000, 0x41400000, 0x41400000, 0x41400000, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x4140000041400000, 0x4140000041400000, 0x0, 0x0}, v2_int128 = {0x41400000414000004140000041400000, 0x00000000000000000000000000000000}} ymm4 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = { 0x00000000000000000000000000000000, 0x00000000000000000000000000000000}} ymm5 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_double 
= {0x0, 0x0, 0x0, 0x0}, v32_int8 = {0x0 <repeats 32 times>}, v16_int16 = {0x0 <repeats 16 times>}, v8_int32 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x0, 0x0, 0x0, 0x0}, v2_int128 = { 0x00000000000000000000000000000000, 0x00000000000000000000000000000000}} ymm6 {v8_float = {0x15, 0xffffffed, 0x12, 0xfffffff0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0xfffffffff9b86bf0, 0xfffffffffdb50378, 0x0, 0x0}, v32_int8 = {0x3e, 0xcb, 0xae, 0x41, 0x50, 0x1e, 0x99, 0xc1, 0x1c, 0x5b, 0x93, 0x41, 0xe4, 0x57, 0x82, 0xc1, 0x0 <repeats 16 times>}, v16_int16 = {0xcb3e, 0x41ae, 0x1e50, 0xc199, 0x5b1c, 0x4193, 0x57e4, 0xc182, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0x41aecb3e, 0xc1991e50, 0x41935b1c, 0xc18257e4, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0xc1991e5041aecb3e, 0xc18257e441935b1c, 0x0, 0x0}, v2_int128 = {0xc18257e441935b1cc1991e5041aecb3e, 0x00000000000000000000000000000000}} ymm7 {v8_float = {0xffffff72, 0x12, 0x8b, 0xfffffff0, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x4d6c730, 0xfffffffffdb50378, 0x0, 0x0}, v32_int8 = {0x43, 0x1b, 0xe, 0xc3, 0x1c, 0x5b, 0x93, 0x41, 0x94, 0x18, 0xb, 0x43, 0xe4, 0x57, 0x82, 0xc1, 0x0 <repeats 16 times>}, v16_int16 = {0x1b43, 0xc30e, 0x5b1c, 0x4193, 0x1894, 0x430b, 0x57e4, 0xc182, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, v8_int32 = {0xc30e1b43, 0x41935b1c, 0x430b1894, 0xc18257e4, 0x0, 0x0, 0x0, 0x0}, v4_int64 = {0x41935b1cc30e1b43, 0xc18257e4430b1894, 0x0, 0x0}, v2_int128 = {0xc18257e4430b189441935b1cc30e1b43, 0x00000000000000000000000000000000}} mm0 {uint64 = 0xe499000000000000, v2_int32 = {0x0, 0xe4990000}, v4_int16 = {0x0, 0x0, 0x0, 0xe499}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x99, 0xe4}} mm1 {uint64 = 0xd63f900000000000, v2_int32 = {0x0, 0xd63f9000}, v4_int16 = {0x0, 0x0, 0x9000, 0xd63f}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x90, 0x3f, 0xd6}} mm2 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}} mm3 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 
0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}} mm4 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}} mm5 {uint64 = 0x8000000000000000, v2_int32 = {0x0, 0x80000000}, v4_int16 = {0x0, 0x0, 0x0, 0x8000}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x80}} mm6 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}} mm7 {uint64 = 0x0, v2_int32 = {0x0, 0x0}, v4_int16 = {0x0, 0x0, 0x0, 0x0}, v8_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}} [...]
On Sun, Oct 9, 2016 at 5:04 PM, Michael Niedermayer
<michael@niedermayer.cc> wrote:
> this segfaults on x86-32
I'm guessing this is due to the unaligned local arrays in search_for_ms() (the faulting movaps requires a 16-byte-aligned address):
float M[128], S[128];
From 3bc5622e5be67698d099a191ebfd297bf1eda7cd Mon Sep 17 00:00:00 2001 From: Rostislav Pehlivanov <atomnuker@gmail.com> Date: Sat, 8 Oct 2016 15:59:14 +0100 Subject: [PATCH] aacenc: add SIMD optimizations for abs_pow34 and quantization Performance improvements: quant_bands: with: 681 decicycles in quant_bands, 8388453 runs, 155 skips without: 1190 decicycles in quant_bands, 8388386 runs, 222 skips Around 42% for the function Twoloop coder: abs_pow34: with/without: 7.82s/8.17s Around 4% for the entire encoder Both: with/without: 7.15s/8.17s Around 12% for the entire encoder Fast coder: abs_pow34: with/without: 3.40s/3.77s Around 10% for the entire encoder Both: with/without: 3.02s/3.77s Around 20% faster for the entire encoder Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com> --- libavcodec/aaccoder.c | 22 +++++----- libavcodec/aaccoder_trellis.h | 2 +- libavcodec/aaccoder_twoloop.h | 2 +- libavcodec/aacenc.c | 4 ++ libavcodec/aacenc.h | 6 +++ libavcodec/aacenc_is.c | 6 +-- libavcodec/aacenc_ltp.c | 4 +- libavcodec/aacenc_pred.c | 6 +-- libavcodec/aacenc_quantization.h | 4 +- libavcodec/aacenc_utils.h | 4 +- libavcodec/x86/Makefile | 2 + libavcodec/x86/aacencdsp.asm | 87 ++++++++++++++++++++++++++++++++++++++++ libavcodec/x86/aacencdsp_init.c | 43 ++++++++++++++++++++ 13 files changed, 167 insertions(+), 25 deletions(-) create mode 100644 libavcodec/x86/aacencdsp.asm create mode 100644 libavcodec/x86/aacencdsp_init.c diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c index 35787e8..6760a70 100644 --- a/libavcodec/aaccoder.c +++ b/libavcodec/aaccoder.c @@ -88,7 +88,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce float next_minrd = INFINITY; int next_mincb = 0; - abs_pow34_v(s->scoefs, sce->coeffs, 1024); + s->abs_pow34(s->scoefs, sce->coeffs, 1024); start = win*128; for (cb = 0; cb < CB_TOT_ALL; cb++) { path[0][cb].cost = 0.0f; @@ -299,7 +299,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, 
AACEncContext *s, } } idx = 1; - abs_pow34_v(s->scoefs, sce->coeffs, 1024); + s->abs_pow34(s->scoefs, sce->coeffs, 1024); for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { start = w*128; for (g = 0; g < sce->ics.num_swb; g++) { @@ -446,7 +446,7 @@ static void search_for_quantizers_fast(AVCodecContext *avctx, AACEncContext *s, if (!allz) return; - abs_pow34_v(s->scoefs, sce->coeffs, 1024); + s->abs_pow34(s->scoefs, sce->coeffs, 1024); ff_quantize_band_cost_cache_init(s); for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { @@ -652,8 +652,8 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne s->fdsp->vector_fmul_scalar(PNS, PNS, scale, sce->ics.swb_sizes[g]); pns_senergy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]); pns_energy += pns_senergy; - abs_pow34_v(NOR34, &sce->coeffs[start_c], sce->ics.swb_sizes[g]); - abs_pow34_v(PNS34, PNS, sce->ics.swb_sizes[g]); + s->abs_pow34(NOR34, &sce->coeffs[start_c], sce->ics.swb_sizes[g]); + s->abs_pow34(PNS34, PNS, sce->ics.swb_sizes[g]); dist1 += quantize_band_cost(s, &sce->coeffs[start_c], NOR34, sce->ics.swb_sizes[g], @@ -789,8 +789,8 @@ static void search_for_ms(AACEncContext *s, ChannelElement *cpe) S[i] = M[i] - sce1->coeffs[start+(w+w2)*128+i]; } - abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]); - abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]); + s->abs_pow34(M34, M, sce0->ics.swb_sizes[g]); + s->abs_pow34(S34, S, sce0->ics.swb_sizes[g]); for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) { Mmax = FFMAX(Mmax, M34[i]); Smax = FFMAX(Smax, S34[i]); @@ -833,10 +833,10 @@ static void search_for_ms(AACEncContext *s, ChannelElement *cpe) - sce1->coeffs[start+(w+w2)*128+i]; } - abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]); - abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]); - abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]); - abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]); + s->abs_pow34(L34, 
sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]); + s->abs_pow34(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]); + s->abs_pow34(M34, M, sce0->ics.swb_sizes[g]); + s->abs_pow34(S34, S, sce0->ics.swb_sizes[g]); dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128], L34, sce0->ics.swb_sizes[g], diff --git a/libavcodec/aaccoder_trellis.h b/libavcodec/aaccoder_trellis.h index 0230052..940ebf0 100644 --- a/libavcodec/aaccoder_trellis.h +++ b/libavcodec/aaccoder_trellis.h @@ -70,7 +70,7 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce, float next_minbits = INFINITY; int next_mincb = 0; - abs_pow34_v(s->scoefs, sce->coeffs, 1024); + s->abs_pow34(s->scoefs, sce->coeffs, 1024); start = win*128; for (cb = 0; cb < CB_TOT_ALL; cb++) { path[0][cb].cost = run_bits+4; diff --git a/libavcodec/aaccoder_twoloop.h b/libavcodec/aaccoder_twoloop.h index 42aea52..fb9849e 100644 --- a/libavcodec/aaccoder_twoloop.h +++ b/libavcodec/aaccoder_twoloop.h @@ -291,7 +291,7 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx, if (!allz) return; - abs_pow34_v(s->scoefs, sce->coeffs, 1024); + s->abs_pow34(s->scoefs, sce->coeffs, 1024); ff_quantize_band_cost_cache_init(s); for (i = 0; i < sizeof(minsf) / sizeof(minsf[0]); ++i) diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c index ee3cbf8..f30691a 100644 --- a/libavcodec/aacenc.c +++ b/libavcodec/aacenc.c @@ -1033,6 +1033,10 @@ static av_cold int aac_encode_init(AVCodecContext *avctx) ff_lpc_init(&s->lpc, 2*avctx->frame_size, TNS_MAX_ORDER, FF_LPC_TYPE_LEVINSON); s->random_state = 0x1f2e3d4c; + s->abs_pow34 = &abs_pow34_v; + s->quant_bands = &quantize_bands; + ff_aac_dsp_init_x86(s); + if (HAVE_MIPSDSP) ff_aac_coder_init_mips(s); diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h index 1ace00d..8682867 100644 --- a/libavcodec/aacenc.h +++ b/libavcodec/aacenc.h @@ -127,11 +127,17 @@ typedef struct AACEncContext { uint16_t quantize_band_cost_cache_generation; 
AACQuantizeBandCostCacheEntry quantize_band_cost_cache[256][128]; ///< memoization area for quantize_band_cost + void (*abs_pow34)(float *out, const float *in, const int64_t size); + void (*quant_bands)(int *out, const float *in, const float *scaled, + int size, int is_signed, int maxval, const float Q34, + const float rounding); + struct { float *samples; } buffer; } AACEncContext; +void ff_aac_dsp_init_x86(AACEncContext *s); void ff_aac_coder_init_mips(AACEncContext *c); void ff_quantize_band_cost_cache_init(struct AACEncContext *s); diff --git a/libavcodec/aacenc_is.c b/libavcodec/aacenc_is.c index 473897b..2f5b7eb 100644 --- a/libavcodec/aacenc_is.c +++ b/libavcodec/aacenc_is.c @@ -59,9 +59,9 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe, float minthr = FFMIN(band0->threshold, band1->threshold); for (i = 0; i < sce0->ics.swb_sizes[g]; i++) IS[i] = (L[start+(w+w2)*128+i] + phase*R[start+(w+w2)*128+i])*sqrt(ener0/ener01); - abs_pow34_v(L34, &L[start+(w+w2)*128], sce0->ics.swb_sizes[g]); - abs_pow34_v(R34, &R[start+(w+w2)*128], sce0->ics.swb_sizes[g]); - abs_pow34_v(I34, IS, sce0->ics.swb_sizes[g]); + s->abs_pow34(L34, &L[start+(w+w2)*128], sce0->ics.swb_sizes[g]); + s->abs_pow34(R34, &R[start+(w+w2)*128], sce0->ics.swb_sizes[g]); + s->abs_pow34(I34, IS, sce0->ics.swb_sizes[g]); maxval = find_max_val(1, sce0->ics.swb_sizes[g], I34); is_band_type = find_min_book(maxval, is_sf_idx); dist1 += quantize_band_cost(s, &L[start + (w+w2)*128], L34, diff --git a/libavcodec/aacenc_ltp.c b/libavcodec/aacenc_ltp.c index b9d43b4..1bec85b 100644 --- a/libavcodec/aacenc_ltp.c +++ b/libavcodec/aacenc_ltp.c @@ -190,8 +190,8 @@ void ff_aac_search_for_ltp(AACEncContext *s, SingleChannelElement *sce, FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g]; for (i = 0; i < sce->ics.swb_sizes[g]; i++) PCD[i] = sce->coeffs[start+(w+w2)*128+i] - sce->lcoeffs[start+(w+w2)*128+i]; - abs_pow34_v(C34, &sce->coeffs[start+(w+w2)*128], 
sce->ics.swb_sizes[g]); - abs_pow34_v(PCD34, PCD, sce->ics.swb_sizes[g]); + s->abs_pow34(C34, &sce->coeffs[start+(w+w2)*128], sce->ics.swb_sizes[g]); + s->abs_pow34(PCD34, PCD, sce->ics.swb_sizes[g]); dist1 += quantize_band_cost(s, &sce->coeffs[start+(w+w2)*128], C34, sce->ics.swb_sizes[g], sce->sf_idx[(w+w2)*16+g], sce->band_type[(w+w2)*16+g], s->lambda/band->threshold, INFINITY, &bits_tmp1, NULL, 0); diff --git a/libavcodec/aacenc_pred.c b/libavcodec/aacenc_pred.c index e77a3de..d111192 100644 --- a/libavcodec/aacenc_pred.c +++ b/libavcodec/aacenc_pred.c @@ -270,7 +270,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce) continue; /* Normal coefficients */ - abs_pow34_v(O34, &sce->coeffs[start_coef], num_coeffs); + s->abs_pow34(O34, &sce->coeffs[start_coef], num_coeffs); dist1 = quantize_and_encode_band_cost(s, NULL, &sce->coeffs[start_coef], NULL, O34, num_coeffs, sce->sf_idx[sfb], cb_n, s->lambda / band->threshold, INFINITY, &cost1, NULL, 0); @@ -279,7 +279,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce) /* Encoded coefficients - needed for #bits, band type and quant. error */ for (i = 0; i < num_coeffs; i++) SENT[i] = sce->coeffs[start_coef + i] - sce->prcoeffs[start_coef + i]; - abs_pow34_v(S34, SENT, num_coeffs); + s->abs_pow34(S34, SENT, num_coeffs); if (cb_n < RESERVED_BT) cb_p = av_clip(find_min_book(find_max_val(1, num_coeffs, S34), sce->sf_idx[sfb]), cb_min, cb_max); else @@ -291,7 +291,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce) /* Reconstructed coefficients - needed for distortion measurements */ for (i = 0; i < num_coeffs; i++) sce->prcoeffs[start_coef + i] += QERR[i] != 0.0f ? 
(sce->prcoeffs[start_coef + i] - QERR[i]) : 0.0f; - abs_pow34_v(P34, &sce->prcoeffs[start_coef], num_coeffs); + s->abs_pow34(P34, &sce->prcoeffs[start_coef], num_coeffs); if (cb_n < RESERVED_BT) cb_p = av_clip(find_min_book(find_max_val(1, num_coeffs, P34), sce->sf_idx[sfb]), cb_min, cb_max); else diff --git a/libavcodec/aacenc_quantization.h b/libavcodec/aacenc_quantization.h index 4250407..fc5a46b 100644 --- a/libavcodec/aacenc_quantization.h +++ b/libavcodec/aacenc_quantization.h @@ -74,10 +74,10 @@ static av_always_inline float quantize_and_encode_band_cost_template( return cost * lambda; } if (!scaled) { - abs_pow34_v(s->scoefs, in, size); + s->abs_pow34(s->scoefs, in, size); scaled = s->scoefs; } - quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED, aac_cb_maxval[cb], ROUNDING); + s->quant_bands(s->qcoefs, in, scaled, size, !BT_UNSIGNED, aac_cb_maxval[cb], Q34, ROUNDING); if (BT_UNSIGNED) { off = 0; } else { diff --git a/libavcodec/aacenc_utils.h b/libavcodec/aacenc_utils.h index ff9188a..f5cf77d 100644 --- a/libavcodec/aacenc_utils.h +++ b/libavcodec/aacenc_utils.h @@ -37,7 +37,7 @@ #define ROUND_TO_ZERO 0.1054f #define C_QUANT 0.4054f -static inline void abs_pow34_v(float *out, const float *in, const int size) +static inline void abs_pow34_v(float *out, const float *in, const int64_t size) { int i; for (i = 0; i < size; i++) { @@ -63,7 +63,7 @@ static inline int quant(float coef, const float Q, const float rounding) } static inline void quantize_bands(int *out, const float *in, const float *scaled, - int size, float Q34, int is_signed, int maxval, + int size, int is_signed, int maxval, const float Q34, const float rounding) { int i; diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 522b6c2..1db1137 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -42,6 +42,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o # decoders/encoders OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \ x86/sbrdsp_init.o 
+OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp_init.o OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o @@ -132,6 +133,7 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \ # decoders/encoders YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \ x86/sbrdsp.o +YASM-OBJS-$(CONFIG_AAC_ENCODER) += x86/aacencdsp.o YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o YASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm new file mode 100644 index 0000000..ff4019f --- /dev/null +++ b/libavcodec/x86/aacencdsp.asm @@ -0,0 +1,87 @@ +;****************************************************************************** +;* SIMD optimized AAC encoder DSP functions +;* +;* Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +float_abs_mask: times 4 dd 0x7fffffff + +SECTION .text + +;******************************************************************* +;void ff_abs_pow34_sse(float *out, const float *in, const int64_t size); +;******************************************************************* +INIT_XMM sse +cglobal abs_pow34, 3, 3, 3, out, in, size + mova m2, [float_abs_mask] + shl sizeq, 2 + add inq, sizeq + add outq, sizeq + neg sizeq +.loop: + mova m0, [inq+sizeq] + andps m0, m2 + sqrtps m1, m0 + mulps m0, m1 + sqrtps m0, m0 + mova [outq+sizeq], m0 + add sizeq, mmsize + jl .loop + RET + +;******************************************************************* +;void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled, +; int size, int is_signed, int maxval, const float Q34, +; const float rounding) +;******************************************************************* +INIT_XMM sse2 +cglobal aac_quantize_bands, 6, 6, 6, out, in, scaled, size, is_signed, maxval, Q34, rounding +%if UNIX64 == 0 + movss m0, Q34m + movss m1, roundingm +%endif + SPLATD m0 + SPLATD m1 + cvtsi2ss m3, maxvald + SPLATD m3 + shl is_signedd, 31 + movd m4, is_signedd + SPLATD m4 + shl sizeq, 2 + add inq, sizeq + add outq, sizeq + add scaledq, sizeq + neg sizeq +.loop: + mova m2, [scaledq+sizeq] + mulps m2, m0 + addps m2, m1 + minps m2, m3 + mova m5, [inq+sizeq] + pand m5, m4 + orps m2, m5 + cvttps2dq m2, m2 + mova [outq+sizeq], m2 + add sizeq, mmsize + jl .loop + RET diff --git a/libavcodec/x86/aacencdsp_init.c b/libavcodec/x86/aacencdsp_init.c new file mode 100644 index 0000000..4b88ea6 --- /dev/null +++ b/libavcodec/x86/aacencdsp_init.c @@ 
-0,0 +1,43 @@ +/* + * AAC encoder assembly optimizations + * Copyright (C) 2016 Rostislav Pehlivanov <atomnuker@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/float_dsp.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/aacenc.h" + +void ff_abs_pow34_sse(float *out, const float *in, const int64_t size); + +void ff_aac_quantize_bands_sse2(int *out, const float *in, const float *scaled, + int size, int is_signed, int maxval, const float Q34, + const float rounding); + +av_cold void ff_aac_dsp_init_x86(AACEncContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE(cpu_flags)) + s->abs_pow34 = &ff_abs_pow34_sse; + + if (EXTERNAL_SSE2(cpu_flags)) + s->quant_bands = &ff_aac_quantize_bands_sse2; +} -- 2.9.3