diff mbox

[FFmpeg-devel,3/8] avcodec/flac: add SSE4.2 version of the 32-bit lpc encoder

Message ID 20171126225111.5108-4-james.darnley@gmail.com
State New
Headers show

Commit Message

James Darnley Nov. 26, 2017, 10:51 p.m. UTC
From 1.3 to 2.5 times faster.  Runtime reduced by 4 to 58%.  As with the
16-bit version the speed-up generally increases with compression_level.

Also like the 16-bit version, it is not used with levels less than 3.

After a bug fix made long, long ago in e609cfd697, this 32-bit lpc
encoder is heavily used with 16-bit samples.
---
 libavcodec/x86/flac_dsp_gpl.asm | 106 ++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/flacdsp_init.c   |   5 ++
 2 files changed, 111 insertions(+)

Comments

Carl Eugen Hoyos Nov. 26, 2017, 11:07 p.m. UTC | #1
2017-11-26 23:51 GMT+01:00 James Darnley <james.darnley@gmail.com>:

> +    if (EXTERNAL_SSE42(cpu_flags)) {
> +        if (CONFIG_GPL)
> +            c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
> +    }

Any objections over "if (CONFIG_GPL && EXTERNAL_..)"?

Carl Eugen
James Almer Nov. 26, 2017, 11:23 p.m. UTC | #2
On 11/26/2017 8:07 PM, Carl Eugen Hoyos wrote:
> 2017-11-26 23:51 GMT+01:00 James Darnley <james.darnley@gmail.com>:
> 
>> +    if (EXTERNAL_SSE42(cpu_flags)) {
>> +        if (CONFIG_GPL)
>> +            c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
>> +    }
> 
> Any objections over "if (CONFIG_GPL && EXTERNAL_..)"?
> 
> Carl Eugen

I prefer it as is. It's not only similar to other checks around it, but
also if someone decides to write an lgpl sse4.2 function they will not
have to change the existing statement or add a duplicate one.
diff mbox

Patch

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index c461c666be..618306eb5f 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -22,6 +22,12 @@ 
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+pd_0_int_min: times  2 dd 0, -2147483648 ; per qword: low dword 0, high dword INT32_MIN, i.e. only bit 63 set
+pq_int_min:   times  2 dq -2147483648    ; INT32_MIN in each qword, lower bound passed to CLIPQ
+pq_int_max:   times  2 dq  2147483647    ; INT32_MAX in each qword, upper bound passed to CLIPQ
+
 SECTION .text
 
 %macro FUNCTION_BODY_16 0
@@ -116,8 +122,108 @@  RET
 
 %endmacro
 
+%macro PMINSQ 3 ; a (also dst), b, tmp: dst = min(a, b) per signed qword, tmp is clobbered
+    pcmpgtq %3, %2, %1 ; tmp = all ones where b > a (3-operand form, presumably expanded by x86inc to a move + compare)
+    pand    %1, %3     ; keep a in the lanes where a is the smaller value
+    pandn   %3, %2     ; tmp = b in the remaining lanes (b <= a)
+    por     %1, %3     ; merge both selections into dst
+%endmacro
+
+%macro PMAXSQ 3 ; a (also dst), b, tmp: dst = max(a, b) per signed qword, tmp is clobbered
+    pcmpgtq %3, %1, %2 ; tmp = all ones where a > b (3-operand form, presumably expanded by x86inc to a move + compare)
+    pand    %1, %3     ; keep a in the lanes where a is the larger value
+    pandn   %3, %2     ; tmp = b in the remaining lanes (a <= b)
+    por     %1, %3     ; merge both selections into dst
+%endmacro
+
+%macro CLIPQ 4 ; reg, min, max, tmp: clamp each signed qword of reg to [min, max], tmp is clobbered
+    PMAXSQ %1, %2, %4 ; reg = max(reg, min)
+    PMINSQ %1, %3, %4 ; reg = min(reg, max)
+%endmacro
+
+%macro HACK_PSRAQ 4 ; dst, src (shift), sign extend mask, tmp: emulated arithmetic right shift of signed qwords
+    pxor    %4, %4 ; zero
+    pcmpgtq %4, %1 ; all-ones mask in each qword where 0 > dst (negative lanes)
+    pand    %4, %3 ; negative lanes keep only the sign extend mask bits
+    psrlq   %1, %2 ; dst >>= shift (logical: the vacated top bits become zero)
+    por     %1, %4 ; set the vacated top bits again in negative lanes
+%endmacro
+
+%macro FUNCTION_BODY_32 0
+; res[i] = smp[i] - clip_int32(sum_j(coefs[j] * smp[i-j-1]) >> shift); 2 samples per pass, 64-bit accumulators
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_32, 5, 7, 5, mmsize, res, smp, len, order, coefs ; 5 vector regs: m0-m4 are used
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd ; order is used as a 64-bit index below
+%else
+    cglobal flac_enc_lpc_32, 5, 6, 5, mmsize, res, smp, len, order, coefs ; 5 vector regs: m0-m4 are used
+    DECLARE_REG_TMP 2, 5 ; t0 reuses r2, so the length is accessed through r2mp instead
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32.  This means that we only
+; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu  m0,         [smpq+iter] ; NOTE(review): always copies 32 samples, assumes both buffers hold at least that many
+    movu [resq+iter],  m0
+    %assign iter iter+mmsize
+%endrep
+
+lea    resq,   [resq+orderq*4]   ; skip the first order samples copied above
+lea    smpq,   [smpq+orderq*4]
+lea    coefsq, [coefsq+orderq*4] ; point past the coefficients so a negative index can count up to zero
+sub    length,  orderd           ; number of samples left to predict
+movd   m3,      r5m              ; shift count, the 6th argument (not loaded into a named register by cglobal)
+neg    orderq                    ; orderq = -order, the start value of posj
+
+movu   m4,     [pd_0_int_min] ; load 1 bit
+psrad  m4,      m3            ; turn that into shift+1 bits
+pslld  m4,      1             ; reduce that to the top shift bits
+mova  [rsp],    m4            ; save sign extend mask
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+    pxor m0,   m0     ; p = 0 in both qword lanes
+    mov  posj, orderq ; posj counts from -order up to 0 and indexes coefs
+    xor  negj, negj   ; negj counts from 0 downwards and indexes smp
+
+    .looporder:
+        movd   m2,  [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2                   ; copy c into every dword lane
+        pmovzxdq m1,  [smpq+negj*4-4] ; s = smp[i-j-1] and the sample after it, one per qword lane
+        pmuldq m1,   m2             ; signed multiply of the low dword of each qword, upper halves are ignored
+        paddq  m0,   m1             ; p += c * s
+
+        dec    negj
+        inc    posj
+    jnz .looporder                  ; flags come from inc posj: repeat until posj reaches 0
+
+    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
+    CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+    pshufd  m0,    m0, q0020 ; pack into first 2 dwords
+    movh    m1,   [smpq]            ; load smp[i] and smp[i+1]
+    psubd   m1,    m0               ; smp[i] - p
+    movh   [resq], m1               ; res[i] = smp[i] - (p >> shift)
+
+    add resq,   mmsize/2 ; advance by the 2 samples just written (mmsize/2 bytes of dwords with XMM)
+    add smpq,   mmsize/2
+    sub length, mmsize/8 ; 2 samples done per pass with XMM
+jg .looplen              ; NOTE(review): an odd remaining count still stores 2 results on the last pass, confirm buffer padding
+RET
+
+%endmacro ; FUNCTION_BODY_32
+
 INIT_XMM sse4
 FUNCTION_BODY_16
 
+INIT_XMM sse42 ; SSE4.2 is needed for pcmpgtq, used by PMINSQ, PMAXSQ and HACK_PSRAQ
+FUNCTION_BODY_32
+
 INIT_YMM avx2
 FUNCTION_BODY_16
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 0a5c01859f..f827186c26 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -29,6 +29,7 @@  void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
 
 void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
 void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
+void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int);
 
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
@@ -111,6 +112,10 @@  av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
         if (CONFIG_GPL)
             c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
     }
+    if (EXTERNAL_SSE42(cpu_flags)) {
+        if (CONFIG_GPL)
+            c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
+    }
     if (EXTERNAL_AVX2(cpu_flags)) {
         if (CONFIG_GPL)
             c->lpc16_encode = ff_flac_enc_lpc_16_avx2;