
[FFmpeg-devel,6/8] lavc/x86/flac_dsp_gpl: partially unroll 32-bit LPC encoder

Message ID 20171126225111.5108-7-james.darnley@gmail.com
State New

Commit Message

James Darnley Nov. 26, 2017, 10:51 p.m. UTC
Around 1.1 times faster and reduces runtime by up to 6%.
---
 libavcodec/x86/flac_dsp_gpl.asm | 91 ++++++++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 19 deletions(-)

Comments

Rostislav Pehlivanov Nov. 26, 2017, 11:17 p.m. UTC | #1
On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:

> Around 1.1 times faster and reduces runtime by up to 6%.
> ---
>  libavcodec/x86/flac_dsp_gpl.asm | 91 ++++++++++++++++++++++++++++++++---------
>  1 file changed, 72 insertions(+), 19 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 952fc8b86b..91989ce560 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -152,13 +152,13 @@ RET
>  %macro FUNCTION_BODY_32 0
>
>  %if ARCH_X86_64
> -    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
> +    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
>

Why x4, shouldn't this be x2?


>      DECLARE_REG_TMP 5, 6
>      %define length r2d
>
>      movsxd orderq, orderd
>  %else
> -    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
> +    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize*4, res, smp, len, order, coefs
>      DECLARE_REG_TMP 2, 5
>      %define length r2mp
>  %endif
> @@ -189,18 +189,23 @@ mova  [rsp],    m4            ; save sign extend mask
>  %define negj t1q
>
>  .looplen:
> +    ; process "odd" samples
>      pxor m0,   m0
>      pxor m4,   m4
>      pxor m6,   m6
>      mov  posj, orderq
>      xor  negj, negj
>
> -    .looporder:
> +    .looporder1:
>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>          SPLATD m2
> -        pmovzxdq m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
> -        pmovzxdq m5,  [smpq+negj*4-4+mmsize/2]
> -        pmovzxdq m7,  [smpq+negj*4-4+mmsize]
> +        movu   m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
> +        movu   m5,  [smpq+negj*4-4+mmsize]
> +        movu   m7,  [smpq+negj*4-4+mmsize*2]
> +        ; Rather than explicitly unpack adjacent samples into qwords we can let
> +        ; the pmuldq instruction unpack the 0th and 2nd samples for us when it
> +        ; does its multiply.  This saves an unpack for every sample in the inner
> +        ; loop meaning it should be (much) quicker.
>          pmuldq m1,   m2
>          pmuldq m5,   m2
>          pmuldq m7,   m2
> @@ -210,7 +215,7 @@ mova  [rsp],    m4            ; save sign extend mask
>
>          dec    negj
>          inc    posj
> -    jnz .looporder
> +    jnz .looporder1
>
>      HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
>      HACK_PSRAQ m4, m3, [rsp], m2
> @@ -218,22 +223,70 @@ mova  [rsp],    m4            ; save sign extend mask
>      CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
>      CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
>      CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
> -    pshufd  m0,    m0, q0020 ; pack into first 2 dwords
> -    pshufd  m4,    m4, q0020
> -    pshufd  m6,    m6, q0020
> -    movh    m1,   [smpq]
> -    movh    m5,   [smpq+mmsize/2]
> -    movh    m7,   [smpq+mmsize]
> +    movu    m1,   [smpq]
> +    movu    m5,   [smpq+mmsize]
> +    movu    m7,   [smpq+mmsize*2]
>      psubd   m1,    m0               ; smp[i] - p
>      psubd   m5,    m4
>      psubd   m7,    m6
> -    movh   [resq], m1               ; res[i] = smp[i] - (p >> shift)
> -    movh   [resq+mmsize/2], m5
> -    movh   [resq+mmsize], m7
> +    mova   [rsp+mmsize], m1               ; res[i] = smp[i] - (p >> shift)
> +    mova   [rsp+mmsize*2], m5
> +    mova   [rsp+mmsize*3], m7
> +
> +    ; process "even" samples
> +    pxor m0,   m0
> +    pxor m4,   m4
> +    pxor m6,   m6
> +    mov  posj, orderq
> +    xor  negj, negj
> +
> +    .looporder2:
> +        movd   m2,  [coefsq+posj*4] ; c = coefs[j]
> +        SPLATD m2
> +        movu   m1,  [smpq+negj*4] ; s = smp[i-j-1]
> +        movu   m5,  [smpq+negj*4+mmsize]
> +        movu   m7,  [smpq+negj*4+mmsize*2]
> +        pmuldq m1,   m2
> +        pmuldq m5,   m2
> +        pmuldq m7,   m2
> +        paddq  m0,   m1             ; p += c * s
> +        paddq  m4,   m5
> +        paddq  m6,   m7
> +
> +        dec    negj
> +        inc    posj
> +    jnz .looporder2
> +
> +    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
> +    HACK_PSRAQ m4, m3, [rsp], m2
> +    HACK_PSRAQ m6, m3, [rsp], m2
> +    CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> +    CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
> +    CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
> +    movu    m1,   [smpq+4]
> +    movu    m5,   [smpq+4+mmsize]
> +    movu    m7,   [smpq+4+mmsize*2]
> +    psubd   m1,    m0               ; smp[i] - p
> +    psubd   m5,    m4
> +    psubd   m7,    m6
> +
> +    ; interleave odd and even samples
> +    pslldq  m1, 4
> +    pslldq  m5, 4
> +    pslldq  m7, 4
> +
> +    pblendw m1, [rsp+mmsize], q0303
> +    pblendw m5, [rsp+mmsize*2], q0303
> +    pblendw m7, [rsp+mmsize*3], q0303
> +
> +    movu [resq], m1
> +    movu [resq+mmsize], m5
> +    movu [resq+mmsize*2], m7
> +
> +    add resq,    3*mmsize
> +    add smpq,    3*mmsize
> +    sub length, (3*mmsize)/4
>
> -    add resq,   (3*mmsize)/2
> -    add smpq,   (3*mmsize)/2
> -    sub length, (3*mmsize)/8
>  jg .looplen
>  RET
>
> --
> 2.15.0
>


Apart from that lgtm
James Darnley Nov. 26, 2017, 11:36 p.m. UTC | #2
On 2017-11-27 00:17, Rostislav Pehlivanov wrote:
> On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:
>> @@ -152,13 +152,13 @@ RET
>>  %macro FUNCTION_BODY_32 0
>>
>>  %if ARCH_X86_64
>> -    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
>> +    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
>>
> 
> Why x4, shouldn't this be x2?

I write 3 more mm registers to the stack.  The first stack slot holds the
sign-extension mask for my hacked qword arithmetic shift added in the first
32-bit patch.  The 3 new slots store the "odd" results produced in the first
inner loop.

I admit that this is a rather ugly construction for a little speed gain
but I think I've seen other ugly things since writing this.
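
For reference, a minimal C sketch of the per-sample computation the asm
implements, reconstructed from the comments in the diff (p += c * s,
p >>= shift, clip, res[i] = smp[i] - p).  The function name and prototype
here are illustrative only, not the FFmpeg C reference.  The patch's
unrolling computes two such residual sets per .looplen iteration, buffering
the "odd" set in the new stack slots and merging it with the "even" set via
pslldq/pblendw before the store.

    #include <stdint.h>

    /* Illustrative scalar model of one 32-bit LPC residual, following the
     * asm comments; not the actual FFmpeg implementation. */
    static void lpc_residual_32(int32_t *res, const int32_t *smp, int len,
                                int order, const int32_t *coefs, int shift)
    {
        for (int i = order; i < len; i++) {
            int64_t p = 0;
            for (int j = 0; j < order; j++)
                p += (int64_t)coefs[j] * smp[i - j - 1]; /* p += c * s (64-bit product) */
            p >>= shift;                                 /* HACK_PSRAQ: p >>= shift     */
            if (p > INT32_MAX) p = INT32_MAX;            /* CLIPQ against pq_int_max    */
            if (p < INT32_MIN) p = INT32_MIN;            /* CLIPQ against pq_int_min    */
            res[i] = smp[i] - (int32_t)p;                /* res[i] = smp[i] - (p >> shift) */
        }
    }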

Patch

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 952fc8b86b..91989ce560 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -152,13 +152,13 @@  RET
 %macro FUNCTION_BODY_32 0
 
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
     DECLARE_REG_TMP 5, 6
     %define length r2d
 
     movsxd orderq, orderd
 %else
-    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize*4, res, smp, len, order, coefs
     DECLARE_REG_TMP 2, 5
     %define length r2mp
 %endif
@@ -189,18 +189,23 @@  mova  [rsp],    m4            ; save sign extend mask
 %define negj t1q
 
 .looplen:
+    ; process "odd" samples
     pxor m0,   m0
     pxor m4,   m4
     pxor m6,   m6
     mov  posj, orderq
     xor  negj, negj
 
-    .looporder:
+    .looporder1:
         movd   m2,  [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
-        pmovzxdq m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
-        pmovzxdq m5,  [smpq+negj*4-4+mmsize/2]
-        pmovzxdq m7,  [smpq+negj*4-4+mmsize]
+        movu   m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5,  [smpq+negj*4-4+mmsize]
+        movu   m7,  [smpq+negj*4-4+mmsize*2]
+        ; Rather than explicitly unpack adjacent samples into qwords we can let
+        ; the pmuldq instruction unpack the 0th and 2nd samples for us when it
+        ; does its multiply.  This saves an unpack for every sample in the inner
+        ; loop meaning it should be (much) quicker.
         pmuldq m1,   m2
         pmuldq m5,   m2
         pmuldq m7,   m2
@@ -210,7 +215,7 @@  mova  [rsp],    m4            ; save sign extend mask
 
         dec    negj
         inc    posj
-    jnz .looporder
+    jnz .looporder1
 
     HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
     HACK_PSRAQ m4, m3, [rsp], m2
@@ -218,22 +223,70 @@  mova  [rsp],    m4            ; save sign extend mask
     CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
     CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
     CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
-    pshufd  m0,    m0, q0020 ; pack into first 2 dwords
-    pshufd  m4,    m4, q0020
-    pshufd  m6,    m6, q0020
-    movh    m1,   [smpq]
-    movh    m5,   [smpq+mmsize/2]
-    movh    m7,   [smpq+mmsize]
+    movu    m1,   [smpq]
+    movu    m5,   [smpq+mmsize]
+    movu    m7,   [smpq+mmsize*2]
     psubd   m1,    m0               ; smp[i] - p
     psubd   m5,    m4
     psubd   m7,    m6
-    movh   [resq], m1               ; res[i] = smp[i] - (p >> shift)
-    movh   [resq+mmsize/2], m5
-    movh   [resq+mmsize], m7
+    mova   [rsp+mmsize], m1               ; res[i] = smp[i] - (p >> shift)
+    mova   [rsp+mmsize*2], m5
+    mova   [rsp+mmsize*3], m7
+
+    ; process "even" samples
+    pxor m0,   m0
+    pxor m4,   m4
+    pxor m6,   m6
+    mov  posj, orderq
+    xor  negj, negj
+
+    .looporder2:
+        movd   m2,  [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2
+        movu   m1,  [smpq+negj*4] ; s = smp[i-j-1]
+        movu   m5,  [smpq+negj*4+mmsize]
+        movu   m7,  [smpq+negj*4+mmsize*2]
+        pmuldq m1,   m2
+        pmuldq m5,   m2
+        pmuldq m7,   m2
+        paddq  m0,   m1             ; p += c * s
+        paddq  m4,   m5
+        paddq  m6,   m7
+
+        dec    negj
+        inc    posj
+    jnz .looporder2
+
+    HACK_PSRAQ m0, m3, [rsp], m2    ; p >>= shift
+    HACK_PSRAQ m4, m3, [rsp], m2
+    HACK_PSRAQ m6, m3, [rsp], m2
+    CLIPQ   m0,   [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+    CLIPQ   m4,   [pq_int_min], [pq_int_max], m2
+    CLIPQ   m6,   [pq_int_min], [pq_int_max], m2
+    movu    m1,   [smpq+4]
+    movu    m5,   [smpq+4+mmsize]
+    movu    m7,   [smpq+4+mmsize*2]
+    psubd   m1,    m0               ; smp[i] - p
+    psubd   m5,    m4
+    psubd   m7,    m6
+
+    ; interleave odd and even samples
+    pslldq  m1, 4
+    pslldq  m5, 4
+    pslldq  m7, 4
+
+    pblendw m1, [rsp+mmsize], q0303
+    pblendw m5, [rsp+mmsize*2], q0303
+    pblendw m7, [rsp+mmsize*3], q0303
+
+    movu [resq], m1
+    movu [resq+mmsize], m5
+    movu [resq+mmsize*2], m7
+
+    add resq,    3*mmsize
+    add smpq,    3*mmsize
+    sub length, (3*mmsize)/4
 
-    add resq,   (3*mmsize)/2
-    add smpq,   (3*mmsize)/2
-    sub length, (3*mmsize)/8
 jg .looplen
 RET
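
On the "let pmuldq do the unpacking" comment: pmuldq (SSE4.1) reads only
dword lanes 0 and 2 of each operand, sign-extends them and produces two
64-bit products, so a plain movu of packed samples feeds it directly with no
pmovzxdq/punpck step.  A rough C model of that lane selection for one 128-bit
register (illustrative only):

    #include <stdint.h>

    /* Illustrative model of PMULDQ on one 128-bit register: only dword
     * lanes 0 and 2 are read; each pair is sign-extended and multiplied
     * into a 64-bit result. */
    static void pmuldq_model(const int32_t a[4], const int32_t b[4], int64_t out[2])
    {
        out[0] = (int64_t)a[0] * b[0];   /* lane 0 */
        out[1] = (int64_t)a[2] * b[2];   /* lane 2 */
    }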