diff mbox

[FFmpeg-devel,2/8] avcodec/flac: add AVX2 version of the 16-bit LPC encoder

Message ID 20171126225111.5108-3-james.darnley@gmail.com
State New
Headers show

Commit Message

James Darnley Nov. 26, 2017, 10:51 p.m. UTC
When compared to the SSE4 version, runtime is reduced by 0.5 to 20%.
After a bug fix long, long ago in e609cfd697, the 16-bit LPC encoder is
used so little that the runtime reduction is no longer correct.  The
function itself is around 2 times faster.  (As one might expect for
doing twice as many samples every iteration.)
---
 libavcodec/flacenc.c            |  2 +-
 libavcodec/x86/flac_dsp_gpl.asm | 32 +++++++++++++++++++++++++++-----
 libavcodec/x86/flacdsp_init.c   |  5 +++++
 3 files changed, 33 insertions(+), 6 deletions(-)

Comments

Rostislav Pehlivanov Nov. 26, 2017, 11:20 p.m. UTC | #1
On 26 November 2017 at 22:51, James Darnley <james.darnley@gmail.com> wrote:

> When compared to the SSE4 version, runtime is reduced by 0.5 to 20%.
> After a bug fix long, long ago in e609cfd697, the 16-bit LPC encoder is
> used so little that the runtime reduction is no longer correct.  The
> function itself is around 2 times faster.  (As one might expect for
> doing twice as many samples every iteration.)
> ---
>  libavcodec/flacenc.c            |  2 +-
>  libavcodec/x86/flac_dsp_gpl.asm | 32 +++++++++++++++++++++++++++-----
>  libavcodec/x86/flacdsp_init.c   |  5 +++++
>  3 files changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
> index 170c3caf48..cf25982c91 100644
> --- a/libavcodec/flacenc.c
> +++ b/libavcodec/flacenc.c
> @@ -88,7 +88,7 @@ typedef struct FlacSubframe {
>      uint64_t rc_sums[32][MAX_PARTITIONS];
>
>      int32_t samples[FLAC_MAX_BLOCKSIZE];
> -    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
> +    int32_t residual[FLAC_MAX_BLOCKSIZE+23];
>  } FlacSubframe;
>
>  typedef struct FlacFrame {
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm
> b/libavcodec/x86/flac_dsp_gpl.asm
> index e285158185..c461c666be 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -24,7 +24,8 @@
>
>  SECTION .text
>
> -INIT_XMM sse4
> +%macro FUNCTION_BODY_16 0
> +
>  %if ARCH_X86_64
>      cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
>      DECLARE_REG_TMP 5, 6
> @@ -51,7 +52,7 @@ lea  resq,   [resq+orderq*4]
>  lea  smpq,   [smpq+orderq*4]
>  lea  coefsq, [coefsq+orderq*4]
>  sub  length,  orderd
> -movd m3,      r5m
> +movd xm3,     r5m
>  neg  orderq
>
>  %define posj t0q
> @@ -65,8 +66,20 @@ neg  orderq
>      xor  negj, negj
>
>      .looporder:
> +%if cpuflag(avx)
> +        vbroadcastss m2, [coefsq+posj*4]
> +%else
>          movd   m2, [coefsq+posj*4] ; c = coefs[j]
>          SPLATD m2
> +%endif
> +%if cpuflag(avx)
> +        vpmulld m1, m2, [smpq+negj*4-4]
> +        vpmulld m5, m2, [smpq+negj*4-4+mmsize]
> +        vpmulld m7, m2, [smpq+negj*4-4+mmsize*2]
> +        vpaddd  m0, m1
> +        vpaddd  m4, m5
> +        vpaddd  m6, m7
>

Same comment as on the 32-bit LPC AVX2 patch.


> +%else
>          movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
>          movu   m5, [smpq+negj*4-4+mmsize]
>          movu   m7, [smpq+negj*4-4+mmsize*2]
> @@ -76,14 +89,15 @@ neg  orderq
>          paddd  m0,  m1             ; p += c * s
>          paddd  m4,  m5
>          paddd  m6,  m7
> +%endif
>
>          dec    negj
>          inc    posj
>      jnz .looporder
>
> -    psrad  m0,     m3              ; p >>= shift
> -    psrad  m4,     m3
> -    psrad  m6,     m3
> +    psrad  m0,     xm3              ; p >>= shift
> +    psrad  m4,     xm3
> +    psrad  m6,     xm3
>      movu   m1,    [smpq]
>      movu   m5,    [smpq+mmsize]
>      movu   m7,    [smpq+mmsize*2]
> @@ -99,3 +113,11 @@ neg  orderq
>      sub length, (3*mmsize)/4
>  jg .looplen
>  RET
> +
> +%endmacro
> +
> +INIT_XMM sse4
> +FUNCTION_BODY_16
> +
> +INIT_YMM avx2
> +FUNCTION_BODY_16
> diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
> index 1971f81b8d..0a5c01859f 100644
> --- a/libavcodec/x86/flacdsp_init.c
> +++ b/libavcodec/x86/flacdsp_init.c
> @@ -28,6 +28,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int
> coeffs[32], int order,
>                          int qlevel, int len);
>
>  void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const
> int32_t *,int);
> +void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const
> int32_t *,int);
>
>  #define DECORRELATE_FUNCS(fmt, opt)
>                 \
>  void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in,
> int channels,     \
> @@ -110,6 +111,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c,
> enum AVSampleFormat fmt, int
>          if (CONFIG_GPL)
>              c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
>      }
> +    if (EXTERNAL_AVX2(cpu_flags)) {
> +        if (CONFIG_GPL)
>

Yeah, just combine them; if someone wants to add non-GPL asm, this is the
least of their problems.


> +            c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
> +    }
>  #endif
>  #endif /* HAVE_X86ASM */
>  }
> --
> 2.15.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
diff mbox

Patch

diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index 170c3caf48..cf25982c91 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -88,7 +88,7 @@  typedef struct FlacSubframe {
     uint64_t rc_sums[32][MAX_PARTITIONS];
 
     int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+23];
 } FlacSubframe;
 
 typedef struct FlacFrame {
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index e285158185..c461c666be 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -24,7 +24,8 @@ 
 
 SECTION .text
 
-INIT_XMM sse4
+%macro FUNCTION_BODY_16 0
+
 %if ARCH_X86_64
     cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 5, 6
@@ -51,7 +52,7 @@  lea  resq,   [resq+orderq*4]
 lea  smpq,   [smpq+orderq*4]
 lea  coefsq, [coefsq+orderq*4]
 sub  length,  orderd
-movd m3,      r5m
+movd xm3,     r5m
 neg  orderq
 
 %define posj t0q
@@ -65,8 +66,20 @@  neg  orderq
     xor  negj, negj
 
     .looporder:
+%if cpuflag(avx)
+        vbroadcastss m2, [coefsq+posj*4]
+%else
         movd   m2, [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
+%endif
+%if cpuflag(avx)
+        vpmulld m1, m2, [smpq+negj*4-4]
+        vpmulld m5, m2, [smpq+negj*4-4+mmsize]
+        vpmulld m7, m2, [smpq+negj*4-4+mmsize*2]
+        vpaddd  m0, m1
+        vpaddd  m4, m5
+        vpaddd  m6, m7
+%else
         movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
         movu   m5, [smpq+negj*4-4+mmsize]
         movu   m7, [smpq+negj*4-4+mmsize*2]
@@ -76,14 +89,15 @@  neg  orderq
         paddd  m0,  m1             ; p += c * s
         paddd  m4,  m5
         paddd  m6,  m7
+%endif
 
         dec    negj
         inc    posj
     jnz .looporder
 
-    psrad  m0,     m3              ; p >>= shift
-    psrad  m4,     m3
-    psrad  m6,     m3
+    psrad  m0,     xm3              ; p >>= shift
+    psrad  m4,     xm3
+    psrad  m6,     xm3
     movu   m1,    [smpq]
     movu   m5,    [smpq+mmsize]
     movu   m7,    [smpq+mmsize*2]
@@ -99,3 +113,11 @@  neg  orderq
     sub length, (3*mmsize)/4
 jg .looplen
 RET
+
+%endmacro
+
+INIT_XMM sse4
+FUNCTION_BODY_16
+
+INIT_YMM avx2
+FUNCTION_BODY_16
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 1971f81b8d..0a5c01859f 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -28,6 +28,7 @@  void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
                         int qlevel, int len);
 
 void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
 
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
@@ -110,6 +111,10 @@  av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
         if (CONFIG_GPL)
             c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
     }
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        if (CONFIG_GPL)
+            c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
+    }
 #endif
 #endif /* HAVE_X86ASM */
 }