diff mbox series

[FFmpeg-devel,1/7] proresdec2: port and fix for cached reader

Message ID 20230908081508.510-1-christophe.gisquet@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/7] proresdec2: port and fix for cached reader | expand

Checks

Context Check Description
andriy/make_x86 fail Make failed

Commit Message

Christophe Gisquet Sept. 8, 2023, 8:15 a.m. UTC
Summary of changes
- move back to regular, non-macro, get_bits API
- reduce the lookup to switch the coding method
- shorter reads wherever possible, in particular for the end of bitstream
  (16 bits instead of 32, as per the above)

There are cases that really need longer lengths (larger EG codes) of up
to 27 bits.

Win64: 6.10 -> 4.87 (~20% speedup)

Reference for an hypothetical 32bits version of the cached reader:
Win32: 11.4 -> 9.8  (14%, because iDCT is not SIMDed)
---
 libavcodec/proresdec2.c | 53 ++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 30 deletions(-)

Comments

Christophe Gisquet Sept. 8, 2023, 8:20 a.m. UTC | #1
Le ven. 8 sept. 2023 à 10:15, Christophe Gisquet
<christophe.gisquet@gmail.com> a écrit :
>
> Summary of changes

git send-email --cover-letter apparently didn't let me edit one, so here goes.

This patchset requires my previous one improving the cached bitstream
reader, and serves as its justification. It, basically, moves to using
VLC wherever possible, and in particular when codewords are
sufficiently short/there's some kind of well-behaved laplacian
distribution for codewords that make VLCs efficient.

Total speedup is around 40% here.
Andreas Rheinhardt Sept. 8, 2023, 8:30 a.m. UTC | #2
Christophe Gisquet:
> Le ven. 8 sept. 2023 à 10:15, Christophe Gisquet
> <christophe.gisquet@gmail.com> a écrit :
>>
>> Summary of changes
> 
> git send-email --cover-letter apparently didn't let me edit one, so here goes.
> 

Use --compose.

> This patchset requires my previous one improving the cached bitstream
> reader, and serves as its justification. It, basically, moves to using
> VLC wherever possible, and in particular when codewords are
> sufficiently short/there's some kind of well-behaved laplacian
> distribution for codewords that make VLCs efficient.
> 
> Total speedup is around 40% here.
>
Andreas Rheinhardt Sept. 8, 2023, 8:34 a.m. UTC | #3
Andreas Rheinhardt:
> Christophe Gisquet:
>> Le ven. 8 sept. 2023 à 10:15, Christophe Gisquet
>> <christophe.gisquet@gmail.com> a écrit :
>>>
>>> Summary of changes
>>
>> git send-email --cover-letter apparently didn't let me edit one, so here goes.
>>
> 
> Use --compose.

Or even better: --cover-letter --annotate
That way you get a default cover-letter to annotate

> 
>> This patchset requires my previous one improving the cached bitstream
>> reader, and serves as its justification. It, basically, moves to using
>> VLC wherever possible, and in particular when codewords are
>> sufficiently short/there's some kind of well-behaved laplacian
>> distribution for codewords that make VLCs efficient.
>>
>> Total speedup is around 40% here.
>>
Andreas Rheinhardt Sept. 8, 2023, 8:36 a.m. UTC | #4
Christophe Gisquet:
> Summary of changes
> - move back to regular, non-macro, get_bits API
> - reduce the lookup to switch the coding method
> - shorter reads wherever possible, in particular for the end of bitstream
>   (16 bits instead of 32, as per the above)
> 
> There are cases that really need longer lengths (larger EG codes) of up
> to 27 bits.
> 
> Win64: 6.10 -> 4.87 (~20% speedup)
> 
> Reference for an hypothetical 32bits version of the cached reader:
> Win32: 11.4 -> 9.8  (14%, because iDCT is not SIMDed)
> ---

The commit message claims to fix something; what does it fix?
Also, changing to the non-macro API should be done in a separate commit
to the one changing the type of bitstream reader.

Furthermore, you should also provide information about the code size
impact when switching the type of reader.

>  libavcodec/proresdec2.c | 53 ++++++++++++++++++-----------------------
>  1 file changed, 23 insertions(+), 30 deletions(-)
> 
> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
> index 9297860946..6e243cfc17 100644
> --- a/libavcodec/proresdec2.c
> +++ b/libavcodec/proresdec2.c
> @@ -24,9 +24,7 @@
>   * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
>   */
>  
> -//#define DEBUG
> -
> -#define LONG_BITSTREAM_READER
> +#define CACHED_BITSTREAM_READER 1
>  
>  #include "config_components.h"
>  
> @@ -422,35 +420,37 @@ static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
>      return pic_data_size;
>  }
>  
> -#define DECODE_CODEWORD(val, codebook, SKIP)                            \
> +/* bitstream_read may fail on 32bits ARCHS for >24 bits, so use long version there */
> +#if 0 //BITSTREAM_BITS == 32
> +# define READ_BITS get_bits_long
> +#else
> +# define READ_BITS get_bits
> +#endif
> +
> +#define DECODE_CODEWORD(val, codebook)                                  \
>      do {                                                                \
>          unsigned int rice_order, exp_order, switch_bits;                \
>          unsigned int q, buf, bits;                                      \
>                                                                          \
> -        UPDATE_CACHE(re, gb);                                           \
> -        buf = GET_CACHE(re, gb);                                        \
> +        buf = show_bits(gb, 14);                                        \
>                                                                          \
>          /* number of bits to switch between rice and exp golomb */      \
>          switch_bits =  codebook & 3;                                    \
>          rice_order  =  codebook >> 5;                                   \
>          exp_order   = (codebook >> 2) & 7;                              \
>                                                                          \
> -        q = 31 - av_log2(buf);                                          \
> +        q = 13 - av_log2(buf);                                          \
>                                                                          \
>          if (q > switch_bits) { /* exp golomb */                         \
>              bits = exp_order - switch_bits + (q<<1);                    \
> -            if (bits > FFMIN(MIN_CACHE_BITS, 31))                       \
> -                return AVERROR_INVALIDDATA;                             \
> -            val = SHOW_UBITS(re, gb, bits) - (1 << exp_order) +         \
> +            val = READ_BITS(gb, bits) - (1 << exp_order) +              \
>                  ((switch_bits + 1) << rice_order);                      \
> -            SKIP(re, gb, bits);                                         \
>          } else if (rice_order) {                                        \
> -            SKIP_BITS(re, gb, q+1);                                     \
> -            val = (q << rice_order) + SHOW_UBITS(re, gb, rice_order);   \
> -            SKIP(re, gb, rice_order);                                   \
> +            skip_remaining(gb, q+1);                                    \
> +            val = (q << rice_order) + get_bits(gb, rice_order);         \
>          } else {                                                        \
>              val = q;                                                    \
> -            SKIP(re, gb, q+1);                                          \
> +            skip_remaining(gb, q+1);                                    \
>          }                                                               \
>      } while (0)
>  
> @@ -466,9 +466,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>      int16_t prev_dc;
>      int code, i, sign;
>  
> -    OPEN_READER(re, gb);
> -
> -    DECODE_CODEWORD(code, FIRST_DC_CB, LAST_SKIP_BITS);
> +    DECODE_CODEWORD(code, FIRST_DC_CB);
>      prev_dc = TOSIGNED(code);
>      out[0] = prev_dc;
>  
> @@ -477,13 +475,12 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>      code = 5;
>      sign = 0;
>      for (i = 1; i < blocks_per_slice; i++, out += 64) {
> -        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)], LAST_SKIP_BITS);
> +        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)]);
>          if(code) sign ^= -(code & 1);
>          else     sign  = 0;
>          prev_dc += (((code + 1) >> 1) ^ sign) - sign;
>          out[0] = prev_dc;
>      }
> -    CLOSE_READER(re, gb);
>      return 0;
>  }
>  
> @@ -497,11 +494,9 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>      const ProresContext *ctx = avctx->priv_data;
>      int block_mask, sign;
>      unsigned pos, run, level;
> -    int max_coeffs, i, bits_left;
> +    int max_coeffs, i, bits_rem;
>      int log2_block_count = av_log2(blocks_per_slice);
>  
> -    OPEN_READER(re, gb);
> -    UPDATE_CACHE(re, gb);                                           \
>      run   = 4;
>      level = 2;
>  
> @@ -509,28 +504,26 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>      block_mask = blocks_per_slice - 1;
>  
>      for (pos = block_mask;;) {
> -        bits_left = gb->size_in_bits - re_index;
> -        if (!bits_left || (bits_left < 32 && !SHOW_UBITS(re, gb, bits_left)))
> +        bits_rem = get_bits_left(gb);
> +        if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
>              break;
>  
> -        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)], LAST_SKIP_BITS);
> +        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
>          pos += run + 1;
>          if (pos >= max_coeffs) {
>              av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
>              return AVERROR_INVALIDDATA;
>          }
>  
> -        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)], SKIP_BITS);
> +        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
>          level += 1;
>  
>          i = pos >> log2_block_count;
>  
> -        sign = SHOW_SBITS(re, gb, 1);
> -        SKIP_BITS(re, gb, 1);
> +        sign = -get_bits1(gb);
>          out[((pos & block_mask) << 6) + ctx->scan[i]] = ((level ^ sign) - sign);
>      }
>  
> -    CLOSE_READER(re, gb);
>      return 0;
>  }
>
Michael Niedermayer Sept. 8, 2023, 9 p.m. UTC | #5
On Fri, Sep 08, 2023 at 10:15:02AM +0200, Christophe Gisquet wrote:
> Summary of changes
> - move back to regular, non-macro, get_bits API
> - reduce the lookup to switch the coding method
> - shorter reads wherever possible, in particular for the end of bitstream
>   (16 bits instead of 32, as per the above)
> 
> There are cases that really need longer lengths (larger EG codes) of up
> to 27 bits.
> 
> Win64: 6.10 -> 4.87 (~20% speedup)
> 
> Reference for an hypothetical 32bits version of the cached reader:
> Win32: 11.4 -> 9.8  (14%, because iDCT is not SIMDed)
> ---
>  libavcodec/proresdec2.c | 53 ++++++++++++++++++-----------------------
>  1 file changed, 23 insertions(+), 30 deletions(-)

causes assertion failure:

Assertion n <= 32 failed at libavcodec/bitstream_template.h:338/0

Thread 36 "read_thread" received signal SIGABRT, Aborted.
[Switching to Thread 0x7fff56ffd700 (LWP 12751)]
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
51	../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1  0x00007fffefd097f1 in __GI_abort () at abort.c:79
#2  0x0000555556035277 in bits_peek_be (bc=0x7fff56ffb270, n=4294967295) at libavcodec/bitstream_template.h:338
#3  0x0000555556036b35 in decode_ac_coeffs (avctx=0x7fff28002680, gb=0x7fff56ffb270, out=0x7fff56ffb2a0, blocks_per_slice=32) at libavcodec/proresdec2.c:590
#4  0x000055555603715c in decode_slice_chroma (avctx=0x7fff28002680, slice=0x7fff28055c20, dst=0x7fff9450b940, dst_stride=7680, buf=0x7fff2803d072 "#\a`\002g\240", <incomplete sequence \323>, buf_size=43, qmat=0x7fff56ffc450, log2_blocks_per_mb=2) at libavcodec/proresdec2.c:674
#5  0x0000555556037ae2 in decode_slice_thread (avctx=0x7fff28002680, arg=0x0, jobnr=145, threadnr=0) at libavcodec/proresdec2.c:807
#6  0x0000555555c815a0 in avcodec_default_execute2 (c=0x7fff28002680, func=0x555556037465 <decode_slice_thread>, arg=0x0, ret=0x0, count=510) at libavcodec/avcodec.c:74
#7  0x0000555556037d1e in decode_picture (avctx=0x7fff28002680) at libavcodec/proresdec2.c:846
#8  0x0000555556037fda in decode_frame (avctx=0x7fff28002680, frame=0x7fff28007440, got_frame=0x7fff56ffc5f0, avpkt=0x7fff28007dc0) at libavcodec/proresdec2.c:912
#9  0x0000555555d732ec in decode_simple_internal (avctx=0x7fff28002680, frame=0x7fff28007440, discarded_samples=0x7fff56ffc650) at libavcodec/decode.c:433
#10 0x0000555555d73843 in decode_simple_receive_frame (avctx=0x7fff28002680, frame=0x7fff28007440) at libavcodec/decode.c:607
#11 0x0000555555d739b3 in decode_receive_frame_internal (avctx=0x7fff28002680, frame=0x7fff28007440) at libavcodec/decode.c:635
#12 0x0000555555d73d78 in avcodec_send_packet (avctx=0x7fff28002680, avpkt=0x7fff28007208) at libavcodec/decode.c:732
#13 0x0000555555a6ef78 in try_decode_frame (s=0x7fff28000c80, st=0x7fff28002240, pkt=0x7fff28007208, options=0x7fff280071c0) at libavformat/demux.c:2075
#14 0x0000555555a71d92 in avformat_find_stream_info (ic=0x7fff28000c80, options=0x7fff280071c0) at libavformat/demux.c:2771
#15 0x0000555555699823 in read_thread (arg=0x7fffd45ca040) at fftools/ffplay.c:2806
#16 0x00007ffff5deed6c in ?? () from /usr/lib/x86_64-linux-gnu/libSDL2-2.0.so.0
#17 0x00007ffff5e640f9 in ?? () from /usr/lib/x86_64-linux-gnu/libSDL2-2.0.so.0
#18 0x00007ffff00c16db in start_thread (arg=0x7fff56ffd700) at pthread_create.c:463
#19 0x00007fffefdea61f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

[...]
Christophe Gisquet Sept. 11, 2023, 8:54 p.m. UTC | #6
Le ven. 8 sept. 2023 à 10:20, Christophe Gisquet
<christophe.gisquet@gmail.com> a écrit :
> This patchset requires my previous one improving the cached bitstream
> reader, and serves as its justification. It, basically, moves to using
> VLC wherever possible, and in particular when codewords are
> sufficiently short/there's some kind of well-behaved laplacian
> distribution for codewords that make VLCs efficient.
>
> Total speedup is around 40% here.

It's unfortunate I cannot devote as much time and effort to fix some
fundamental problems. But as I don't want Andreas to have wasted his
time reviewing, and me answering as best as possible, the last state
(maybe addressing 90% of the review?) can be obtained from repo at
https://github.com/cgisquet/ffmpeg.git branch prores.

Best regards,
diff mbox series

Patch

diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index 9297860946..6e243cfc17 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -24,9 +24,7 @@ 
  * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
  */
 
-//#define DEBUG
-
-#define LONG_BITSTREAM_READER
+#define CACHED_BITSTREAM_READER 1
 
 #include "config_components.h"
 
@@ -422,35 +420,37 @@  static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, cons
     return pic_data_size;
 }
 
-#define DECODE_CODEWORD(val, codebook, SKIP)                            \
+/* bitstream_read may fail on 32bits ARCHS for >24 bits, so use long version there */
+#if 0 //BITSTREAM_BITS == 32
+# define READ_BITS get_bits_long
+#else
+# define READ_BITS get_bits
+#endif
+
+#define DECODE_CODEWORD(val, codebook)                                  \
     do {                                                                \
         unsigned int rice_order, exp_order, switch_bits;                \
         unsigned int q, buf, bits;                                      \
                                                                         \
-        UPDATE_CACHE(re, gb);                                           \
-        buf = GET_CACHE(re, gb);                                        \
+        buf = show_bits(gb, 14);                                        \
                                                                         \
         /* number of bits to switch between rice and exp golomb */      \
         switch_bits =  codebook & 3;                                    \
         rice_order  =  codebook >> 5;                                   \
         exp_order   = (codebook >> 2) & 7;                              \
                                                                         \
-        q = 31 - av_log2(buf);                                          \
+        q = 13 - av_log2(buf);                                          \
                                                                         \
         if (q > switch_bits) { /* exp golomb */                         \
             bits = exp_order - switch_bits + (q<<1);                    \
-            if (bits > FFMIN(MIN_CACHE_BITS, 31))                       \
-                return AVERROR_INVALIDDATA;                             \
-            val = SHOW_UBITS(re, gb, bits) - (1 << exp_order) +         \
+            val = READ_BITS(gb, bits) - (1 << exp_order) +              \
                 ((switch_bits + 1) << rice_order);                      \
-            SKIP(re, gb, bits);                                         \
         } else if (rice_order) {                                        \
-            SKIP_BITS(re, gb, q+1);                                     \
-            val = (q << rice_order) + SHOW_UBITS(re, gb, rice_order);   \
-            SKIP(re, gb, rice_order);                                   \
+            skip_remaining(gb, q+1);                                    \
+            val = (q << rice_order) + get_bits(gb, rice_order);         \
         } else {                                                        \
             val = q;                                                    \
-            SKIP(re, gb, q+1);                                          \
+            skip_remaining(gb, q+1);                                    \
         }                                                               \
     } while (0)
 
@@ -466,9 +466,7 @@  static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     int16_t prev_dc;
     int code, i, sign;
 
-    OPEN_READER(re, gb);
-
-    DECODE_CODEWORD(code, FIRST_DC_CB, LAST_SKIP_BITS);
+    DECODE_CODEWORD(code, FIRST_DC_CB);
     prev_dc = TOSIGNED(code);
     out[0] = prev_dc;
 
@@ -477,13 +475,12 @@  static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     code = 5;
     sign = 0;
     for (i = 1; i < blocks_per_slice; i++, out += 64) {
-        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)], LAST_SKIP_BITS);
+        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6U)]);
         if(code) sign ^= -(code & 1);
         else     sign  = 0;
         prev_dc += (((code + 1) >> 1) ^ sign) - sign;
         out[0] = prev_dc;
     }
-    CLOSE_READER(re, gb);
     return 0;
 }
 
@@ -497,11 +494,9 @@  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
     const ProresContext *ctx = avctx->priv_data;
     int block_mask, sign;
     unsigned pos, run, level;
-    int max_coeffs, i, bits_left;
+    int max_coeffs, i, bits_rem;
     int log2_block_count = av_log2(blocks_per_slice);
 
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);                                           \
     run   = 4;
     level = 2;
 
@@ -509,28 +504,26 @@  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
     block_mask = blocks_per_slice - 1;
 
     for (pos = block_mask;;) {
-        bits_left = gb->size_in_bits - re_index;
-        if (!bits_left || (bits_left < 32 && !SHOW_UBITS(re, gb, bits_left)))
+        bits_rem = get_bits_left(gb);
+        if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
             break;
 
-        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)], LAST_SKIP_BITS);
+        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
         pos += run + 1;
         if (pos >= max_coeffs) {
             av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
             return AVERROR_INVALIDDATA;
         }
 
-        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)], SKIP_BITS);
+        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
         level += 1;
 
         i = pos >> log2_block_count;
 
-        sign = SHOW_SBITS(re, gb, 1);
-        SKIP_BITS(re, gb, 1);
+        sign = -get_bits1(gb);
         out[((pos & block_mask) << 6) + ctx->scan[i]] = ((level ^ sign) - sign);
     }
 
-    CLOSE_READER(re, gb);
     return 0;
 }