diff mbox series

[FFmpeg-devel,3/7] proresdec2: use VLC for level instead of EC switch

Message ID 20230908081508.510-3-christophe.gisquet@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/7] proresdec2: port and fix for cached reader | expand

Checks

Context Check Description
andriy/make_x86 fail Make failed

Commit Message

Christophe Gisquet Sept. 8, 2023, 8:15 a.m. UTC
x86/x64: 61/52 -> 55/46
Around 7-10% speedup.

Run and DC do not lend themselves to such changes, likely because
their distribution is less skewed, and need larger average vlc read
iterations.
---
 libavcodec/proresdec.h  |  1 +
 libavcodec/proresdec2.c | 77 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 66 insertions(+), 12 deletions(-)

Comments

Andreas Rheinhardt Sept. 8, 2023, 8:44 a.m. UTC | #1
Christophe Gisquet:
> x86/x64: 61/52 -> 55/46
> Around 7-10% speedup.
> 
> Run and DC do not lend themselves to such changes, likely because
> their distribution is less skewed, and need larger average vlc read
> iterations.
> ---
>  libavcodec/proresdec.h  |  1 +
>  libavcodec/proresdec2.c | 77 ++++++++++++++++++++++++++++++++++-------
>  2 files changed, 66 insertions(+), 12 deletions(-)
> 
> diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
> index 1e48752e6f..7ebacaeb21 100644
> --- a/libavcodec/proresdec.h
> +++ b/libavcodec/proresdec.h
> @@ -22,6 +22,7 @@
>  #ifndef AVCODEC_PRORESDEC_H
>  #define AVCODEC_PRORESDEC_H
>  
> +#define CACHED_BITSTREAM_READER 1

This should be in the commit switching to the cached bitstream reader.

>  #include "get_bits.h"
>  #include "blockdsp.h"
>  #include "proresdsp.h"
> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
> index 65e8b01755..91c689d9ef 100644
> --- a/libavcodec/proresdec2.c
> +++ b/libavcodec/proresdec2.c
> @@ -24,17 +24,17 @@
>   * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
>   */
>  
> -#define CACHED_BITSTREAM_READER 1
> +//#define DEBUG
>  
>  #include "config_components.h"
>  
>  #include "libavutil/internal.h"
>  #include "libavutil/mem_internal.h"
> +#include "libavutil/thread.h"
>  
>  #include "avcodec.h"
>  #include "codec_internal.h"
>  #include "decode.h"
> -#include "get_bits.h"
>  #include "hwaccel_internal.h"
>  #include "hwconfig.h"
>  #include "idctdsp.h"
> @@ -129,8 +129,64 @@ static void unpack_alpha_12(GetBitContext *gb, uint16_t *dst, int num_coeffs,
>      }
>  }
>  
> +#define AC_BITS 12
> +#define PRORES_LEV_BITS 9
> +
> +static const uint8_t ac_info[] = { 0x04, 0x0A, 0x05, 0x06, 0x28, 0x4C };
> +static VLC ac_vlc[6];
> +
> +static av_cold void init_vlcs(void)
> +{
> +    int i;
> +    for (i = 0; i < sizeof(ac_info); i++) {

FF_ARRAY_ELEMS() is cleaner; also we support and prefer declarations
inside for-loops: for (int i = 0;

> +        uint32_t ac_codes[1<<AC_BITS];
> +        uint8_t ac_bits[1<<AC_BITS];
> +        unsigned int rice_order, exp_order, switch_bits, switch_val;
> +        int ac, max_bits = 0, codebook = ac_info[i];
> +
> +        /* number of prefix bits to switch between Rice and expGolomb */
> +        switch_bits = (codebook & 3);
> +        rice_order  =  codebook >> 5;       /* rice code order */
> +        exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
> +
> +        switch_val  = (switch_bits+1) << rice_order;
> +
> +        // Values are actually transformed, but this is more a wrapping
> +        for (ac = 0; ac <1<<AC_BITS; ac++) {
> +            int exponent, bits, val = ac;
> +            unsigned int code;
> +
> +            if (val >= switch_val) {
> +                val += (1 << exp_order) - switch_val;
> +                exponent = av_log2(val);
> +                bits = exponent+1+switch_bits-exp_order/*0*/ + exponent+1/*val*/;
> +                code = val;
> +            } else if (rice_order) {
> +                bits = (val >> rice_order)/*0*/ + 1/*1*/ + rice_order/*val*/;
> +                code = (1 << rice_order) | val;
> +            } else {
> +                bits = val/*0*/ + 1/*1*/;
> +                code = 1;
> +            }
> +            if (bits > max_bits) max_bits = bits;
> +            ac_bits [ac] = bits;
> +            ac_codes[ac] = code;
> +        }
> +
> +        ff_free_vlc(ac_vlc+i);

This is unnecessary, as the VLC is initially blank and is not
initialized multiple times.

> +
> +        if (init_vlc(ac_vlc+i, PRORES_LEV_BITS, 1<<AC_BITS,
> +                     ac_bits, 1, 1, ac_codes, 4, 4, 0) < 0) {
> +            av_log(NULL, AV_LOG_ERROR, "Error for %d(0x%02X), max bits %d\n",
> +                   i, codebook, max_bits);
> +            break; //return AVERROR_BUG;

This is not how you initialize a static table (you miss the
INIT_VLC_USE_NEW_STATIC flag and don't set the static store buffer).
Search for INIT_VLC_STATIC_OVERLONG for an idea of how to do it.

> +        }
> +    }
> +}
> +
>  static av_cold int decode_init(AVCodecContext *avctx)
>  {
> +    static AVOnce init_static_once = AV_ONCE_INIT;
>      int ret = 0;
>      ProresContext *ctx = avctx->priv_data;
>      uint8_t idct_permutation[64];
> @@ -184,6 +240,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
>  
>      ctx->pix_fmt = AV_PIX_FMT_NONE;
>  
> +    // init dc_tables
> +    ff_thread_once(&init_static_once, init_vlcs);
> +
>      if (avctx->bits_per_raw_sample == 10){
>          ctx->unpack_alpha = unpack_alpha_10;
>      } else if (avctx->bits_per_raw_sample == 12){
> @@ -510,7 +569,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>      return 0;
>  }
>  
> -// adaptive codebook switching lut according to previous run/level values
> +// adaptive codebook switching lut according to previous run values
>  static const char run_to_cb[16][4] = {
>      { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
>      { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
> @@ -518,12 +577,6 @@ static const char run_to_cb[16][4] = {
>      { 0, 2,  3, -4 }
>  };
>  
> -static const char lev_to_cb[10][4] = {
> -    { 0, 0,  1, -1 }, { 2, 0,  0, -1 }, { 1, 0, 0,  0 }, { 2, 0, -1,  1 }, { 0, 0, 1, -1 },
> -    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 },
> -    { 0, 2,  3, -4 }
> -};
> -
>  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
>                                               int16_t *out, int blocks_per_slice)
>  {
> @@ -540,8 +593,9 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>      block_mask = blocks_per_slice - 1;
>  
>      for (pos = block_mask;;) {
> +        static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4, 5 };
> +        const VLC* tbl = ac_vlc + ctx_to_tbl[FFMIN(level, 9)];
>          unsigned int runcb = FFMIN(run,  15);
> -        unsigned int levcb = FFMIN(level, 9);
>          bits_rem = get_bits_left(gb);
>          if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
>              break;
> @@ -554,8 +608,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>              return AVERROR_INVALIDDATA;
>          }
>  
> -        DECODE_CODEWORD2(level, lev_to_cb[levcb][0], lev_to_cb[levcb][1],
> -                                lev_to_cb[levcb][2], lev_to_cb[levcb][3]);
> +        level = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
>          level += 1;
>  
>          i = pos >> log2_block_count;
Andreas Rheinhardt Sept. 8, 2023, 9:58 a.m. UTC | #2
Andreas Rheinhardt:
> Christophe Gisquet:
>> x86/x64: 61/52 -> 55/46
>> Around 7-10% speedup.
>>
>> Run and DC do not lend themselves to such changes, likely because
>> their distribution is less skewed, and need larger average vlc read
>> iterations.
>> ---
>>  libavcodec/proresdec.h  |  1 +
>>  libavcodec/proresdec2.c | 77 ++++++++++++++++++++++++++++++++++-------
>>  2 files changed, 66 insertions(+), 12 deletions(-)
>>
>> diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
>> index 1e48752e6f..7ebacaeb21 100644
>> --- a/libavcodec/proresdec.h
>> +++ b/libavcodec/proresdec.h
>> @@ -22,6 +22,7 @@
>>  #ifndef AVCODEC_PRORESDEC_H
>>  #define AVCODEC_PRORESDEC_H
>>  
>> +#define CACHED_BITSTREAM_READER 1
> 
> This should be in the commit switching to the cached bitstream reader.

Correction: This header is included in videotoolbox.c and there is other
stuff that also includes get_bits.h included in said file (and currently
gets included before proresdec.h). This means that proresdec2.c and
videotoolbox.c will have different opinions on what a GetBitContext is:
It will be the non-cached one in videotoolbox.c and the cached one in
proresdec2.c. This will work in practice, because ProresContext does not
need the complete GetBitContext type at all (it does not contain a
GetBitContext at all), so offsets are not affected. But it is
nevertheless undefined behaviour and could become dangerous when using LTO.

So you should switch the type of the pointer to BitstreamContextBE* in
proresdec2.h. Furthermore, you can either include bitstream.h in
proresdec.h or (IMO better) use a forward declaration and struct
BitstreamContextBE* in the function pointer without including get_bits.h
in the header at all.

> 
>>  #include "get_bits.h"
>>  #include "blockdsp.h"
>>  #include "proresdsp.h"
>> diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
>> index 65e8b01755..91c689d9ef 100644
>> --- a/libavcodec/proresdec2.c
>> +++ b/libavcodec/proresdec2.c
>> @@ -24,17 +24,17 @@
>>   * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
>>   */
>>  
>> -#define CACHED_BITSTREAM_READER 1
>> +//#define DEBUG
>>  
>>  #include "config_components.h"
>>  
>>  #include "libavutil/internal.h"
>>  #include "libavutil/mem_internal.h"
>> +#include "libavutil/thread.h"
>>  
>>  #include "avcodec.h"
>>  #include "codec_internal.h"
>>  #include "decode.h"
>> -#include "get_bits.h"
>>  #include "hwaccel_internal.h"
>>  #include "hwconfig.h"
>>  #include "idctdsp.h"
>> @@ -129,8 +129,64 @@ static void unpack_alpha_12(GetBitContext *gb, uint16_t *dst, int num_coeffs,
>>      }
>>  }
>>  
>> +#define AC_BITS 12
>> +#define PRORES_LEV_BITS 9
>> +
>> +static const uint8_t ac_info[] = { 0x04, 0x0A, 0x05, 0x06, 0x28, 0x4C };
>> +static VLC ac_vlc[6];
>> +
>> +static av_cold void init_vlcs(void)
>> +{
>> +    int i;
>> +    for (i = 0; i < sizeof(ac_info); i++) {
> 
> FF_ARRAY_ELEMS() is cleaner; also we support and prefer declarations
> inside for-loops: for (int i = 0;
> 
>> +        uint32_t ac_codes[1<<AC_BITS];
>> +        uint8_t ac_bits[1<<AC_BITS];
>> +        unsigned int rice_order, exp_order, switch_bits, switch_val;
>> +        int ac, max_bits = 0, codebook = ac_info[i];
>> +
>> +        /* number of prefix bits to switch between Rice and expGolomb */
>> +        switch_bits = (codebook & 3);
>> +        rice_order  =  codebook >> 5;       /* rice code order */
>> +        exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
>> +
>> +        switch_val  = (switch_bits+1) << rice_order;
>> +
>> +        // Values are actually transformed, but this is more a wrapping
>> +        for (ac = 0; ac <1<<AC_BITS; ac++) {
>> +            int exponent, bits, val = ac;
>> +            unsigned int code;
>> +
>> +            if (val >= switch_val) {
>> +                val += (1 << exp_order) - switch_val;
>> +                exponent = av_log2(val);
>> +                bits = exponent+1+switch_bits-exp_order/*0*/ + exponent+1/*val*/;
>> +                code = val;
>> +            } else if (rice_order) {
>> +                bits = (val >> rice_order)/*0*/ + 1/*1*/ + rice_order/*val*/;
>> +                code = (1 << rice_order) | val;
>> +            } else {
>> +                bits = val/*0*/ + 1/*1*/;
>> +                code = 1;
>> +            }
>> +            if (bits > max_bits) max_bits = bits;
>> +            ac_bits [ac] = bits;
>> +            ac_codes[ac] = code;
>> +        }
>> +
>> +        ff_free_vlc(ac_vlc+i);
> 
> This is unnecessary, as the VLC is initially blank and is not
> initialized multiple times.
> 
>> +
>> +        if (init_vlc(ac_vlc+i, PRORES_LEV_BITS, 1<<AC_BITS,
>> +                     ac_bits, 1, 1, ac_codes, 4, 4, 0) < 0) {
>> +            av_log(NULL, AV_LOG_ERROR, "Error for %d(0x%02X), max bits %d\n",
>> +                   i, codebook, max_bits);
>> +            break; //return AVERROR_BUG;
> 
> This is not how you initialize a static table (you miss the
> INIT_VLC_USE_NEW_STATIC flag and don't set the static store buffer).
> Search for INIT_VLC_STATIC_OVERLONG for an idea of how to do it.
> 
>> +        }
>> +    }
>> +}
>> +
>>  static av_cold int decode_init(AVCodecContext *avctx)
>>  {
>> +    static AVOnce init_static_once = AV_ONCE_INIT;
>>      int ret = 0;
>>      ProresContext *ctx = avctx->priv_data;
>>      uint8_t idct_permutation[64];
>> @@ -184,6 +240,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
>>  
>>      ctx->pix_fmt = AV_PIX_FMT_NONE;
>>  
>> +    // init dc_tables
>> +    ff_thread_once(&init_static_once, init_vlcs);
>> +
>>      if (avctx->bits_per_raw_sample == 10){
>>          ctx->unpack_alpha = unpack_alpha_10;
>>      } else if (avctx->bits_per_raw_sample == 12){
>> @@ -510,7 +569,7 @@ static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
>>      return 0;
>>  }
>>  
>> -// adaptive codebook switching lut according to previous run/level values
>> +// adaptive codebook switching lut according to previous run values
>>  static const char run_to_cb[16][4] = {
>>      { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
>>      { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
>> @@ -518,12 +577,6 @@ static const char run_to_cb[16][4] = {
>>      { 0, 2,  3, -4 }
>>  };
>>  
>> -static const char lev_to_cb[10][4] = {
>> -    { 0, 0,  1, -1 }, { 2, 0,  0, -1 }, { 1, 0, 0,  0 }, { 2, 0, -1,  1 }, { 0, 0, 1, -1 },
>> -    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 },
>> -    { 0, 2,  3, -4 }
>> -};
>> -
>>  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
>>                                               int16_t *out, int blocks_per_slice)
>>  {
>> @@ -540,8 +593,9 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>>      block_mask = blocks_per_slice - 1;
>>  
>>      for (pos = block_mask;;) {
>> +        static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4, 5 };
>> +        const VLC* tbl = ac_vlc + ctx_to_tbl[FFMIN(level, 9)];
>>          unsigned int runcb = FFMIN(run,  15);
>> -        unsigned int levcb = FFMIN(level, 9);
>>          bits_rem = get_bits_left(gb);
>>          if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
>>              break;
>> @@ -554,8 +608,7 @@ static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
>>              return AVERROR_INVALIDDATA;
>>          }
>>  
>> -        DECODE_CODEWORD2(level, lev_to_cb[levcb][0], lev_to_cb[levcb][1],
>> -                                lev_to_cb[levcb][2], lev_to_cb[levcb][3]);
>> +        level = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
>>          level += 1;
>>  
>>          i = pos >> log2_block_count;
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Christophe Gisquet Sept. 10, 2023, 3:28 p.m. UTC | #3
Hello,

Le ven. 8 sept. 2023 à 11:57, Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> a écrit :
> >> +#define CACHED_BITSTREAM_READER 1
> >
> > This should be in the commit switching to the cached bitstream reader.
>
> Correction: This header is included in videotoolbox.c and there is other
> stuff that also includes get_bits.h included in said file (and currently
> gets included before proresdec.h). This means that proresdec2.c and
> videotoolbox.c will have different opinions on what a GetBitContext is:
> It will be the non-cached one in videotoolbox.c and the cached one in
> proresdec2.c. This will work in practice, because ProresContext does not
> need the complete GetBitContext type at all (it does not contain a
> GetBitContext at all), so offsets are not affected. But it is
> nevertheless undefined behaviour and could become dangerous when using LTO.
>
> So you should switch the type of the pointer to BitstreamContextBE* in
> proresdec2.h. Furthermore, you can either include bitstream.h in
> proresdec.h or (IMO better) use a forward declaration and struct
> BitstreamContextBE* in the function pointer without including get_bits.h
> in the header at all.

On that point only: I don't recall (yes that's 3+ years old) the issue
being videotoolbox, it didn't have that include back when I wrote this
code.

It's a very faint recollection, and I don't find proof in the ffmpeg
code today or of 3+ years ago, but the problem you mention was
happening with the encoder instead. So maybe the fix now is needed
only by videotoolbox then.
Andreas Rheinhardt Sept. 10, 2023, 3:41 p.m. UTC | #4
Christophe Gisquet:
> Hello,
> 
> Le ven. 8 sept. 2023 à 11:57, Andreas Rheinhardt
> <andreas.rheinhardt@outlook.com> a écrit :
>>>> +#define CACHED_BITSTREAM_READER 1
>>>
>>> This should be in the commit switching to the cached bitstream reader.
>>
>> Correction: This header is included in videotoolbox.c and there is other
>> stuff that also includes get_bits.h included in said file (and currently
>> gets included before proresdec.h). This means that proresdec2.c and
>> videotoolbox.c will have different opinions on what a GetBitContext is:
>> It will be the non-cached one in videotoolbox.c and the cached one in
>> proresdec2.c. This will work in practice, because ProresContext does not
>> need the complete GetBitContext type at all (it does not contain a
>> GetBitContext at all), so offsets are not affected. But it is
>> nevertheless undefined behaviour and could become dangerous when using LTO.
>>
>> So you should switch the type of the pointer to BitstreamContextBE* in
>> proresdec2.h. Furthermore, you can either include bitstream.h in
>> proresdec.h or (IMO better) use a forward declaration and struct
>> BitstreamContextBE* in the function pointer without including get_bits.h
>> in the header at all.
> 
> On that point only: I don't recall (yes that's 3+ years old) the issue
> being videotoolbox, it didn't have that include back when I wrote this
> code.
> 
> It's a very faint recollection, and I don't find proof in the ffmpeg
> code today or of 3+ years ago, but the problem you mention was
> happening with the encoder instead. So maybe the fix now is needed
> only by videotoolbox then.
> 

The videotoolbox prores hwaccel has only been added in November 2021;
before that, nobody but proresdec2.c included proresdec.h, so that there
was no issue with GetBitContext being cached or not in different files.

Another solution would be to use void* instead of GetBitContext* in the
header and in the implementation and then convert this void* to
GetBitContext* in the function.

I do not know what you mean by "the encoder instead". What problem
happens with the encoder? Why would the encoder include proresdec.h at
all and why would it be affected by changes to the decoder?

- Andreas
Christophe Gisquet Sept. 10, 2023, 3:56 p.m. UTC | #5
Hello

Le dim. 10 sept. 2023 à 17:40, Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> a écrit :
> Another solution would be to use void* instead of GetBitContext* in the
> header and in the implementation and then convert this void* to
> GetBitContext* in the function.

The forward declaration will be enough.

> I do not know what you mean by "the encoder instead". What problem
> happens with the encoder? Why would the encoder include proresdec.h at
> all and why would it be affected by changes to the decoder?

The keyword was "faint", and it's a non-issue now. Just explaining why
I would have had that code change that now appears as an attempted fix
for videotoolbox, but really was never meant that way: pure luck, the
actual reason being lost to time.
diff mbox series

Patch

diff --git a/libavcodec/proresdec.h b/libavcodec/proresdec.h
index 1e48752e6f..7ebacaeb21 100644
--- a/libavcodec/proresdec.h
+++ b/libavcodec/proresdec.h
@@ -22,6 +22,7 @@ 
 #ifndef AVCODEC_PRORESDEC_H
 #define AVCODEC_PRORESDEC_H
 
+#define CACHED_BITSTREAM_READER 1
 #include "get_bits.h"
 #include "blockdsp.h"
 #include "proresdsp.h"
diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index 65e8b01755..91c689d9ef 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -24,17 +24,17 @@ 
  * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444), 'ap4x' (4444 XQ)
  */
 
-#define CACHED_BITSTREAM_READER 1
+//#define DEBUG
 
 #include "config_components.h"
 
 #include "libavutil/internal.h"
 #include "libavutil/mem_internal.h"
+#include "libavutil/thread.h"
 
 #include "avcodec.h"
 #include "codec_internal.h"
 #include "decode.h"
-#include "get_bits.h"
 #include "hwaccel_internal.h"
 #include "hwconfig.h"
 #include "idctdsp.h"
@@ -129,8 +129,64 @@  static void unpack_alpha_12(GetBitContext *gb, uint16_t *dst, int num_coeffs,
     }
 }
 
+#define AC_BITS 12
+#define PRORES_LEV_BITS 9
+
+static const uint8_t ac_info[] = { 0x04, 0x0A, 0x05, 0x06, 0x28, 0x4C };
+static VLC ac_vlc[6];
+
+static av_cold void init_vlcs(void)
+{
+    int i;
+    for (i = 0; i < sizeof(ac_info); i++) {
+        uint32_t ac_codes[1<<AC_BITS];
+        uint8_t ac_bits[1<<AC_BITS];
+        unsigned int rice_order, exp_order, switch_bits, switch_val;
+        int ac, max_bits = 0, codebook = ac_info[i];
+
+        /* number of prefix bits to switch between Rice and expGolomb */
+        switch_bits = (codebook & 3);
+        rice_order  =  codebook >> 5;       /* rice code order */
+        exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
+
+        switch_val  = (switch_bits+1) << rice_order;
+
+        // Values are actually transformed, but this is more a wrapping
+        for (ac = 0; ac <1<<AC_BITS; ac++) {
+            int exponent, bits, val = ac;
+            unsigned int code;
+
+            if (val >= switch_val) {
+                val += (1 << exp_order) - switch_val;
+                exponent = av_log2(val);
+                bits = exponent+1+switch_bits-exp_order/*0*/ + exponent+1/*val*/;
+                code = val;
+            } else if (rice_order) {
+                bits = (val >> rice_order)/*0*/ + 1/*1*/ + rice_order/*val*/;
+                code = (1 << rice_order) | val;
+            } else {
+                bits = val/*0*/ + 1/*1*/;
+                code = 1;
+            }
+            if (bits > max_bits) max_bits = bits;
+            ac_bits [ac] = bits;
+            ac_codes[ac] = code;
+        }
+
+        ff_free_vlc(ac_vlc+i);
+
+        if (init_vlc(ac_vlc+i, PRORES_LEV_BITS, 1<<AC_BITS,
+                     ac_bits, 1, 1, ac_codes, 4, 4, 0) < 0) {
+            av_log(NULL, AV_LOG_ERROR, "Error for %d(0x%02X), max bits %d\n",
+                   i, codebook, max_bits);
+            break; //return AVERROR_BUG;
+        }
+    }
+}
+
 static av_cold int decode_init(AVCodecContext *avctx)
 {
+    static AVOnce init_static_once = AV_ONCE_INIT;
     int ret = 0;
     ProresContext *ctx = avctx->priv_data;
     uint8_t idct_permutation[64];
@@ -184,6 +240,9 @@  static av_cold int decode_init(AVCodecContext *avctx)
 
     ctx->pix_fmt = AV_PIX_FMT_NONE;
 
+    // init dc_tables
+    ff_thread_once(&init_static_once, init_vlcs);
+
     if (avctx->bits_per_raw_sample == 10){
         ctx->unpack_alpha = unpack_alpha_10;
     } else if (avctx->bits_per_raw_sample == 12){
@@ -510,7 +569,7 @@  static av_always_inline int decode_dc_coeffs(GetBitContext *gb, int16_t *out,
     return 0;
 }
 
-// adaptive codebook switching lut according to previous run/level values
+// adaptive codebook switching lut according to previous run values
 static const char run_to_cb[16][4] = {
     { 2, 0, -1,  1 }, { 2, 0, -1,  1 }, { 1, 0, 0,  0 }, { 1, 0,  0,  0 }, { 0, 0, 1, -1 },
     { 1, 1,  1,  0 }, { 1, 1,  1,  0 }, { 1, 1, 1,  0 }, { 1, 1,  1,  0 },
@@ -518,12 +577,6 @@  static const char run_to_cb[16][4] = {
     { 0, 2,  3, -4 }
 };
 
-static const char lev_to_cb[10][4] = {
-    { 0, 0,  1, -1 }, { 2, 0,  0, -1 }, { 1, 0, 0,  0 }, { 2, 0, -1,  1 }, { 0, 0, 1, -1 },
-    { 0, 1,  2, -2 }, { 0, 1,  2, -2 }, { 0, 1, 2, -2 }, { 0, 1,  2, -2 },
-    { 0, 2,  3, -4 }
-};
-
 static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
                                              int16_t *out, int blocks_per_slice)
 {
@@ -540,8 +593,9 @@  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
     block_mask = blocks_per_slice - 1;
 
     for (pos = block_mask;;) {
+        static const uint8_t ctx_to_tbl[] = { 0, 1, 2, 3, 0, 4, 4, 4, 4, 5 };
+        const VLC* tbl = ac_vlc + ctx_to_tbl[FFMIN(level, 9)];
         unsigned int runcb = FFMIN(run,  15);
-        unsigned int levcb = FFMIN(level, 9);
         bits_rem = get_bits_left(gb);
         if (!bits_rem || (bits_rem < 16 && !show_bits(gb, bits_rem)))
             break;
@@ -554,8 +608,7 @@  static av_always_inline int decode_ac_coeffs(AVCodecContext *avctx, GetBitContex
             return AVERROR_INVALIDDATA;
         }
 
-        DECODE_CODEWORD2(level, lev_to_cb[levcb][0], lev_to_cb[levcb][1],
-                                lev_to_cb[levcb][2], lev_to_cb[levcb][3]);
+        level = get_vlc2(gb, tbl->table, PRORES_LEV_BITS, 3);
         level += 1;
 
         i = pos >> log2_block_count;