diff mbox series

[FFmpeg-devel,2/2] libavcodec/ffv1: Support storing LSB raw

Message ID 20241015231735.937409-2-michael@niedermayer.cc
State New
Headers show
Series [FFmpeg-devel,1/2] avcodec/ffv1: add a named constant for the quant table size | expand

Checks

Context Check Description
yinshiyou/configure_loongarch64 warning Failed to apply patch

Commit Message

Michael Niedermayer Oct. 15, 2024, 11:17 p.m. UTC
This makes a 16bit RGB raw sample 25% faster at a 2% loss of compression with rawlsb=4

Please test and comment

This stores the LSBs through non-binary range coding, which is simpler than using a
separate coder.
For cases where range coding is not wanted, it's probably best to use Golomb-Rice
for everything.

We also pass the LSBs through the decorrelation and context stages (which is basically free);
this leads to slightly better compression than separating them earlier.

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
---
 libavcodec/ffv1.h             |  2 ++
 libavcodec/ffv1_template.c    | 19 ++++++++++---------
 libavcodec/ffv1dec.c          |  2 ++
 libavcodec/ffv1dec_template.c | 16 +++++++++++++---
 libavcodec/ffv1enc.c          | 15 ++++++++++++++-
 libavcodec/ffv1enc_template.c | 17 +++++++++++++++--
 libavcodec/rangecoder.h       | 20 ++++++++++++++++++++
 libavcodec/tests/rangecoder.c |  9 +++++++++
 8 files changed, 85 insertions(+), 15 deletions(-)

Comments

Lynne Oct. 16, 2024, 12:13 a.m. UTC | #1
On 16/10/2024 01:17, Michael Niedermayer wrote:
> This makes a 16bit RGB raw sample 25% faster at a 2% loss of compression with rawlsb=4
> 
> Please test and comment
> 
> This stores the LSB through non binary range coding, this is simpler than using a
> separate coder
> For cases where range coding is not wanted its probably best to use golomb rice
> for everything.
> 
> We also pass the LSB through the decorrelation and context stages (which is basically free)
> this leads to slightly better compression than separating them earlier.
> 
> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
> ---
>   libavcodec/ffv1.h             |  2 ++
>   libavcodec/ffv1_template.c    | 19 ++++++++++---------
>   libavcodec/ffv1dec.c          |  2 ++
>   libavcodec/ffv1dec_template.c | 16 +++++++++++++---
>   libavcodec/ffv1enc.c          | 15 ++++++++++++++-
>   libavcodec/ffv1enc_template.c | 17 +++++++++++++++--
>   libavcodec/rangecoder.h       | 20 ++++++++++++++++++++
>   libavcodec/tests/rangecoder.c |  9 +++++++++
>   8 files changed, 85 insertions(+), 15 deletions(-)
> 
> diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
> index 4f5a8ab2be7..02bfc33f680 100644
> --- a/libavcodec/ffv1.h
> +++ b/libavcodec/ffv1.h
> @@ -83,6 +83,7 @@ typedef struct FFV1SliceContext {
>       int slice_coding_mode;
>       int slice_rct_by_coef;
>       int slice_rct_ry_coef;
> +    int rawlsb;
>   
>       // RefStruct reference, array of MAX_PLANES elements
>       PlaneContext *plane;
> @@ -139,6 +140,7 @@ typedef struct FFV1Context {
>       int key_frame_ok;
>       int context_model;
>       int qtable;
> +    int rawlsb;
>   
>       int bits_per_raw_sample;
>       int packed_at_lsb;
> diff --git a/libavcodec/ffv1_template.c b/libavcodec/ffv1_template.c
> index abb90a12e49..10206702ee8 100644
> --- a/libavcodec/ffv1_template.c
> +++ b/libavcodec/ffv1_template.c
> @@ -30,24 +30,25 @@ static inline int RENAME(predict)(TYPE *src, TYPE *last)
>   }
>   
>   static inline int RENAME(get_context)(const int16_t quant_table[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE],
> -                                      TYPE *src, TYPE *last, TYPE *last2)
> +                                      TYPE *src, TYPE *last, TYPE *last2, int rawlsb)
>   {
>       const int LT = last[-1];
>       const int T  = last[0];
>       const int RT = last[1];
>       const int L  = src[-1];
> +    const int rawoff = (1<<rawlsb) >> 1;
>   
>       if (quant_table[3][127] || quant_table[4][127]) {
>           const int TT = last2[0];
>           const int LL = src[-2];
> -        return quant_table[0][(L - LT) & MAX_QUANT_TABLE_MASK] +
> -               quant_table[1][(LT - T) & MAX_QUANT_TABLE_MASK] +
> -               quant_table[2][(T - RT) & MAX_QUANT_TABLE_MASK] +
> -               quant_table[3][(LL - L) & MAX_QUANT_TABLE_MASK] +
> -               quant_table[4][(TT - T) & MAX_QUANT_TABLE_MASK];
> +        return quant_table[0][(L - LT + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
> +               quant_table[1][(LT - T + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
> +               quant_table[2][(T - RT + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
> +               quant_table[3][(LL - L + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
> +               quant_table[4][(TT - T + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK];
>       } else
> -        return quant_table[0][(L - LT) & MAX_QUANT_TABLE_MASK] +
> -               quant_table[1][(LT - T) & MAX_QUANT_TABLE_MASK] +
> -               quant_table[2][(T - RT) & MAX_QUANT_TABLE_MASK];
> +        return quant_table[0][(L - LT + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
> +               quant_table[1][(LT - T + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
> +               quant_table[2][(T - RT + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK];
>   }
>   
> diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
> index 5c099e49ad4..fc96bfb4cea 100644
> --- a/libavcodec/ffv1dec.c
> +++ b/libavcodec/ffv1dec.c
> @@ -249,6 +249,8 @@ static int decode_slice_header(const FFV1Context *f,
>                   return AVERROR_INVALIDDATA;
>               }
>           }
> +        if (f->micro_version > 2)
> +            sc->rawlsb = get_symbol(c, state, 0);
>       }
>   
>       return 0;
> diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
> index 2da6bd935dc..dbdcad7768e 100644
> --- a/libavcodec/ffv1dec_template.c
> +++ b/libavcodec/ffv1dec_template.c
> @@ -60,8 +60,13 @@ RENAME(decode_line)(FFV1Context *f, FFV1SliceContext *sc,
>                   return AVERROR_INVALIDDATA;
>           }
>   
> -        context = RENAME(get_context)(quant_table,
> -                                      sample[1] + x, sample[0] + x, sample[1] + x);
> +        if (sc->rawlsb) {
> +            context = RENAME(get_context)(quant_table,
> +                                          sample[1] + x, sample[0] + x, sample[1] + x, sc->rawlsb);
> +        } else {
> +            context = RENAME(get_context)(quant_table,
> +                                          sample[1] + x, sample[0] + x, sample[1] + x, 0);
> +        }
>           if (context < 0) {
>               context = -context;
>               sign    = 1;
> @@ -71,7 +76,12 @@ RENAME(decode_line)(FFV1Context *f, FFV1SliceContext *sc,
>           av_assert2(context < p->context_count);
>   
>           if (ac != AC_GOLOMB_RICE) {
> -            diff = get_symbol_inline(c, p->state[context], 1);
> +            if (sc->rawlsb) {
> +                const int rawoff = (1<<sc->rawlsb) >> 1;
> +                diff = get_rac_raw(c, sc->rawlsb);
> +                diff += (get_symbol_inline(c, p->state[context], 1) << sc->rawlsb) - rawoff;
> +            } else
> +                diff = get_symbol_inline(c, p->state[context], 1);
>           } else {
>               if (context == 0 && run_mode == 0)
>                   run_mode = 1;
> diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
> index 0dbfebc1a1a..c574c739380 100644
> --- a/libavcodec/ffv1enc.c
> +++ b/libavcodec/ffv1enc.c
> @@ -416,7 +416,7 @@ static int write_extradata(FFV1Context *f)
>           if (f->version == 3) {
>               f->micro_version = 4;
>           } else if (f->version == 4)
> -            f->micro_version = 2;
> +            f->micro_version = 3;
>           put_symbol(&c, state, f->micro_version, 0);
>       }
>   
> @@ -564,6 +564,9 @@ static av_cold int encode_init(AVCodecContext *avctx)
>       if (s->ec == 2)
>           s->version = FFMAX(s->version, 4);
>   
> +    if (s->rawlsb)
> +        s->version = FFMAX(s->version, 4);
> +
>       if ((s->version == 2 || s->version>3) && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
>           av_log(avctx, AV_LOG_ERROR, "Version 2 or 4 needed for requested features but version 2 or 4 is experimental and not enabled\n");
>           return AVERROR_INVALIDDATA;
> @@ -716,6 +719,11 @@ static av_cold int encode_init(AVCodecContext *avctx)
>           }
>       }
>   
> +    if (s->rawlsb > s->bits_per_raw_sample) {
> +        av_log(avctx, AV_LOG_ERROR, "too many raw lsb\n");
> +        return AVERROR(EINVAL);
> +    }
> +
>       if (s->ac == AC_RANGE_CUSTOM_TAB) {
>           for (i = 1; i < 256; i++)
>               s->state_transition[i] = ver2_state[i];
> @@ -958,6 +966,7 @@ static void encode_slice_header(FFV1Context *f, FFV1SliceContext *sc)
>               put_symbol(c, state, sc->slice_rct_by_coef, 0);
>               put_symbol(c, state, sc->slice_rct_ry_coef, 0);
>           }
> +        put_symbol(c, state, sc->rawlsb, 0);
>       }
>   }
>   
> @@ -1077,6 +1086,8 @@ static int encode_slice(AVCodecContext *c, void *arg)
>           sc->slice_rct_ry_coef = 1;
>       }
>   
> +    sc->rawlsb = f->rawlsb; // we do not optimize this per slice, but other encoders could
> +
>   retry:
>       if (f->key_frame)
>           ff_ffv1_clear_slice_state(f, sc);
> @@ -1291,6 +1302,8 @@ static const AVOption options[] = {
>               { .i64 = 0 }, 0, 1, VE },
>       { "qtable", "Quantization table", OFFSET(qtable), AV_OPT_TYPE_INT,
>               { .i64 = -1 }, -1, 2, VE },
> +    { "rawlsb", "number of LSBs stored RAW", OFFSET(rawlsb), AV_OPT_TYPE_INT,
> +            { .i64 = 0 }, 0, 16, VE },
>   
>       { NULL }
>   };
> diff --git a/libavcodec/ffv1enc_template.c b/libavcodec/ffv1enc_template.c
> index bc14926ab95..848328c70af 100644
> --- a/libavcodec/ffv1enc_template.c
> +++ b/libavcodec/ffv1enc_template.c
> @@ -62,8 +62,14 @@ RENAME(encode_line)(FFV1Context *f, FFV1SliceContext *sc,
>       for (x = 0; x < w; x++) {
>           int diff, context;
>   
> -        context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
> -                                      sample[0] + x, sample[1] + x, sample[2] + x);
> +        if (f->rawlsb) {
> +            context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
> +                                        sample[0] + x, sample[1] + x, sample[2] + x, f->rawlsb);
> +        } else {
> +            //try to force a version with rawlsb optimized out
> +            context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
> +                                        sample[0] + x, sample[1] + x, sample[2] + x, 0);
> +        }
>           diff    = sample[0][x] - RENAME(predict)(sample[0] + x, sample[1] + x);
>   
>           if (context < 0) {
> @@ -74,6 +80,13 @@ RENAME(encode_line)(FFV1Context *f, FFV1SliceContext *sc,
>           diff = fold(diff, bits);
>   
>           if (ac != AC_GOLOMB_RICE) {
> +            if (f->rawlsb) {
> +                const int rawoff = (1<<f->rawlsb) >> 1;
> +                const unsigned mask = (1<<f->rawlsb) - 1;
> +                diff += rawoff;
> +                put_rac_raw(c, (diff & mask), f->rawlsb);
> +                diff = diff >> f->rawlsb; // Note, this will be biased on small rawlsb
> +            }
>               if (pass1) {
>                   put_symbol_inline(c, p->state[context], diff, 1, sc->rc_stat,
>                                     sc->rc_stat2[p->quant_table_index][context]);
> diff --git a/libavcodec/rangecoder.h b/libavcodec/rangecoder.h
> index 89d178ac314..d02a65fa7da 100644
> --- a/libavcodec/rangecoder.h
> +++ b/libavcodec/rangecoder.h
> @@ -111,6 +111,16 @@ static inline void put_rac(RangeCoder *c, uint8_t *const state, int bit)
>       renorm_encoder(c);
>   }
>   
> +static inline void put_rac_raw(RangeCoder *c, int bits, int len)
> +{
> +    int r = c->range >> len;
> +
> +    c->low += r * bits;
> +    c->range = r;
> +
> +    renorm_encoder(c);
> +}
> +
>   static inline void refill(RangeCoder *c)
>   {
>       if (c->range < 0x100) {
> @@ -142,4 +152,14 @@ static inline int get_rac(RangeCoder *c, uint8_t *const state)
>       }
>   }
>   
> +static inline int get_rac_raw(RangeCoder *c, int len)
> +{
> +    int r = c->range >> len;
> +    int bits = c->low / r;
> +    c->low -= r * bits;
> +    c->range = r;
> +    refill(c);
> +    return bits;
> +}
> +
>   #endif /* AVCODEC_RANGECODER_H */
> diff --git a/libavcodec/tests/rangecoder.c b/libavcodec/tests/rangecoder.c
> index fd858535a5b..9205be2bf3f 100644
> --- a/libavcodec/tests/rangecoder.c
> +++ b/libavcodec/tests/rangecoder.c
> @@ -76,6 +76,10 @@ int main(void)
>               for (i = 0; i < SIZE; i++)
>                   put_rac(&c, state, r[i] & 1);
>   
> +            for (i = 0; i < 30; i++) {
> +                put_rac_raw(&c, r[i]&7, 3);
> +            }
> +
>               actual_length = ff_rac_terminate(&c, version);
>   
>               ff_init_range_decoder(&c, b, version ? SIZE : actual_length);
> @@ -87,6 +91,11 @@ int main(void)
>                       av_log(NULL, AV_LOG_ERROR, "rac failure at %d pass %d version %d\n", i, p, version);
>                       return 1;
>                   }
> +            for (i = 0; i < 30; i++)
> +                if ((r[i] & 7) != get_rac_raw(&c, 3)) {
> +                    av_log(NULL, AV_LOG_ERROR, "rac raw failure at %d pass %d version %d\n", i, p, version);
> +                    return 1;
> +                }
>   
>               if (rac_check_termination(&c, version) < 0) {
>                   av_log(NULL, AV_LOG_ERROR, "rac failure at termination pass %d version %d\n", p, version);

You're interfering with the range coder by asking it to write very random 
data in between each symbol.
You should do what Opus does and write the raw bits in a separate buffer 
which gets merged at the very end.

I think rather than doing this, you should instead simply permit golomb 
coding to be used on high bit-depths.
Michael Niedermayer Oct. 16, 2024, 1:36 p.m. UTC | #2
On Wed, Oct 16, 2024 at 02:13:35AM +0200, Lynne via ffmpeg-devel wrote:
> On 16/10/2024 01:17, Michael Niedermayer wrote:
> > This makes a 16bit RGB raw sample 25% faster at a 2% loss of compression with rawlsb=4
> > 
> > Please test and comment
> > 
> > This stores the LSB through non binary range coding, this is simpler than using a
> > separate coder
> > For cases where range coding is not wanted its probably best to use golomb rice
> > for everything.
> > 
> > We also pass the LSB through the decorrelation and context stages (which is basically free)
> > this leads to slightly better compression than separating them earlier.
> > 
> > Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
> > ---
> >   libavcodec/ffv1.h             |  2 ++
> >   libavcodec/ffv1_template.c    | 19 ++++++++++---------
> >   libavcodec/ffv1dec.c          |  2 ++
> >   libavcodec/ffv1dec_template.c | 16 +++++++++++++---
> >   libavcodec/ffv1enc.c          | 15 ++++++++++++++-
> >   libavcodec/ffv1enc_template.c | 17 +++++++++++++++--
> >   libavcodec/rangecoder.h       | 20 ++++++++++++++++++++
> >   libavcodec/tests/rangecoder.c |  9 +++++++++
> >   8 files changed, 85 insertions(+), 15 deletions(-)
[...]
> > diff --git a/libavcodec/rangecoder.h b/libavcodec/rangecoder.h
> > index 89d178ac314..d02a65fa7da 100644
> > --- a/libavcodec/rangecoder.h
> > +++ b/libavcodec/rangecoder.h
> > @@ -111,6 +111,16 @@ static inline void put_rac(RangeCoder *c, uint8_t *const state, int bit)
> >       renorm_encoder(c);
> >   }
> > +static inline void put_rac_raw(RangeCoder *c, int bits, int len)
> > +{
> > +    int r = c->range >> len;
> > +
> > +    c->low += r * bits;
> > +    c->range = r;
> > +
> > +    renorm_encoder(c);
> > +}
> > +
> >   static inline void refill(RangeCoder *c)
> >   {
> >       if (c->range < 0x100) {
> > @@ -142,4 +152,14 @@ static inline int get_rac(RangeCoder *c, uint8_t *const state)
> >       }
> >   }
> > +static inline int get_rac_raw(RangeCoder *c, int len)
> > +{
> > +    int r = c->range >> len;
> > +    int bits = c->low / r;
> > +    c->low -= r * bits;
> > +    c->range = r;
> > +    refill(c);
> > +    return bits;
> > +}
> > +
[...]
>
> You're interfering with the rangecoder by asking it to write very random
> data in between each symbol.

The data is needed in that order for context modeling and decorrelation
to work.

At least with the CPU implementation we have, this gives the same speedup
but better compression, and it's simpler code.


> You should do what Opus does and write the rawbits in a separate buffer
> which gets merged at the very end.

I prefer what H.264 does for storing raw bits (get_cabac_bypass()),
and given that H.264, as a video codec, also works at much higher bitrates
than Opus does as an audio codec, that example seems closer to our use case.


> 
> I think rather than doing this, you should instead simply permit golomb
> coding to be used on high bit-depths.

Yes, or rather, not "instead" but "as well":
we should permit Golomb coding on high bit depths.

thx

[...]
Michael Niedermayer Oct. 16, 2024, 2:08 p.m. UTC | #3
On Wed, Oct 16, 2024 at 03:36:55PM +0200, Michael Niedermayer wrote:
> On Wed, Oct 16, 2024 at 02:13:35AM +0200, Lynne via ffmpeg-devel wrote:
> > On 16/10/2024 01:17, Michael Niedermayer wrote:
> > > This makes a 16bit RGB raw sample 25% faster at a 2% loss of compression with rawlsb=4
> > > 
> > > Please test and comment
> > > 
> > > This stores the LSB through non binary range coding, this is simpler than using a
> > > separate coder
> > > For cases where range coding is not wanted its probably best to use golomb rice
> > > for everything.
> > > 
> > > We also pass the LSB through the decorrelation and context stages (which is basically free)
> > > this leads to slightly better compression than separating them earlier.
> > > 
> > > Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
> > > ---
> > >   libavcodec/ffv1.h             |  2 ++
> > >   libavcodec/ffv1_template.c    | 19 ++++++++++---------
> > >   libavcodec/ffv1dec.c          |  2 ++
> > >   libavcodec/ffv1dec_template.c | 16 +++++++++++++---
> > >   libavcodec/ffv1enc.c          | 15 ++++++++++++++-
> > >   libavcodec/ffv1enc_template.c | 17 +++++++++++++++--
> > >   libavcodec/rangecoder.h       | 20 ++++++++++++++++++++
> > >   libavcodec/tests/rangecoder.c |  9 +++++++++
> > >   8 files changed, 85 insertions(+), 15 deletions(-)
> [...]
> > > diff --git a/libavcodec/rangecoder.h b/libavcodec/rangecoder.h
> > > index 89d178ac314..d02a65fa7da 100644
> > > --- a/libavcodec/rangecoder.h
> > > +++ b/libavcodec/rangecoder.h
> > > @@ -111,6 +111,16 @@ static inline void put_rac(RangeCoder *c, uint8_t *const state, int bit)
> > >       renorm_encoder(c);
> > >   }
> > > +static inline void put_rac_raw(RangeCoder *c, int bits, int len)
> > > +{
> > > +    int r = c->range >> len;
> > > +
> > > +    c->low += r * bits;
> > > +    c->range = r;
> > > +
> > > +    renorm_encoder(c);
> > > +}
> > > +
> > >   static inline void refill(RangeCoder *c)
> > >   {
> > >       if (c->range < 0x100) {
> > > @@ -142,4 +152,14 @@ static inline int get_rac(RangeCoder *c, uint8_t *const state)
> > >       }
> > >   }
> > > +static inline int get_rac_raw(RangeCoder *c, int len)
> > > +{
> > > +    int r = c->range >> len;
> > > +    int bits = c->low / r;
> > > +    c->low -= r * bits;
> > > +    c->range = r;
> > > +    refill(c);
> > > +    return bits;
> > > +}
> > > +
> [...]
> >
> > You're interfering with the rangecoder by asking it to write very random
> > data in between each symbol.
> 
> the data is needed in that order for context modeling and decorrelation
> to work.
> 
> At least with the CPU implementation we have this gives the same speedup
> but better compression and its simpler code

BTW, on a related note, would something like this:

diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
index dbdcad7768e..5d4d51cc070 100644
--- a/libavcodec/ffv1dec_template.c
+++ b/libavcodec/ffv1dec_template.c
@@ -39,6 +39,8 @@ RENAME(decode_line)(FFV1Context *f, FFV1SliceContext *sc,
     if (is_input_end(c, gb, ac))
         return AVERROR_INVALIDDATA;

+    c->range = 1<<av_log2(c->range);
+
     if (sc->slice_coding_mode == 1) {
         int i;
         for (x = 0; x < w; x++) {
diff --git a/libavcodec/ffv1enc_template.c b/libavcodec/ffv1enc_template.c
index 848328c70af..0a3cb8f28b9 100644
--- a/libavcodec/ffv1enc_template.c
+++ b/libavcodec/ffv1enc_template.c
@@ -40,6 +40,7 @@ RENAME(encode_line)(FFV1Context *f, FFV1SliceContext *sc,
             av_log(logctx, AV_LOG_ERROR, "encoded frame too large\n");
             return AVERROR_INVALIDDATA;
         }
+        c->range = 1<<av_log2(c->range);
     } else {
         if (put_bytes_left(&sc->pb, 0) < w * 4) {
             av_log(logctx, AV_LOG_ERROR, "encoded frame too large\n");

help a GPU implementation?
The idea here is to reduce the number of different states the range coder can be in
at the end of each line, so the next line has fewer states to consider,
but maybe I am totally wrong and misunderstanding the problem.

The above change seems to cost maybe 0.1-0.2% compression on the vsynth1 and vsynth2 tests.

thx

[...]
diff mbox series

Patch

diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
index 4f5a8ab2be7..02bfc33f680 100644
--- a/libavcodec/ffv1.h
+++ b/libavcodec/ffv1.h
@@ -83,6 +83,7 @@  typedef struct FFV1SliceContext {
     int slice_coding_mode;
     int slice_rct_by_coef;
     int slice_rct_ry_coef;
+    int rawlsb;
 
     // RefStruct reference, array of MAX_PLANES elements
     PlaneContext *plane;
@@ -139,6 +140,7 @@  typedef struct FFV1Context {
     int key_frame_ok;
     int context_model;
     int qtable;
+    int rawlsb;
 
     int bits_per_raw_sample;
     int packed_at_lsb;
diff --git a/libavcodec/ffv1_template.c b/libavcodec/ffv1_template.c
index abb90a12e49..10206702ee8 100644
--- a/libavcodec/ffv1_template.c
+++ b/libavcodec/ffv1_template.c
@@ -30,24 +30,25 @@  static inline int RENAME(predict)(TYPE *src, TYPE *last)
 }
 
 static inline int RENAME(get_context)(const int16_t quant_table[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE],
-                                      TYPE *src, TYPE *last, TYPE *last2)
+                                      TYPE *src, TYPE *last, TYPE *last2, int rawlsb)
 {
     const int LT = last[-1];
     const int T  = last[0];
     const int RT = last[1];
     const int L  = src[-1];
+    const int rawoff = (1<<rawlsb) >> 1;
 
     if (quant_table[3][127] || quant_table[4][127]) {
         const int TT = last2[0];
         const int LL = src[-2];
-        return quant_table[0][(L - LT) & MAX_QUANT_TABLE_MASK] +
-               quant_table[1][(LT - T) & MAX_QUANT_TABLE_MASK] +
-               quant_table[2][(T - RT) & MAX_QUANT_TABLE_MASK] +
-               quant_table[3][(LL - L) & MAX_QUANT_TABLE_MASK] +
-               quant_table[4][(TT - T) & MAX_QUANT_TABLE_MASK];
+        return quant_table[0][(L - LT + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
+               quant_table[1][(LT - T + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
+               quant_table[2][(T - RT + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
+               quant_table[3][(LL - L + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
+               quant_table[4][(TT - T + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK];
     } else
-        return quant_table[0][(L - LT) & MAX_QUANT_TABLE_MASK] +
-               quant_table[1][(LT - T) & MAX_QUANT_TABLE_MASK] +
-               quant_table[2][(T - RT) & MAX_QUANT_TABLE_MASK];
+        return quant_table[0][(L - LT + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
+               quant_table[1][(LT - T + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK] +
+               quant_table[2][(T - RT + rawoff >> rawlsb) & MAX_QUANT_TABLE_MASK];
 }
 
diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
index 5c099e49ad4..fc96bfb4cea 100644
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@@ -249,6 +249,8 @@  static int decode_slice_header(const FFV1Context *f,
                 return AVERROR_INVALIDDATA;
             }
         }
+        if (f->micro_version > 2)
+            sc->rawlsb = get_symbol(c, state, 0);
     }
 
     return 0;
diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
index 2da6bd935dc..dbdcad7768e 100644
--- a/libavcodec/ffv1dec_template.c
+++ b/libavcodec/ffv1dec_template.c
@@ -60,8 +60,13 @@  RENAME(decode_line)(FFV1Context *f, FFV1SliceContext *sc,
                 return AVERROR_INVALIDDATA;
         }
 
-        context = RENAME(get_context)(quant_table,
-                                      sample[1] + x, sample[0] + x, sample[1] + x);
+        if (sc->rawlsb) {
+            context = RENAME(get_context)(quant_table,
+                                          sample[1] + x, sample[0] + x, sample[1] + x, sc->rawlsb);
+        } else {
+            context = RENAME(get_context)(quant_table,
+                                          sample[1] + x, sample[0] + x, sample[1] + x, 0);
+        }
         if (context < 0) {
             context = -context;
             sign    = 1;
@@ -71,7 +76,12 @@  RENAME(decode_line)(FFV1Context *f, FFV1SliceContext *sc,
         av_assert2(context < p->context_count);
 
         if (ac != AC_GOLOMB_RICE) {
-            diff = get_symbol_inline(c, p->state[context], 1);
+            if (sc->rawlsb) {
+                const int rawoff = (1<<sc->rawlsb) >> 1;
+                diff = get_rac_raw(c, sc->rawlsb);
+                diff += (get_symbol_inline(c, p->state[context], 1) << sc->rawlsb) - rawoff;
+            } else
+                diff = get_symbol_inline(c, p->state[context], 1);
         } else {
             if (context == 0 && run_mode == 0)
                 run_mode = 1;
diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
index 0dbfebc1a1a..c574c739380 100644
--- a/libavcodec/ffv1enc.c
+++ b/libavcodec/ffv1enc.c
@@ -416,7 +416,7 @@  static int write_extradata(FFV1Context *f)
         if (f->version == 3) {
             f->micro_version = 4;
         } else if (f->version == 4)
-            f->micro_version = 2;
+            f->micro_version = 3;
         put_symbol(&c, state, f->micro_version, 0);
     }
 
@@ -564,6 +564,9 @@  static av_cold int encode_init(AVCodecContext *avctx)
     if (s->ec == 2)
         s->version = FFMAX(s->version, 4);
 
+    if (s->rawlsb)
+        s->version = FFMAX(s->version, 4);
+
     if ((s->version == 2 || s->version>3) && avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
         av_log(avctx, AV_LOG_ERROR, "Version 2 or 4 needed for requested features but version 2 or 4 is experimental and not enabled\n");
         return AVERROR_INVALIDDATA;
@@ -716,6 +719,11 @@  static av_cold int encode_init(AVCodecContext *avctx)
         }
     }
 
+    if (s->rawlsb > s->bits_per_raw_sample) {
+        av_log(avctx, AV_LOG_ERROR, "too many raw lsb\n");
+        return AVERROR(EINVAL);
+    }
+
     if (s->ac == AC_RANGE_CUSTOM_TAB) {
         for (i = 1; i < 256; i++)
             s->state_transition[i] = ver2_state[i];
@@ -958,6 +966,7 @@  static void encode_slice_header(FFV1Context *f, FFV1SliceContext *sc)
             put_symbol(c, state, sc->slice_rct_by_coef, 0);
             put_symbol(c, state, sc->slice_rct_ry_coef, 0);
         }
+        put_symbol(c, state, sc->rawlsb, 0);
     }
 }
 
@@ -1077,6 +1086,8 @@  static int encode_slice(AVCodecContext *c, void *arg)
         sc->slice_rct_ry_coef = 1;
     }
 
+    sc->rawlsb = f->rawlsb; // we do not optimize this per slice, but other encoders could
+
 retry:
     if (f->key_frame)
         ff_ffv1_clear_slice_state(f, sc);
@@ -1291,6 +1302,8 @@  static const AVOption options[] = {
             { .i64 = 0 }, 0, 1, VE },
     { "qtable", "Quantization table", OFFSET(qtable), AV_OPT_TYPE_INT,
             { .i64 = -1 }, -1, 2, VE },
+    { "rawlsb", "number of LSBs stored RAW", OFFSET(rawlsb), AV_OPT_TYPE_INT,
+            { .i64 = 0 }, 0, 16, VE },
 
     { NULL }
 };
diff --git a/libavcodec/ffv1enc_template.c b/libavcodec/ffv1enc_template.c
index bc14926ab95..848328c70af 100644
--- a/libavcodec/ffv1enc_template.c
+++ b/libavcodec/ffv1enc_template.c
@@ -62,8 +62,14 @@  RENAME(encode_line)(FFV1Context *f, FFV1SliceContext *sc,
     for (x = 0; x < w; x++) {
         int diff, context;
 
-        context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
-                                      sample[0] + x, sample[1] + x, sample[2] + x);
+        if (f->rawlsb) {
+            context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
+                                        sample[0] + x, sample[1] + x, sample[2] + x, f->rawlsb);
+        } else {
+            //try to force a version with rawlsb optimized out
+            context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
+                                        sample[0] + x, sample[1] + x, sample[2] + x, 0);
+        }
         diff    = sample[0][x] - RENAME(predict)(sample[0] + x, sample[1] + x);
 
         if (context < 0) {
@@ -74,6 +80,13 @@  RENAME(encode_line)(FFV1Context *f, FFV1SliceContext *sc,
         diff = fold(diff, bits);
 
         if (ac != AC_GOLOMB_RICE) {
+            if (f->rawlsb) {
+                const int rawoff = (1<<f->rawlsb) >> 1;
+                const unsigned mask = (1<<f->rawlsb) - 1;
+                diff += rawoff;
+                put_rac_raw(c, (diff & mask), f->rawlsb);
+                diff = diff >> f->rawlsb; // Note, this will be biased on small rawlsb
+            }
             if (pass1) {
                 put_symbol_inline(c, p->state[context], diff, 1, sc->rc_stat,
                                   sc->rc_stat2[p->quant_table_index][context]);
diff --git a/libavcodec/rangecoder.h b/libavcodec/rangecoder.h
index 89d178ac314..d02a65fa7da 100644
--- a/libavcodec/rangecoder.h
+++ b/libavcodec/rangecoder.h
@@ -111,6 +111,16 @@  static inline void put_rac(RangeCoder *c, uint8_t *const state, int bit)
     renorm_encoder(c);
 }
 
+static inline void put_rac_raw(RangeCoder *c, int bits, int len)
+{
+    int r = c->range >> len;
+
+    c->low += r * bits;
+    c->range = r;
+
+    renorm_encoder(c);
+}
+
 static inline void refill(RangeCoder *c)
 {
     if (c->range < 0x100) {
@@ -142,4 +152,14 @@  static inline int get_rac(RangeCoder *c, uint8_t *const state)
     }
 }
 
+static inline int get_rac_raw(RangeCoder *c, int len)
+{
+    int r = c->range >> len;
+    int bits = c->low / r;
+    c->low -= r * bits;
+    c->range = r;
+    refill(c);
+    return bits;
+}
+
 #endif /* AVCODEC_RANGECODER_H */
diff --git a/libavcodec/tests/rangecoder.c b/libavcodec/tests/rangecoder.c
index fd858535a5b..9205be2bf3f 100644
--- a/libavcodec/tests/rangecoder.c
+++ b/libavcodec/tests/rangecoder.c
@@ -76,6 +76,10 @@  int main(void)
             for (i = 0; i < SIZE; i++)
                 put_rac(&c, state, r[i] & 1);
 
+            for (i = 0; i < 30; i++) {
+                put_rac_raw(&c, r[i]&7, 3);
+            }
+
             actual_length = ff_rac_terminate(&c, version);
 
             ff_init_range_decoder(&c, b, version ? SIZE : actual_length);
@@ -87,6 +91,11 @@  int main(void)
                     av_log(NULL, AV_LOG_ERROR, "rac failure at %d pass %d version %d\n", i, p, version);
                     return 1;
                 }
+            for (i = 0; i < 30; i++)
+                if ((r[i] & 7) != get_rac_raw(&c, 3)) {
+                    av_log(NULL, AV_LOG_ERROR, "rac raw failure at %d pass %d version %d\n", i, p, version);
+                    return 1;
+                }
 
             if (rac_check_termination(&c, version) < 0) {
                 av_log(NULL, AV_LOG_ERROR, "rac failure at termination pass %d version %d\n", p, version);