diff mbox series

[FFmpeg-devel,13/39] lavc/ffv1: drop redundant PlaneContext.quant_table

Message ID 20240716171155.31838-13-anton@khirnov.net
State New
Headers show
Series [FFmpeg-devel,01/39] tests/fate/vcodec: add vsynth tests for FFV1 version 2 | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Anton Khirnov July 16, 2024, 5:11 p.m. UTC
It is a copy of FFV1Context.quant_tables[quant_table_index].
---
 libavcodec/ffv1.h             |  1 -
 libavcodec/ffv1_template.c    | 22 +++++++++++-----------
 libavcodec/ffv1dec.c          | 28 ++++++++++++----------------
 libavcodec/ffv1dec_template.c | 14 +++++++++-----
 libavcodec/ffv1enc.c          | 24 ++++++++++++------------
 libavcodec/ffv1enc_template.c | 13 ++++++++-----
 6 files changed, 52 insertions(+), 50 deletions(-)

Comments

Michael Niedermayer July 17, 2024, 10:32 p.m. UTC | #1
On Tue, Jul 16, 2024 at 07:11:28PM +0200, Anton Khirnov wrote:
> It is a copy of FFV1Context.quant_tables[quant_table_index].
> ---
>  libavcodec/ffv1.h             |  1 -
>  libavcodec/ffv1_template.c    | 22 +++++++++++-----------
>  libavcodec/ffv1dec.c          | 28 ++++++++++++----------------
>  libavcodec/ffv1dec_template.c | 14 +++++++++-----
>  libavcodec/ffv1enc.c          | 24 ++++++++++++------------
>  libavcodec/ffv1enc_template.c | 13 ++++++++-----
>  6 files changed, 52 insertions(+), 50 deletions(-)
> 
> diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
> index 4d57172d5b..a87c2d2a36 100644
> --- a/libavcodec/ffv1.h
> +++ b/libavcodec/ffv1.h
> @@ -59,7 +59,6 @@ typedef struct VlcState {
>  } VlcState;
>  
>  typedef struct PlaneContext {
> -    int16_t quant_table[MAX_CONTEXT_INPUTS][256];
>      int quant_table_index;
>      int context_count;
>      uint8_t (*state)[CONTEXT_SIZE];
> diff --git a/libavcodec/ffv1_template.c b/libavcodec/ffv1_template.c
> index c5f61b0182..d15ad11021 100644
> --- a/libavcodec/ffv1_template.c
> +++ b/libavcodec/ffv1_template.c
> @@ -29,25 +29,25 @@ static inline int RENAME(predict)(TYPE *src, TYPE *last)
>      return mid_pred(L, L + T - LT, T);
>  }
>  
> -static inline int RENAME(get_context)(PlaneContext *p, TYPE *src,
> -                                      TYPE *last, TYPE *last2)
> +static inline int RENAME(get_context)(const int16_t quant_table[MAX_CONTEXT_INPUTS][256],
> +                                      TYPE *src, TYPE *last, TYPE *last2)
>  {
>      const int LT = last[-1];
>      const int T  = last[0];
>      const int RT = last[1];
>      const int L  = src[-1];
>  
> -    if (p->quant_table[3][127] || p->quant_table[4][127]) {
> +    if (quant_table[3][127] || quant_table[4][127]) {

the data for each decoder task should be together and not scattered around
more than needed, reducing cache efficiency


[...]
> diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
> index 66d9f63c1a..618020d10f 100644
> --- a/libavcodec/ffv1dec.c
> +++ b/libavcodec/ffv1dec.c
> @@ -117,7 +117,8 @@ static int is_input_end(FFV1Context *s, GetBitContext *gb)
>  #define RENAME(name) name ## 32
>  #include "ffv1dec_template.c"
>  
> -static int decode_plane(FFV1Context *s, FFV1SliceContext *sc,
> +static int decode_plane(FFV1Context *f,
> +                        FFV1Context *s, FFV1SliceContext *sc,

[...]
>  static av_always_inline int
> -RENAME(decode_line)(FFV1Context *s, FFV1SliceContext *sc, GetBitContext *gb,
> +RENAME(decode_line)(FFV1Context *f,
> +                    FFV1Context *s, FFV1SliceContext *sc,
> +                    GetBitContext *gb,
>                      int w, TYPE *sample[2], int plane_index, int bits)
>  {
>      PlaneContext *const p = &s->plane[plane_index];
> @@ -57,7 +59,8 @@ RENAME(decode_line)(FFV1Context *s, FFV1SliceContext *sc, GetBitContext *gb,
>                  return AVERROR_INVALIDDATA;
>          }
>  
> -        context = RENAME(get_context)(p, sample[1] + x, sample[0] + x, sample[1] + x);
> +        context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
> +                                      sample[1] + x, sample[0] + x, sample[1] + x);

putting all this extra code in the inner per pixel loop is not ok
especially not for the sake of avoiding a memcpy of a few hundread bytes multiple levels of loops outside

[...]

thx
Anton Khirnov July 18, 2024, 8:20 a.m. UTC | #2
Quoting Michael Niedermayer (2024-07-18 00:32:38)
> the data for each decoder task should be together and not scattered around
> more than needed, reducing cache efficiency
> 
> putting all this extra code in the inner per pixel loop is not ok
> especially not for the sake of avoiding a memcpy of a few hundread bytes multiple levels of loops outside

A nice theory, but in practice this patchset makes single-threaded
decoding about 4% faster overall, on a 1920x1080 10bit sample. That's
just the ffv1 parts (up to patch 28), full set also improves frame
threading performance as follows:
threads         improvement
---------------------------
2                  52% (yes really)
4                  16%
8                  12%
Michael Niedermayer July 18, 2024, 2:31 p.m. UTC | #3
On Thu, Jul 18, 2024 at 10:20:09AM +0200, Anton Khirnov wrote:
> Quoting Michael Niedermayer (2024-07-18 00:32:38)
> > the data for each decoder task should be together and not scattered around
> > more than needed, reducing cache efficiency
> > 
> > putting all this extra code in the inner per pixel loop is not ok
> > especially not for the sake of avoiding a memcpy of a few hundread bytes multiple levels of loops outside
> 

> A nice theory,

that doesnt sound like a friendly description


> but in practice

> this patchset

"this patchset" isnt "this patch", nor does any improvment from "this patchset"
depend on the change of "this patch"
In fact it would probably benefit from droping this patch

thx


[...]
Anton Khirnov July 18, 2024, 3:14 p.m. UTC | #4
Quoting Michael Niedermayer (2024-07-18 16:31:51)
> On Thu, Jul 18, 2024 at 10:20:09AM +0200, Anton Khirnov wrote:
> > Quoting Michael Niedermayer (2024-07-18 00:32:38)
> > > the data for each decoder task should be together and not scattered around
> > > more than needed, reducing cache efficiency
> > > 
> > > putting all this extra code in the inner per pixel loop is not ok
> > > especially not for the sake of avoiding a memcpy of a few hundread bytes multiple levels of loops outside
> > 
> 
> > A nice theory,
> 
> that doesnt sound like a friendly description

And this does not look like a friendly review. I am getting a very
strong impression that you're looking for reasons to object to the
patches, rather seriously review them.

> > but in practice
> 
> > this patchset
> 
> "this patchset" isnt "this patch", nor does any improvment from "this patchset"
> depend on the change of "this patch"
> In fact it would probably benefit from droping this patch

Just what would that benefit be?

Storing and copying around multiple copies of the same data is generally
bad for readability, as it requires more cognitive effort to understand
the code - which is why I wrote this patch in the first place. It is
also inefficient in terms of overall memory use and cache utilization,
but I expect the impact of that to be small.

If you're concerned about dereferencing a pointer in an inner loop, I
can add a temporary variable to decode_line() as below, but removing
duplicated data seems like an unambiguous improvement to me.

diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
index 97a28b085a..d68bbda5be 100644
--- a/libavcodec/ffv1dec_template.c
+++ b/libavcodec/ffv1dec_template.c
@@ -30,6 +30,7 @@ RENAME(decode_line)(FFV1Context *f,
 {
     PlaneContext *const p = &s->plane[plane_index];
     RangeCoder *const c   = &s->c;
+    const int16_t (*quant_table)[256] = f->quant_tables[p->quant_table_index];
     int x;
     int run_count = 0;
     int run_mode  = 0;
@@ -59,7 +60,7 @@ RENAME(decode_line)(FFV1Context *f,
                 return AVERROR_INVALIDDATA;
         }

-        context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
+        context = RENAME(get_context)(quant_table,
                                       sample[1] + x, sample[0] + x, sample[1] + x);
         if (context < 0) {
             context = -context;
Paul B Mahol July 18, 2024, 3:31 p.m. UTC | #5
On Thu, Jul 18, 2024 at 10:20 AM Anton Khirnov <anton@khirnov.net> wrote:

> Quoting Michael Niedermayer (2024-07-18 00:32:38)
> > the data for each decoder task should be together and not scattered
> around
> > more than needed, reducing cache efficiency
> >
> > putting all this extra code in the inner per pixel loop is not ok
> > especially not for the sake of avoiding a memcpy of a few hundread bytes
> multiple levels of loops outside
>
> A nice theory, but in practice this patchset makes single-threaded
> decoding about 4% faster overall, on a 1920x1080 10bit sample. That's
> just the ffv1 parts (up to patch 28), full set also improves frame
> threading performance as follows:
> threads         improvement
> ---------------------------
> 2                  52% (yes really)
>

What?

Sloppy programming skills....


> 4                  16%
> 8                  12%
>
> --
> Anton Khirnov
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Anton Khirnov July 18, 2024, 3:43 p.m. UTC | #6
Quoting Paul B Mahol (2024-07-18 17:31:50)
> On Thu, Jul 18, 2024 at 10:20 AM Anton Khirnov <anton@khirnov.net> wrote:
> 
> > Quoting Michael Niedermayer (2024-07-18 00:32:38)
> > > the data for each decoder task should be together and not scattered
> > around
> > > more than needed, reducing cache efficiency
> > >
> > > putting all this extra code in the inner per pixel loop is not ok
> > > especially not for the sake of avoiding a memcpy of a few hundread bytes
> > multiple levels of loops outside
> >
> > A nice theory, but in practice this patchset makes single-threaded
> > decoding about 4% faster overall, on a 1920x1080 10bit sample. That's
> > just the ffv1 parts (up to patch 28), full set also improves frame
> > threading performance as follows:
> > threads         improvement
> > ---------------------------
> > 2                  52% (yes really)
> >
> 
> What?

Current code is effectively serial with 2 threads.
Paul B Mahol July 18, 2024, 3:47 p.m. UTC | #7
On Thu, Jul 18, 2024 at 5:43 PM Anton Khirnov <anton@khirnov.net> wrote:

> Quoting Paul B Mahol (2024-07-18 17:31:50)
> > On Thu, Jul 18, 2024 at 10:20 AM Anton Khirnov <anton@khirnov.net>
> wrote:
> >
> > > Quoting Michael Niedermayer (2024-07-18 00:32:38)
> > > > the data for each decoder task should be together and not scattered
> > > around
> > > > more than needed, reducing cache efficiency
> > > >
> > > > putting all this extra code in the inner per pixel loop is not ok
> > > > especially not for the sake of avoiding a memcpy of a few hundread
> bytes
> > > multiple levels of loops outside
> > >
> > > A nice theory, but in practice this patchset makes single-threaded
> > > decoding about 4% faster overall, on a 1920x1080 10bit sample. That's
> > > just the ffv1 parts (up to patch 28), full set also improves frame
> > > threading performance as follows:
> > > threads         improvement
> > > ---------------------------
> > > 2                  52% (yes really)
> > >
> >
> > What?
>
> Current code is effectively serial with 2 threads.
>

Explains why FFv1 implementation is so slow.


>
> --
> Anton Khirnov
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Michael Niedermayer July 18, 2024, 5:03 p.m. UTC | #8
On Thu, Jul 18, 2024 at 05:14:12PM +0200, Anton Khirnov wrote:
[...]
> diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
> index 97a28b085a..d68bbda5be 100644
> --- a/libavcodec/ffv1dec_template.c
> +++ b/libavcodec/ffv1dec_template.c
> @@ -30,6 +30,7 @@ RENAME(decode_line)(FFV1Context *f,
>  {
>      PlaneContext *const p = &s->plane[plane_index];
>      RangeCoder *const c   = &s->c;
> +    const int16_t (*quant_table)[256] = f->quant_tables[p->quant_table_index];
>      int x;
>      int run_count = 0;
>      int run_mode  = 0;
> @@ -59,7 +60,7 @@ RENAME(decode_line)(FFV1Context *f,
>                  return AVERROR_INVALIDDATA;
>          }
> 
> -        context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
> +        context = RENAME(get_context)(quant_table,
>                                        sample[1] + x, sample[0] + x, sample[1] + x);
>          if (context < 0) {
>              context = -context;

that change is fine.

the removial of the quant_table copy should be benchmarked

thx

[...]
Michael Niedermayer July 18, 2024, 5:40 p.m. UTC | #9
On Thu, Jul 18, 2024 at 10:20:09AM +0200, Anton Khirnov wrote:
> Quoting Michael Niedermayer (2024-07-18 00:32:38)
> > the data for each decoder task should be together and not scattered around
> > more than needed, reducing cache efficiency
> > 
> > putting all this extra code in the inner per pixel loop is not ok
> > especially not for the sake of avoiding a memcpy of a few hundread bytes multiple levels of loops outside
> 
> A nice theory, but in practice this patchset makes single-threaded
> decoding about 4% faster overall, on a 1920x1080 10bit sample. That's
> just the ffv1 parts (up to patch 28), full set also improves frame
> threading performance as follows:
> threads         improvement
> ---------------------------
> 2                  52% (yes really)
> 4                  16%
> 8                  12%

I do want the speed improvements, yes.

But
you compare frame threading when slice threading performed
much better than frame threading prior to the patch

also id like to see the individual changes which look like they should
make teh code slower, to be tested individually. If they make the code slower
they should be dropped

Also the code has a bug, benchmarks may theoretically changes once it is fixed

using matrixbench
-i matrixbench_mpeg2.mpg -an -vcodec ffv1  -slices 4 -t 100 -coder 1  -context 1 -bitexact /tCmp/ffv1.2-11.avi

prior patchset:
time ./ffmpeg -thread_type slice  -threads 1 -i /tmp/ffv1.2-11.avi -f null -
real	0m31.976s
user	0m32.001s
sys	0m0.080s

time ./ffmpeg -thread_type slice  -threads 2 -i /tmp/ffv1.2-11.avi -f null -
real	0m18.086s
user	0m34.199s
sys	0m0.089s

time ./ffmpeg   -threads 2 -i /tmp/ffv1.2-11.avi -f null -
real	0m33.578s
user	0m33.611s
sys	0m0.052s

time ./ffmpeg -thread_type slice  -threads 4 -i /tmp/ffv1.2-11.avi -f null -
real	0m9.189s
user	0m33.608s
sys	0m0.073s

time ./ffmpeg   -threads 4 -i /tmp/ffv1.2-11.avi -f null -
real	0m11.159s
user	0m32.712s
sys	0m0.124s


post patchset:
time ./ffmpeg -thread_type slice  -threads 1 -i /tmp/ffv1.2-11.avi -f null -
eal	0m31.755s
user	0m31.758s
sys	0m0.096s

time ./ffmpeg -thread_type slice  -threads 2 -i /tmp/ffv1.2-11.avi -f null -
real	0m17.481s
user	0m33.385s
sys	0m0.076s

time ./ffmpeg   -threads 2 -i /tmp/ffv1.2-11.avi -f null -
real	0m16.893s
user	0m33.465s
sys	0m0.113s

time ./ffmpeg -thread_type slice  -threads 4 -i /tmp/ffv1.2-11.avi -f null -
real	0m9.180s
user	0m33.500s
sys	0m0.088s

time ./ffmpeg   -threads 4 -i /tmp/ffv1.2-11.avi -f null -
real	0m8.811s
user	0m33.338s
sys	0m0.061s



[...]
Anton Khirnov July 20, 2024, 9:22 a.m. UTC | #10
Quoting Michael Niedermayer (2024-07-18 19:40:04)
> On Thu, Jul 18, 2024 at 10:20:09AM +0200, Anton Khirnov wrote:
> > Quoting Michael Niedermayer (2024-07-18 00:32:38)
> > > the data for each decoder task should be together and not scattered around
> > > more than needed, reducing cache efficiency
> > > 
> > > putting all this extra code in the inner per pixel loop is not ok
> > > especially not for the sake of avoiding a memcpy of a few hundread bytes multiple levels of loops outside
> > 
> > A nice theory, but in practice this patchset makes single-threaded
> > decoding about 4% faster overall, on a 1920x1080 10bit sample. That's
> > just the ffv1 parts (up to patch 28), full set also improves frame
> > threading performance as follows:
> > threads         improvement
> > ---------------------------
> > 2                  52% (yes really)
> > 4                  16%
> > 8                  12%
> 
> I do want the speed improvements, yes.
> 
> But
> you compare frame threading when slice threading performed
> much better than frame threading prior to the patch

If that were true in general, there'd be no reason for frame threading
support in ffv1, as it has a higher latency and uses more memory; higher
performance is its only advantage.

However you added frame threading in
a0c0900e470fde0d6db360e555620476c2323895 claiming it is faster, which I
can partially confirm even with current master - slice threading
saturates at thread count = slice count, while frame threading scales
beyond it. Frame threading also improves significantly after this set:

threads | slice    | frame/before | frame/after
-----------------------------------------------
2         22.6124    43.738         22.0354
4         14.3367    15.115         13.1964
6         14.3850    11.974         10.9745
8         14.3472    9.7229         8.76617
10        14.3579    8.4638         8.6499
12        14.3665    8.4636         8.5735
16        14.2960    7.6926         7.1696
-----------------------------------------------
(values are total decode time in seconds)

Note that after this set frame threading is ALWAYS faster than slice
threading, for any thread count.

> also id like to see the individual changes which look like they should
> make teh code slower, to be tested individually. If they make the code slower
> they should be dropped

I don't think it's meaningful to individually benchmark the patches
moving per-slice data into the new per-slice context. I split them to
simplify testing and review, but it only makes sense to apply all of
them or none, otherwise the code gets more complex.
diff mbox series

Patch

diff --git a/libavcodec/ffv1.h b/libavcodec/ffv1.h
index 4d57172d5b..a87c2d2a36 100644
--- a/libavcodec/ffv1.h
+++ b/libavcodec/ffv1.h
@@ -59,7 +59,6 @@  typedef struct VlcState {
 } VlcState;
 
 typedef struct PlaneContext {
-    int16_t quant_table[MAX_CONTEXT_INPUTS][256];
     int quant_table_index;
     int context_count;
     uint8_t (*state)[CONTEXT_SIZE];
diff --git a/libavcodec/ffv1_template.c b/libavcodec/ffv1_template.c
index c5f61b0182..d15ad11021 100644
--- a/libavcodec/ffv1_template.c
+++ b/libavcodec/ffv1_template.c
@@ -29,25 +29,25 @@  static inline int RENAME(predict)(TYPE *src, TYPE *last)
     return mid_pred(L, L + T - LT, T);
 }
 
-static inline int RENAME(get_context)(PlaneContext *p, TYPE *src,
-                                      TYPE *last, TYPE *last2)
+static inline int RENAME(get_context)(const int16_t quant_table[MAX_CONTEXT_INPUTS][256],
+                                      TYPE *src, TYPE *last, TYPE *last2)
 {
     const int LT = last[-1];
     const int T  = last[0];
     const int RT = last[1];
     const int L  = src[-1];
 
-    if (p->quant_table[3][127] || p->quant_table[4][127]) {
+    if (quant_table[3][127] || quant_table[4][127]) {
         const int TT = last2[0];
         const int LL = src[-2];
-        return p->quant_table[0][(L - LT) & 0xFF] +
-               p->quant_table[1][(LT - T) & 0xFF] +
-               p->quant_table[2][(T - RT) & 0xFF] +
-               p->quant_table[3][(LL - L) & 0xFF] +
-               p->quant_table[4][(TT - T) & 0xFF];
+        return quant_table[0][(L - LT) & 0xFF] +
+               quant_table[1][(LT - T) & 0xFF] +
+               quant_table[2][(T - RT) & 0xFF] +
+               quant_table[3][(LL - L) & 0xFF] +
+               quant_table[4][(TT - T) & 0xFF];
     } else
-        return p->quant_table[0][(L - LT) & 0xFF] +
-               p->quant_table[1][(LT - T) & 0xFF] +
-               p->quant_table[2][(T - RT) & 0xFF];
+        return quant_table[0][(L - LT) & 0xFF] +
+               quant_table[1][(LT - T) & 0xFF] +
+               quant_table[2][(T - RT) & 0xFF];
 }
 
diff --git a/libavcodec/ffv1dec.c b/libavcodec/ffv1dec.c
index 66d9f63c1a..618020d10f 100644
--- a/libavcodec/ffv1dec.c
+++ b/libavcodec/ffv1dec.c
@@ -117,7 +117,8 @@  static int is_input_end(FFV1Context *s, GetBitContext *gb)
 #define RENAME(name) name ## 32
 #include "ffv1dec_template.c"
 
-static int decode_plane(FFV1Context *s, FFV1SliceContext *sc,
+static int decode_plane(FFV1Context *f,
+                        FFV1Context *s, FFV1SliceContext *sc,
                         GetBitContext *gb,
                         uint8_t *src, int w, int h, int stride, int plane_index,
                          int pixel_stride)
@@ -141,13 +142,13 @@  static int decode_plane(FFV1Context *s, FFV1SliceContext *sc,
         sample[0][w]  = sample[0][w - 1];
 
         if (s->avctx->bits_per_raw_sample <= 8) {
-            int ret = decode_line(s, sc, gb, w, sample, plane_index, 8);
+            int ret = decode_line(f, s, sc, gb, w, sample, plane_index, 8);
             if (ret < 0)
                 return ret;
             for (x = 0; x < w; x++)
                 src[x*pixel_stride + stride * y] = sample[1][x];
         } else {
-            int ret = decode_line(s, sc, gb, w, sample, plane_index, s->avctx->bits_per_raw_sample);
+            int ret = decode_line(f, s, sc, gb, w, sample, plane_index, s->avctx->bits_per_raw_sample);
             if (ret < 0)
                 return ret;
             if (s->packed_at_lsb) {
@@ -207,7 +208,6 @@  static int decode_slice_header(const FFV1Context *f, FFV1Context *fs,
             return -1;
         }
         p->quant_table_index = idx;
-        memcpy(p->quant_table, f->quant_tables[idx], sizeof(p->quant_table));
         context_count = f->context_count[idx];
 
         if (p->context_count < context_count) {
@@ -335,29 +335,29 @@  static int decode_slice(AVCodecContext *c, void *arg)
         const int chroma_height = AV_CEIL_RSHIFT(height, f->chroma_v_shift);
         const int cx            = x >> f->chroma_h_shift;
         const int cy            = y >> f->chroma_v_shift;
-        decode_plane(fs, sc, &gb, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 1);
+        decode_plane(f, fs, sc, &gb, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 1);
 
         if (f->chroma_planes) {
-            decode_plane(fs, sc, &gb, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1, 1);
-            decode_plane(fs, sc, &gb, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1, 1);
+            decode_plane(f, fs, sc, &gb, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1, 1);
+            decode_plane(f, fs, sc, &gb, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1, 1);
         }
         if (fs->transparency)
-            decode_plane(fs, sc, &gb, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], (f->version >= 4 && !f->chroma_planes) ? 1 : 2, 1);
+            decode_plane(f, fs, sc, &gb, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], (f->version >= 4 && !f->chroma_planes) ? 1 : 2, 1);
     } else if (f->colorspace == 0) {
-         decode_plane(fs, sc, &gb, p->data[0] + ps*x + y*p->linesize[0]    , width, height, p->linesize[0], 0, 2);
-         decode_plane(fs, sc, &gb, p->data[0] + ps*x + y*p->linesize[0] + 1, width, height, p->linesize[0], 1, 2);
+         decode_plane(f, fs, sc, &gb, p->data[0] + ps*x + y*p->linesize[0]    , width, height, p->linesize[0], 0, 2);
+         decode_plane(f, fs, sc, &gb, p->data[0] + ps*x + y*p->linesize[0] + 1, width, height, p->linesize[0], 1, 2);
     } else if (f->use32bit) {
         uint8_t *planes[4] = { p->data[0] + ps * x + y * p->linesize[0],
                                p->data[1] + ps * x + y * p->linesize[1],
                                p->data[2] + ps * x + y * p->linesize[2],
                                p->data[3] + ps * x + y * p->linesize[3] };
-        decode_rgb_frame32(fs, sc, &gb, planes, width, height, p->linesize);
+        decode_rgb_frame32(f, fs, sc, &gb, planes, width, height, p->linesize);
     } else {
         uint8_t *planes[4] = { p->data[0] + ps * x + y * p->linesize[0],
                                p->data[1] + ps * x + y * p->linesize[1],
                                p->data[2] + ps * x + y * p->linesize[2],
                                p->data[3] + ps * x + y * p->linesize[3] };
-        decode_rgb_frame(fs, sc, &gb, planes, width, height, p->linesize);
+        decode_rgb_frame(f, fs, sc, &gb, planes, width, height, p->linesize);
     }
     if (fs->ac != AC_GOLOMB_RICE && f->version > 2) {
         int v;
@@ -830,11 +830,7 @@  static int read_header(FFV1Context *f)
                     return AVERROR_INVALIDDATA;
                 }
                 p->quant_table_index = idx;
-                memcpy(p->quant_table, f->quant_tables[idx],
-                       sizeof(p->quant_table));
                 context_count = f->context_count[idx];
-            } else {
-                memcpy(p->quant_table, f->quant_tables[0], sizeof(p->quant_table));
             }
 
             if (f->version <= 2) {
diff --git a/libavcodec/ffv1dec_template.c b/libavcodec/ffv1dec_template.c
index e9d3002be9..97a28b085a 100644
--- a/libavcodec/ffv1dec_template.c
+++ b/libavcodec/ffv1dec_template.c
@@ -23,7 +23,9 @@ 
 #include "ffv1_template.c"
 
 static av_always_inline int
-RENAME(decode_line)(FFV1Context *s, FFV1SliceContext *sc, GetBitContext *gb,
+RENAME(decode_line)(FFV1Context *f,
+                    FFV1Context *s, FFV1SliceContext *sc,
+                    GetBitContext *gb,
                     int w, TYPE *sample[2], int plane_index, int bits)
 {
     PlaneContext *const p = &s->plane[plane_index];
@@ -57,7 +59,8 @@  RENAME(decode_line)(FFV1Context *s, FFV1SliceContext *sc, GetBitContext *gb,
                 return AVERROR_INVALIDDATA;
         }
 
-        context = RENAME(get_context)(p, sample[1] + x, sample[0] + x, sample[1] + x);
+        context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
+                                      sample[1] + x, sample[0] + x, sample[1] + x);
         if (context < 0) {
             context = -context;
             sign    = 1;
@@ -127,7 +130,8 @@  RENAME(decode_line)(FFV1Context *s, FFV1SliceContext *sc, GetBitContext *gb,
     return 0;
 }
 
-static int RENAME(decode_rgb_frame)(FFV1Context *s, FFV1SliceContext *sc,
+static int RENAME(decode_rgb_frame)(FFV1Context *f,
+                                    FFV1Context *s, FFV1SliceContext *sc,
                                     GetBitContext *gb,
                                     uint8_t *src[4], int w, int h, int stride[4])
 {
@@ -158,9 +162,9 @@  static int RENAME(decode_rgb_frame)(FFV1Context *s, FFV1SliceContext *sc,
             sample[p][1][-1]= sample[p][0][0  ];
             sample[p][0][ w]= sample[p][0][w-1];
             if (lbd && s->slice_coding_mode == 0)
-                ret = RENAME(decode_line)(s, sc, gb, w, sample[p], (p + 1)/2, 9);
+                ret = RENAME(decode_line)(f, s, sc, gb, w, sample[p], (p + 1)/2, 9);
             else
-                ret = RENAME(decode_line)(s, sc, gb, w, sample[p], (p + 1)/2, bits + (s->slice_coding_mode != 1));
+                ret = RENAME(decode_line)(f, s, sc, gb, w, sample[p], (p + 1)/2, bits + (s->slice_coding_mode != 1));
             if (ret < 0)
                 return ret;
         }
diff --git a/libavcodec/ffv1enc.c b/libavcodec/ffv1enc.c
index 1d00a46cdd..714e007659 100644
--- a/libavcodec/ffv1enc.c
+++ b/libavcodec/ffv1enc.c
@@ -269,7 +269,8 @@  static inline void put_vlc_symbol(PutBitContext *pb, VlcState *const state,
 #define RENAME(name) name ## 32
 #include "ffv1enc_template.c"
 
-static int encode_plane(FFV1Context *s, FFV1SliceContext *sc,
+static int encode_plane(FFV1Context *f,
+                        FFV1Context *s, FFV1SliceContext *sc,
                         const uint8_t *src, int w, int h,
                          int stride, int plane_index, int pixel_stride)
 {
@@ -289,7 +290,7 @@  static int encode_plane(FFV1Context *s, FFV1SliceContext *sc,
         if (s->bits_per_raw_sample <= 8) {
             for (x = 0; x < w; x++)
                 sample[0][x] = src[x * pixel_stride + stride * y];
-            if((ret = encode_line(s, sc, w, sample, plane_index, 8)) < 0)
+            if((ret = encode_line(f, s, sc, w, sample, plane_index, 8)) < 0)
                 return ret;
         } else {
             if (s->packed_at_lsb) {
@@ -301,7 +302,7 @@  static int encode_plane(FFV1Context *s, FFV1SliceContext *sc,
                     sample[0][x] = ((uint16_t*)(src + stride*y))[x] >> (16 - s->bits_per_raw_sample);
                 }
             }
-            if((ret = encode_line(s, sc, w, sample, plane_index, s->bits_per_raw_sample)) < 0)
+            if((ret = encode_line(f, s, sc, w, sample, plane_index, s->bits_per_raw_sample)) < 0)
                 return ret;
         }
     }
@@ -741,7 +742,6 @@  static av_cold int encode_init(AVCodecContext *avctx)
 
         p->quant_table_index = s->context_model;
         p->context_count     = s->context_count[p->quant_table_index];
-        memcpy(p->quant_table, s->quant_tables[p->quant_table_index], sizeof(p->quant_table));
     }
 
     if ((ret = ff_ffv1_allocate_initial_states(s)) < 0)
@@ -1066,21 +1066,21 @@  retry:
         const int cx            = x >> f->chroma_h_shift;
         const int cy            = y >> f->chroma_v_shift;
 
-        ret = encode_plane(fs, sc, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 1);
+        ret = encode_plane(f, fs, sc, p->data[0] + ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 1);
 
         if (f->chroma_planes) {
-            ret |= encode_plane(fs, sc, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1, 1);
-            ret |= encode_plane(fs, sc, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1, 1);
+            ret |= encode_plane(f, fs, sc, p->data[1] + ps*cx+cy*p->linesize[1], chroma_width, chroma_height, p->linesize[1], 1, 1);
+            ret |= encode_plane(f, fs, sc, p->data[2] + ps*cx+cy*p->linesize[2], chroma_width, chroma_height, p->linesize[2], 1, 1);
         }
         if (fs->transparency)
-            ret |= encode_plane(fs, sc, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], 2, 1);
+            ret |= encode_plane(f, fs, sc, p->data[3] + ps*x + y*p->linesize[3], width, height, p->linesize[3], 2, 1);
     } else if (c->pix_fmt == AV_PIX_FMT_YA8) {
-        ret  = encode_plane(fs, sc, p->data[0] +     ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 2);
-        ret |= encode_plane(fs, sc, p->data[0] + 1 + ps*x + y*p->linesize[0], width, height, p->linesize[0], 1, 2);
+        ret  = encode_plane(f, fs, sc, p->data[0] +     ps*x + y*p->linesize[0], width, height, p->linesize[0], 0, 2);
+        ret |= encode_plane(f, fs, sc, p->data[0] + 1 + ps*x + y*p->linesize[0], width, height, p->linesize[0], 1, 2);
     } else if (f->use32bit) {
-        ret = encode_rgb_frame32(fs, sc, planes, width, height, p->linesize);
+        ret = encode_rgb_frame32(f, fs, sc, planes, width, height, p->linesize);
     } else {
-        ret = encode_rgb_frame(fs, sc, planes, width, height, p->linesize);
+        ret = encode_rgb_frame(f, fs, sc, planes, width, height, p->linesize);
     }
 
     if (ret < 0) {
diff --git a/libavcodec/ffv1enc_template.c b/libavcodec/ffv1enc_template.c
index 8b9e53fa1b..4a5580e1a5 100644
--- a/libavcodec/ffv1enc_template.c
+++ b/libavcodec/ffv1enc_template.c
@@ -23,7 +23,8 @@ 
 #include "ffv1_template.c"
 
 static av_always_inline int
-RENAME(encode_line)(FFV1Context *s, FFV1SliceContext *sc,
+RENAME(encode_line)(FFV1Context *f,
+                    FFV1Context *s, FFV1SliceContext *sc,
                     int w, TYPE *sample[3], int plane_index, int bits)
 {
     PlaneContext *const p = &s->plane[plane_index];
@@ -60,7 +61,8 @@  RENAME(encode_line)(FFV1Context *s, FFV1SliceContext *sc,
     for (x = 0; x < w; x++) {
         int diff, context;
 
-        context = RENAME(get_context)(p, sample[0] + x, sample[1] + x, sample[2] + x);
+        context = RENAME(get_context)(f->quant_tables[p->quant_table_index],
+                                      sample[0] + x, sample[1] + x, sample[2] + x);
         diff    = sample[0][x] - RENAME(predict)(sample[0] + x, sample[1] + x);
 
         if (context < 0) {
@@ -124,7 +126,8 @@  RENAME(encode_line)(FFV1Context *s, FFV1SliceContext *sc,
     return 0;
 }
 
-static int RENAME(encode_rgb_frame)(FFV1Context *s, FFV1SliceContext *sc,
+static int RENAME(encode_rgb_frame)(FFV1Context *f,
+                                    FFV1Context *s, FFV1SliceContext *sc,
                                     const uint8_t *src[4],
                                     int w, int h, const int stride[4])
 {
@@ -193,9 +196,9 @@  static int RENAME(encode_rgb_frame)(FFV1Context *s, FFV1SliceContext *sc,
             sample[p][0][-1] = sample[p][1][0  ];
             sample[p][1][ w] = sample[p][1][w-1];
             if (lbd && s->slice_coding_mode == 0)
-                ret = RENAME(encode_line)(s, sc, w, sample[p], (p + 1) / 2, 9);
+                ret = RENAME(encode_line)(f, s, sc, w, sample[p], (p + 1) / 2, 9);
             else
-                ret = RENAME(encode_line)(s, sc, w, sample[p], (p + 1) / 2, bits + (s->slice_coding_mode != 1));
+                ret = RENAME(encode_line)(f, s, sc, w, sample[p], (p + 1) / 2, bits + (s->slice_coding_mode != 1));
             if (ret < 0)
                 return ret;
         }