[FFmpeg-devel,v4,2/2] avcodec/v210dec: add the frame and slice threading support

Submitted by lance.lmwang@gmail.com on Sept. 22, 2019, 3:55 a.m.

Details

Message ID 20190922035549.1023-1-lance.lmwang@gmail.com
State New
Headers show

Commit Message

lance.lmwang@gmail.com Sept. 22, 2019, 3:55 a.m.
From: Limin Wang <lance.lmwang@gmail.com>

Multithreading avoids saturating a single CPU core when other filters such as
scale are running at the same time.
The performance gain is very small; my benchmark results are below.
To avoid a disk bottleneck, I use stream_loop mode with only 10 frames.

./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
~/Movies/1.v210

master:
./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
-f null -
frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
overhead: unknown
bench: utime=10.082s stime=13.784s rtime=23.889s
bench: maxrss=147836928kB

patch applied:
./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
~/Movies/1.v210 -benchmark -f null -

frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
overhead: unknown
bench: utime=11.407s stime=17.258s rtime=18.279s
bench: maxrss=442884096kB

Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
---
 libavcodec/v210dec.c | 131 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 83 insertions(+), 48 deletions(-)

Comments

lance.lmwang@gmail.com Oct. 11, 2019, 12:59 a.m.
Michael, I have updated the patch per your suggestion; please help review the final version.


On Sun, Sep 22, 2019 at 11:55:49AM +0800, lance.lmwang@gmail.com wrote:
> From: Limin Wang <lance.lmwang@gmail.com>
> 
> The multithread is avoid one core cpu is full with other filter like scale etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> ---
>  libavcodec/v210dec.c | 131 ++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 83 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 6ce18aa..2e46342 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c)         \
>      do {                             \
> @@ -37,6 +38,12 @@
>          *c++ = (val >> 20) & 0x3FF;  \
>      } while (0)
>  
> +typedef struct ThreadData {
> +    AVFrame *frame;
> +    uint8_t *buf;
> +    int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
>  {
>      uint32_t val;
> @@ -70,55 +77,28 @@ static av_cold int decode_init(AVCodecContext *avctx)
>      return 0;
>  }
>  
> -static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> -                        AVPacket *avpkt)
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
>  {
>      V210DecContext *s = avctx->priv_data;
> -
> -    int h, w, ret, stride, aligned_input;
> -    AVFrame *pic = data;
> -    const uint8_t *psrc = avpkt->data;
> +    int h, w;
> +    ThreadData *td = arg;
> +    AVFrame *frame = td->frame;
> +    int stride = td->stride;
> +    int slice_h = avctx->height / avctx->thread_count;
> +    int slice_m = avctx->height % avctx->thread_count;
> +    int slice_start = jobnr * slice_h;
> +    int slice_end = slice_start + slice_h;
> +    const uint8_t *psrc = td->buf + stride * slice_start;
>      uint16_t *y, *u, *v;
>  
> -    if (s->custom_stride )
> -        stride = s->custom_stride;
> -    else {
> -        int aligned_width = ((avctx->width + 47) / 48) * 48;
> -        stride = aligned_width * 8 / 3;
> -    }
> -
> -    if (avpkt->size < stride * avctx->height) {
> -        if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
> -            stride = avpkt->size / avctx->height;
> -            if (!s->stride_warning_shown)
> -                av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
> -            s->stride_warning_shown = 1;
> -        } else {
> -            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
> -            return AVERROR_INVALIDDATA;
> -        }
> -    }
> -    if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
> -        && AV_RN32(psrc) == AV_RN32("INFO")
> -        && avpkt->size - 64 >= stride * avctx->height)
> -        psrc += 64;
> -
> -    aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
> -    if (aligned_input != s->aligned_input) {
> -        s->aligned_input = aligned_input;
> -        ff_v210dec_init(s);
> -    }
> -
> -    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> -        return ret;
> -
> -    y = (uint16_t*)pic->data[0];
> -    u = (uint16_t*)pic->data[1];
> -    v = (uint16_t*)pic->data[2];
> -    pic->pict_type = AV_PICTURE_TYPE_I;
> -    pic->key_frame = 1;
> +    /* add the remaining slice for the last job */
> +    if (jobnr == avctx->thread_count - 1)
> +        slice_end += slice_m;
>  
> -    for (h = 0; h < avctx->height; h++) {
> +    y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
> +    u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
> +    v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2;
> +    for (h = slice_start; h < slice_end; h++) {
>          const uint32_t *src = (const uint32_t*)psrc;
>          uint32_t val;
>  
> @@ -154,10 +134,63 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
>          }
>  
>          psrc += stride;
> -        y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
> -        u += pic->linesize[1] / 2 - avctx->width / 2;
> -        v += pic->linesize[2] / 2 - avctx->width / 2;
> +        y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1);
> +        u += frame->linesize[1] / 2 - avctx->width / 2;
> +        v += frame->linesize[2] / 2 - avctx->width / 2;
> +    }
> +
> +    return 0;
> +}
> +
> +static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> +                        AVPacket *avpkt)
> +{
> +    V210DecContext *s = avctx->priv_data;
> +    ThreadData td;
> +    int ret, stride, aligned_input;
> +    ThreadFrame frame = { .f = data };
> +    AVFrame *pic = data;
> +    const uint8_t *psrc = avpkt->data;
> +
> +    if (s->custom_stride )
> +        stride = s->custom_stride;
> +    else {
> +        int aligned_width = ((avctx->width + 47) / 48) * 48;
> +        stride = aligned_width * 8 / 3;
> +    }
> +    td.stride = stride;
> +
> +    if (avpkt->size < stride * avctx->height) {
> +        if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
> +            stride = avpkt->size / avctx->height;
> +            if (!s->stride_warning_shown)
> +                av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
> +            s->stride_warning_shown = 1;
> +        } else {
> +            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
> +            return AVERROR_INVALIDDATA;
> +        }
>      }
> +    if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
> +        && AV_RN32(psrc) == AV_RN32("INFO")
> +        && avpkt->size - 64 >= stride * avctx->height)
> +        psrc += 64;
> +
> +    aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
> +    if (aligned_input != s->aligned_input) {
> +        s->aligned_input = aligned_input;
> +        ff_v210dec_init(s);
> +    }
> +
> +    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
> +        return ret;
> +
> +    pic->pict_type = AV_PICTURE_TYPE_I;
> +    pic->key_frame = 1;
> +
> +    td.buf = (uint8_t*)psrc;
> +    td.frame = pic;
> +    avctx->execute2(avctx, v210_decode_slice, &td, NULL, avctx->thread_count);
>  
>      if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
>          /* we have interlaced material flagged in container */
> @@ -193,6 +226,8 @@ AVCodec ff_v210_decoder = {
>      .priv_data_size = sizeof(V210DecContext),
>      .init           = decode_init,
>      .decode         = decode_frame,
> -    .capabilities   = AV_CODEC_CAP_DR1,
> +    .capabilities   = AV_CODEC_CAP_DR1 |
> +                      AV_CODEC_CAP_SLICE_THREADS |
> +                      AV_CODEC_CAP_FRAME_THREADS,
>      .priv_class     = &v210dec_class,
>  };
> -- 
> 2.6.4
>
Michael Niedermayer Oct. 11, 2019, 7:05 p.m.
On Sun, Sep 22, 2019 at 11:55:49AM +0800, lance.lmwang@gmail.com wrote:
> From: Limin Wang <lance.lmwang@gmail.com>
> 
> The multithread is avoid one core cpu is full with other filter like scale etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> ---
>  libavcodec/v210dec.c | 131 ++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 83 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 6ce18aa..2e46342 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c)         \
>      do {                             \
> @@ -37,6 +38,12 @@
>          *c++ = (val >> 20) & 0x3FF;  \
>      } while (0)
>  
> +typedef struct ThreadData {
> +    AVFrame *frame;
> +    uint8_t *buf;
> +    int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
>  {
>      uint32_t val;
> @@ -70,55 +77,28 @@ static av_cold int decode_init(AVCodecContext *avctx)
>      return 0;
>  }
>  
> -static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> -                        AVPacket *avpkt)
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
>  {
>      V210DecContext *s = avctx->priv_data;
> -
> -    int h, w, ret, stride, aligned_input;
> -    AVFrame *pic = data;
> -    const uint8_t *psrc = avpkt->data;
> +    int h, w;
> +    ThreadData *td = arg;
> +    AVFrame *frame = td->frame;
> +    int stride = td->stride;

> +    int slice_h = avctx->height / avctx->thread_count;
> +    int slice_m = avctx->height % avctx->thread_count;
> +    int slice_start = jobnr * slice_h;
> +    int slice_end = slice_start + slice_h;

assume avctx->height is 10 and avctx->thread_count is 11
slice_h becomes 0 and slice_start becomes 0 too for all threads



[...]
lance.lmwang@gmail.com Oct. 12, 2019, 11:02 a.m.
On Fri, Oct 11, 2019 at 09:05:27PM +0200, Michael Niedermayer wrote:
> On Sun, Sep 22, 2019 at 11:55:49AM +0800, lance.lmwang@gmail.com wrote:
> > From: Limin Wang <lance.lmwang@gmail.com>
> > 
> > The multithread is avoid one core cpu is full with other filter like scale etc.
> > About the performance, the gain is very small, below is my testing for
> > performance.
> > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> > only.
> > 
> > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> > ~/Movies/1.v210
> > 
> > master:
> > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> > -f null -
> > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > overhead: unknown
> > bench: utime=10.082s stime=13.784s rtime=23.889s
> > bench: maxrss=147836928kB
> > 
> > patch applied:
> > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> > ~/Movies/1.v210 -benchmark -f null -
> > 
> > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > overhead: unknown
> > bench: utime=11.407s stime=17.258s rtime=18.279s
> > bench: maxrss=442884096kB
> > 
> > Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> > ---
> >  libavcodec/v210dec.c | 131 ++++++++++++++++++++++++++++++++-------------------
> >  1 file changed, 83 insertions(+), 48 deletions(-)
> > 
> > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > index 6ce18aa..2e46342 100644
> > --- a/libavcodec/v210dec.c
> > +++ b/libavcodec/v210dec.c
> > @@ -28,6 +28,7 @@
> >  #include "libavutil/internal.h"
> >  #include "libavutil/mem.h"
> >  #include "libavutil/intreadwrite.h"
> > +#include "thread.h"
> >  
> >  #define READ_PIXELS(a, b, c)         \
> >      do {                             \
> > @@ -37,6 +38,12 @@
> >          *c++ = (val >> 20) & 0x3FF;  \
> >      } while (0)
> >  
> > +typedef struct ThreadData {
> > +    AVFrame *frame;
> > +    uint8_t *buf;
> > +    int stride;
> > +} ThreadData;
> > +
> >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
> >  {
> >      uint32_t val;
> > @@ -70,55 +77,28 @@ static av_cold int decode_init(AVCodecContext *avctx)
> >      return 0;
> >  }
> >  
> > -static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> > -                        AVPacket *avpkt)
> > +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> >  {
> >      V210DecContext *s = avctx->priv_data;
> > -
> > -    int h, w, ret, stride, aligned_input;
> > -    AVFrame *pic = data;
> > -    const uint8_t *psrc = avpkt->data;
> > +    int h, w;
> > +    ThreadData *td = arg;
> > +    AVFrame *frame = td->frame;
> > +    int stride = td->stride;
> 
> > +    int slice_h = avctx->height / avctx->thread_count;
> > +    int slice_m = avctx->height % avctx->thread_count;
> > +    int slice_start = jobnr * slice_h;
> > +    int slice_end = slice_start + slice_h;
> 
> assume avctx->height is 10 and avctx->thread_count is 11
> slice_h becomes 0 and slice_start becomes 0 too for all threads

Michael, I have updated the patch to limit thread_count to the range [1, avctx->height/4].
The height/4 bound is borrowed from dxv.c; please comment on whether it is appropriate.

Thanks
> 
> 
> 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> The bravest are surely those who have the clearest vision
> of what is before them, glory and danger alike, and yet
> notwithstanding go out to meet it. -- Thucydides



> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Kieran Kunhya Oct. 13, 2019, 1:10 p.m.
>
> Michael, I have updated the patch to limit thread_count to [1,1,
> avctx->height/4],
> why height/4, it's borrowed from dxv.c, please give comments whether it's
> proper.
>

For the lack of big speed improvement, make sure you are giving each thread
a slice in order.

Kieran
lance.lmwang@gmail.com Oct. 14, 2019, 1:10 a.m.
On Sun, Oct 13, 2019 at 02:10:02PM +0100, Kieran Kunhya wrote:
> >
> > Michael, I have updated the patch to limit thread_count to [1,1,
> > avctx->height/4],
> > why height/4, it's borrowed from dxv.c, please give comments whether it's
> > proper.
> >
> 
> For the lack of big speed improvement, make sure you are giving each thread
> a slice in order.

Kieran, thanks for the hint. The performance data described above was measured on my old Mac Pro,
so the improvement is not that great. However, on an x86 server
with more cores and lower CPU clock speeds, the improvement is larger.

Below are my test results on one such server, for reference (Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz):

./ffmpeg  -y -i /root/UHD_Soccer_4K@50_8Bit_45.7M_HEVC_AAC.ts -c:v v210 -f rawvideo -frames 50 /root/1.v210

./ffmpeg -threads 1 -s 3840x2160  -stream_loop 20 -i /root/1.v210 -benchmark -f null -
frame= 1050 fps= 80 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=3.19x

./ffmpeg -threads 2 -thread_type frame+slice  -s 3840x2160  -stream_loop 20 -i /root/1.v210 -benchmark -f null -
frame= 1050 fps=111 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=4.45x

./ffmpeg -threads 4 -thread_type frame+slice  -s 3840x2160  -stream_loop 20 -i /root/1.v210 -benchmark -f null -
frame= 1050 fps=145 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=5.81x



Thanks,
Limin


> 
> Kieran

Patch hide | download patch | download mbox

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index 6ce18aa..2e46342 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -28,6 +28,7 @@ 
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/intreadwrite.h"
+#include "thread.h"
 
 #define READ_PIXELS(a, b, c)         \
     do {                             \
@@ -37,6 +38,12 @@ 
         *c++ = (val >> 20) & 0x3FF;  \
     } while (0)
 
+typedef struct ThreadData {
+    AVFrame *frame;
+    uint8_t *buf;
+    int stride;
+} ThreadData;
+
 static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
 {
     uint32_t val;
@@ -70,55 +77,28 @@  static av_cold int decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
-                        AVPacket *avpkt)
+static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
 {
     V210DecContext *s = avctx->priv_data;
-
-    int h, w, ret, stride, aligned_input;
-    AVFrame *pic = data;
-    const uint8_t *psrc = avpkt->data;
+    int h, w;
+    ThreadData *td = arg;
+    AVFrame *frame = td->frame;
+    int stride = td->stride;
+    int slice_h = avctx->height / avctx->thread_count;
+    int slice_m = avctx->height % avctx->thread_count;
+    int slice_start = jobnr * slice_h;
+    int slice_end = slice_start + slice_h;
+    const uint8_t *psrc = td->buf + stride * slice_start;
     uint16_t *y, *u, *v;
 
-    if (s->custom_stride )
-        stride = s->custom_stride;
-    else {
-        int aligned_width = ((avctx->width + 47) / 48) * 48;
-        stride = aligned_width * 8 / 3;
-    }
-
-    if (avpkt->size < stride * avctx->height) {
-        if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
-            stride = avpkt->size / avctx->height;
-            if (!s->stride_warning_shown)
-                av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
-            s->stride_warning_shown = 1;
-        } else {
-            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
-            return AVERROR_INVALIDDATA;
-        }
-    }
-    if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
-        && AV_RN32(psrc) == AV_RN32("INFO")
-        && avpkt->size - 64 >= stride * avctx->height)
-        psrc += 64;
-
-    aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
-    if (aligned_input != s->aligned_input) {
-        s->aligned_input = aligned_input;
-        ff_v210dec_init(s);
-    }
-
-    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
-        return ret;
-
-    y = (uint16_t*)pic->data[0];
-    u = (uint16_t*)pic->data[1];
-    v = (uint16_t*)pic->data[2];
-    pic->pict_type = AV_PICTURE_TYPE_I;
-    pic->key_frame = 1;
+    /* add the remaining slice for the last job */
+    if (jobnr == avctx->thread_count - 1)
+        slice_end += slice_m;
 
-    for (h = 0; h < avctx->height; h++) {
+    y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
+    u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
+    v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2;
+    for (h = slice_start; h < slice_end; h++) {
         const uint32_t *src = (const uint32_t*)psrc;
         uint32_t val;
 
@@ -154,10 +134,63 @@  static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         }
 
         psrc += stride;
-        y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
-        u += pic->linesize[1] / 2 - avctx->width / 2;
-        v += pic->linesize[2] / 2 - avctx->width / 2;
+        y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1);
+        u += frame->linesize[1] / 2 - avctx->width / 2;
+        v += frame->linesize[2] / 2 - avctx->width / 2;
+    }
+
+    return 0;
+}
+
+static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
+                        AVPacket *avpkt)
+{
+    V210DecContext *s = avctx->priv_data;
+    ThreadData td;
+    int ret, stride, aligned_input;
+    ThreadFrame frame = { .f = data };
+    AVFrame *pic = data;
+    const uint8_t *psrc = avpkt->data;
+
+    if (s->custom_stride )
+        stride = s->custom_stride;
+    else {
+        int aligned_width = ((avctx->width + 47) / 48) * 48;
+        stride = aligned_width * 8 / 3;
+    }
+    td.stride = stride;
+
+    if (avpkt->size < stride * avctx->height) {
+        if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
+            stride = avpkt->size / avctx->height;
+            if (!s->stride_warning_shown)
+                av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n");
+            s->stride_warning_shown = 1;
+        } else {
+            av_log(avctx, AV_LOG_ERROR, "packet too small\n");
+            return AVERROR_INVALIDDATA;
+        }
     }
+    if (avctx->codec_tag == MKTAG('C', '2', '1', '0')
+        && AV_RN32(psrc) == AV_RN32("INFO")
+        && avpkt->size - 64 >= stride * avctx->height)
+        psrc += 64;
+
+    aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f);
+    if (aligned_input != s->aligned_input) {
+        s->aligned_input = aligned_input;
+        ff_v210dec_init(s);
+    }
+
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
+    pic->pict_type = AV_PICTURE_TYPE_I;
+    pic->key_frame = 1;
+
+    td.buf = (uint8_t*)psrc;
+    td.frame = pic;
+    avctx->execute2(avctx, v210_decode_slice, &td, NULL, avctx->thread_count);
 
     if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
         /* we have interlaced material flagged in container */
@@ -193,6 +226,8 @@  AVCodec ff_v210_decoder = {
     .priv_data_size = sizeof(V210DecContext),
     .init           = decode_init,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 |
+                      AV_CODEC_CAP_SLICE_THREADS |
+                      AV_CODEC_CAP_FRAME_THREADS,
     .priv_class     = &v210dec_class,
 };