diff mbox

[FFmpeg-devel,v5] avcodec/v210dec: add the frame and slice threading support

Message ID 20191012105821.15523-1-lance.lmwang@gmail.com
State Superseded
Headers show

Commit Message

Lance Wang Oct. 12, 2019, 10:58 a.m. UTC
From: Limin Wang <lance.lmwang@gmail.com>

Multithreading avoids saturating a single CPU core when other filters such as
scale are running as well.
The performance gain is very small; my test results are below.
To avoid the disk bottleneck, I use stream_loop mode on a file of only 10
frames.

./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
~/Movies/1.v210

master:
./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
-f null -
frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
overhead: unknown
bench: utime=10.082s stime=13.784s rtime=23.889s
bench: maxrss=147836928kB

patch applied:
./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
~/Movies/1.v210 -benchmark -f null -

frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
overhead: unknown
bench: utime=11.407s stime=17.258s rtime=18.279s
bench: maxrss=442884096kB

Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
---
 libavcodec/v210dec.c | 132 +++++++++++++++++++++++++++----------------
 libavcodec/v210dec.h |   1 +
 2 files changed, 85 insertions(+), 48 deletions(-)

Comments

Jun Zhao Oct. 12, 2019, 11:36 a.m. UTC | #1
On Sat, Oct 12, 2019 at 6:58 PM <lance.lmwang@gmail.com> wrote:
>
> From: Limin Wang <lance.lmwang@gmail.com>
>
> The multithread is avoid one core cpu is full with other filter like scale etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
>
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> ~/Movies/1.v210
>
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
>
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
>
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
>
> Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> ---
>  libavcodec/v210dec.c | 132 +++++++++++++++++++++++++++----------------
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 85 insertions(+), 48 deletions(-)
>
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 5a33d8c089..c3ef8051e6 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>
>  #define READ_PIXELS(a, b, c)         \
>      do {                             \
> @@ -37,6 +38,12 @@
>          *c++ = (val >> 20) & 0x3FF;  \
>      } while (0)
>
> +typedef struct ThreadData {
> +    AVFrame *frame;
> +    uint8_t *buf;
> +    int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
>  {
>      uint32_t val;
> @@ -64,21 +71,87 @@ static av_cold int decode_init(AVCodecContext *avctx)
>      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
>      avctx->bits_per_raw_sample = 10;
>
> +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
>      s->aligned_input = 0;
>      ff_v210dec_init(s);
>
>      return 0;
>  }
>
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> +{
> +    V210DecContext *s = avctx->priv_data;
> +    int h, w;
> +    ThreadData *td = arg;
> +    AVFrame *frame = td->frame;
> +    int stride = td->stride;
> +    int slice_h = avctx->height / s->thread_count;
> +    int slice_m = avctx->height % s->thread_count;
> +    int slice_start = jobnr * slice_h;
> +    int slice_end = slice_start + slice_h;
> +    const uint8_t *psrc = td->buf + stride * slice_start;
> +    uint16_t *y, *u, *v;
> +
> +    /* add the remaining slice for the last job */
> +    if (jobnr == s->thread_count - 1)
> +        slice_end += slice_m;
> +
> +    y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
> +    u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
> +    v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2;
> +    for (h = slice_start; h < slice_end; h++) {
> +        const uint32_t *src = (const uint32_t*)psrc;
> +        uint32_t val;
> +
> +        w = (avctx->width / 12) * 12;
> +        s->unpack_frame(src, y, u, v, w);
> +
> +        y += w;
> +        u += w >> 1;
> +        v += w >> 1;
> +        src += (w << 1) / 3;
> +
> +        if (w < avctx->width - 5) {
> +            READ_PIXELS(u, y, v);
> +            READ_PIXELS(y, u, y);
> +            READ_PIXELS(v, y, u);
> +            READ_PIXELS(y, v, y);
> +            w += 6;
> +        }
> +
> +        if (w < avctx->width - 1) {
> +            READ_PIXELS(u, y, v);
> +
> +            val  = av_le2ne32(*src++);
> +            *y++ =  val & 0x3FF;
> +            if (w < avctx->width - 3) {
> +                *u++ = (val >> 10) & 0x3FF;
> +                *y++ = (val >> 20) & 0x3FF;
> +
> +                val  = av_le2ne32(*src++);
> +                *v++ =  val & 0x3FF;
> +                *y++ = (val >> 10) & 0x3FF;
> +            }
> +        }
> +
> +        psrc += stride;
> +        y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1);
> +        u += frame->linesize[1] / 2 - avctx->width / 2;
> +        v += frame->linesize[2] / 2 - avctx->width / 2;
> +    }
> +
> +    return 0;
> +}
> +
>  static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
>                          AVPacket *avpkt)
>  {
>      V210DecContext *s = avctx->priv_data;
> -
> -    int h, w, ret, stride, aligned_input;
> +    ThreadData td;
> +    int ret, stride, aligned_input;
> +    ThreadFrame frame = { .f = data };
>      AVFrame *pic = data;
>      const uint8_t *psrc = avpkt->data;
> -    uint16_t *y, *u, *v;
>
>      if (s->custom_stride )
>          stride = s->custom_stride;
> @@ -86,6 +159,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
>          int aligned_width = ((avctx->width + 47) / 48) * 48;
>          stride = aligned_width * 8 / 3;
>      }
> +    td.stride = stride;
>
>      if (avpkt->size < stride * avctx->height) {
>          if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
> @@ -110,55 +184,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
>          ff_v210dec_init(s);
>      }
>
> -    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> +    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
>          return ret;
>
> -    y = (uint16_t*)pic->data[0];
> -    u = (uint16_t*)pic->data[1];
> -    v = (uint16_t*)pic->data[2];
>      pic->pict_type = AV_PICTURE_TYPE_I;
>      pic->key_frame = 1;
>
> -    for (h = 0; h < avctx->height; h++) {
> -        const uint32_t *src = (const uint32_t*)psrc;
> -        uint32_t val;
> -
> -        w = (avctx->width / 12) * 12;
> -        s->unpack_frame(src, y, u, v, w);
> -
> -        y += w;
> -        u += w >> 1;
> -        v += w >> 1;
> -        src += (w << 1) / 3;
> -
> -        if (w < avctx->width - 5) {
> -            READ_PIXELS(u, y, v);
> -            READ_PIXELS(y, u, y);
> -            READ_PIXELS(v, y, u);
> -            READ_PIXELS(y, v, y);
> -            w += 6;
> -        }
> -
> -        if (w < avctx->width - 1) {
> -            READ_PIXELS(u, y, v);
> -
> -            val  = av_le2ne32(*src++);
> -            *y++ =  val & 0x3FF;
> -            if (w < avctx->width - 3) {
> -                *u++ = (val >> 10) & 0x3FF;
> -                *y++ = (val >> 20) & 0x3FF;
> -
> -                val  = av_le2ne32(*src++);
> -                *v++ =  val & 0x3FF;
> -                *y++ = (val >> 10) & 0x3FF;
> -            }
> -        }
> -
> -        psrc += stride;
> -        y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
> -        u += pic->linesize[1] / 2 - avctx->width / 2;
> -        v += pic->linesize[2] / 2 - avctx->width / 2;
> -    }
> +    td.buf = (uint8_t*)psrc;
> +    td.frame = pic;
> +    avctx->execute2(avctx, v210_decode_slice, &td, NULL, s->thread_count);
>
>      if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
>          /* we have interlaced material flagged in container */
> @@ -194,6 +228,8 @@ AVCodec ff_v210_decoder = {
>      .priv_data_size = sizeof(V210DecContext),
>      .init           = decode_init,
>      .decode         = decode_frame,
> -    .capabilities   = AV_CODEC_CAP_DR1,
> +    .capabilities   = AV_CODEC_CAP_DR1 |
> +                      AV_CODEC_CAP_SLICE_THREADS |
> +                      AV_CODEC_CAP_FRAME_THREADS,

I only saw the slice-threading acceleration and did not see the frame-threading
acceleration; is there something I missed?
>      .priv_class     = &v210dec_class,
>  };
> diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
> index cfdb29da09..662e266315 100644
> --- a/libavcodec/v210dec.h
> +++ b/libavcodec/v210dec.h
> @@ -27,6 +27,7 @@ typedef struct {
>      AVClass *av_class;
>      int custom_stride;
>      int aligned_input;
> +    int thread_count;
>      int stride_warning_shown;
>      void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
>  } V210DecContext;
> --
Michael Niedermayer Oct. 12, 2019, 9:46 p.m. UTC | #2
On Sat, Oct 12, 2019 at 06:58:21PM +0800, lance.lmwang@gmail.com wrote:
> From: Limin Wang <lance.lmwang@gmail.com>
> 
> The multithread is avoid one core cpu is full with other filter like scale etc.
> About the performance, the gain is very small, below is my testing for
> performance.
> In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> only.
> 
> ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> ~/Movies/1.v210
> 
> master:
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=10.082s stime=13.784s rtime=23.889s
> bench: maxrss=147836928kB
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> ~/Movies/1.v210 -benchmark -f null -
> 
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> overhead: unknown
> bench: utime=11.407s stime=17.258s rtime=18.279s
> bench: maxrss=442884096kB
> 
> Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> ---
>  libavcodec/v210dec.c | 132 +++++++++++++++++++++++++++----------------
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 85 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 5a33d8c089..c3ef8051e6 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c)         \
>      do {                             \
> @@ -37,6 +38,12 @@
>          *c++ = (val >> 20) & 0x3FF;  \
>      } while (0)
>  
> +typedef struct ThreadData {
> +    AVFrame *frame;
> +    uint8_t *buf;
> +    int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
>  {
>      uint32_t val;
> @@ -64,21 +71,87 @@ static av_cold int decode_init(AVCodecContext *avctx)
>      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
>      avctx->bits_per_raw_sample = 10;
>  
> +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
>      s->aligned_input = 0;
>      ff_v210dec_init(s);
>  
>      return 0;
>  }
>  
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> +{
> +    V210DecContext *s = avctx->priv_data;
> +    int h, w;
> +    ThreadData *td = arg;
> +    AVFrame *frame = td->frame;
> +    int stride = td->stride;
> +    int slice_h = avctx->height / s->thread_count;
> +    int slice_m = avctx->height % s->thread_count;
> +    int slice_start = jobnr * slice_h;

This is still not correct.
The height of a slice is not the same for all slices if the frame height
is not divisible by the number of slices.


[...]
Lance Wang Oct. 12, 2019, 10:37 p.m. UTC | #3
On Sat, Oct 12, 2019 at 07:36:29PM +0800, mypopy@gmail.com wrote:
> On Sat, Oct 12, 2019 at 6:58 PM <lance.lmwang@gmail.com> wrote:
> >
> > From: Limin Wang <lance.lmwang@gmail.com>
> >
> > The multithread is avoid one core cpu is full with other filter like scale etc.
> > About the performance, the gain is very small, below is my testing for
> > performance.
> > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> > only.
> >
> > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> > ~/Movies/1.v210
> >
> > master:
> > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> > -f null -
> > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > overhead: unknown
> > bench: utime=10.082s stime=13.784s rtime=23.889s
> > bench: maxrss=147836928kB
> >
> > patch applied:
> > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> > ~/Movies/1.v210 -benchmark -f null -
> >
> > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > overhead: unknown
> > bench: utime=11.407s stime=17.258s rtime=18.279s
> > bench: maxrss=442884096kB
> >
> > Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> > ---
> >  libavcodec/v210dec.c | 132 +++++++++++++++++++++++++++----------------
> >  libavcodec/v210dec.h |   1 +
> >  2 files changed, 85 insertions(+), 48 deletions(-)
> >
> > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > index 5a33d8c089..c3ef8051e6 100644
> > --- a/libavcodec/v210dec.c
> > +++ b/libavcodec/v210dec.c
> > @@ -28,6 +28,7 @@
> >  #include "libavutil/internal.h"
> >  #include "libavutil/mem.h"
> >  #include "libavutil/intreadwrite.h"
> > +#include "thread.h"
> >
> >  #define READ_PIXELS(a, b, c)         \
> >      do {                             \
> > @@ -37,6 +38,12 @@
> >          *c++ = (val >> 20) & 0x3FF;  \
> >      } while (0)
> >
> > +typedef struct ThreadData {
> > +    AVFrame *frame;
> > +    uint8_t *buf;
> > +    int stride;
> > +} ThreadData;
> > +
> >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
> >  {
> >      uint32_t val;
> > @@ -64,21 +71,87 @@ static av_cold int decode_init(AVCodecContext *avctx)
> >      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
> >      avctx->bits_per_raw_sample = 10;
> >
> > +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
> >      s->aligned_input = 0;
> >      ff_v210dec_init(s);
> >
> >      return 0;
> >  }
> >
> > +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> > +{
> > +    V210DecContext *s = avctx->priv_data;
> > +    int h, w;
> > +    ThreadData *td = arg;
> > +    AVFrame *frame = td->frame;
> > +    int stride = td->stride;
> > +    int slice_h = avctx->height / s->thread_count;
> > +    int slice_m = avctx->height % s->thread_count;
> > +    int slice_start = jobnr * slice_h;
> > +    int slice_end = slice_start + slice_h;
> > +    const uint8_t *psrc = td->buf + stride * slice_start;
> > +    uint16_t *y, *u, *v;
> > +
> > +    /* add the remaining slice for the last job */
> > +    if (jobnr == s->thread_count - 1)
> > +        slice_end += slice_m;
> > +
> > +    y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
> > +    u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
> > +    v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2;
> > +    for (h = slice_start; h < slice_end; h++) {
> > +        const uint32_t *src = (const uint32_t*)psrc;
> > +        uint32_t val;
> > +
> > +        w = (avctx->width / 12) * 12;
> > +        s->unpack_frame(src, y, u, v, w);
> > +
> > +        y += w;
> > +        u += w >> 1;
> > +        v += w >> 1;
> > +        src += (w << 1) / 3;
> > +
> > +        if (w < avctx->width - 5) {
> > +            READ_PIXELS(u, y, v);
> > +            READ_PIXELS(y, u, y);
> > +            READ_PIXELS(v, y, u);
> > +            READ_PIXELS(y, v, y);
> > +            w += 6;
> > +        }
> > +
> > +        if (w < avctx->width - 1) {
> > +            READ_PIXELS(u, y, v);
> > +
> > +            val  = av_le2ne32(*src++);
> > +            *y++ =  val & 0x3FF;
> > +            if (w < avctx->width - 3) {
> > +                *u++ = (val >> 10) & 0x3FF;
> > +                *y++ = (val >> 20) & 0x3FF;
> > +
> > +                val  = av_le2ne32(*src++);
> > +                *v++ =  val & 0x3FF;
> > +                *y++ = (val >> 10) & 0x3FF;
> > +            }
> > +        }
> > +
> > +        psrc += stride;
> > +        y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1);
> > +        u += frame->linesize[1] / 2 - avctx->width / 2;
> > +        v += frame->linesize[2] / 2 - avctx->width / 2;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> >                          AVPacket *avpkt)
> >  {
> >      V210DecContext *s = avctx->priv_data;
> > -
> > -    int h, w, ret, stride, aligned_input;
> > +    ThreadData td;
> > +    int ret, stride, aligned_input;
> > +    ThreadFrame frame = { .f = data };
> >      AVFrame *pic = data;
> >      const uint8_t *psrc = avpkt->data;
> > -    uint16_t *y, *u, *v;
> >
> >      if (s->custom_stride )
> >          stride = s->custom_stride;
> > @@ -86,6 +159,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> >          int aligned_width = ((avctx->width + 47) / 48) * 48;
> >          stride = aligned_width * 8 / 3;
> >      }
> > +    td.stride = stride;
> >
> >      if (avpkt->size < stride * avctx->height) {
> >          if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
> > @@ -110,55 +184,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
> >          ff_v210dec_init(s);
> >      }
> >
> > -    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> > +    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
> >          return ret;
> >
> > -    y = (uint16_t*)pic->data[0];
> > -    u = (uint16_t*)pic->data[1];
> > -    v = (uint16_t*)pic->data[2];
> >      pic->pict_type = AV_PICTURE_TYPE_I;
> >      pic->key_frame = 1;
> >
> > -    for (h = 0; h < avctx->height; h++) {
> > -        const uint32_t *src = (const uint32_t*)psrc;
> > -        uint32_t val;
> > -
> > -        w = (avctx->width / 12) * 12;
> > -        s->unpack_frame(src, y, u, v, w);
> > -
> > -        y += w;
> > -        u += w >> 1;
> > -        v += w >> 1;
> > -        src += (w << 1) / 3;
> > -
> > -        if (w < avctx->width - 5) {
> > -            READ_PIXELS(u, y, v);
> > -            READ_PIXELS(y, u, y);
> > -            READ_PIXELS(v, y, u);
> > -            READ_PIXELS(y, v, y);
> > -            w += 6;
> > -        }
> > -
> > -        if (w < avctx->width - 1) {
> > -            READ_PIXELS(u, y, v);
> > -
> > -            val  = av_le2ne32(*src++);
> > -            *y++ =  val & 0x3FF;
> > -            if (w < avctx->width - 3) {
> > -                *u++ = (val >> 10) & 0x3FF;
> > -                *y++ = (val >> 20) & 0x3FF;
> > -
> > -                val  = av_le2ne32(*src++);
> > -                *v++ =  val & 0x3FF;
> > -                *y++ = (val >> 10) & 0x3FF;
> > -            }
> > -        }
> > -
> > -        psrc += stride;
> > -        y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
> > -        u += pic->linesize[1] / 2 - avctx->width / 2;
> > -        v += pic->linesize[2] / 2 - avctx->width / 2;
> > -    }
> > +    td.buf = (uint8_t*)psrc;
> > +    td.frame = pic;
> > +    avctx->execute2(avctx, v210_decode_slice, &td, NULL, s->thread_count);
> >
> >      if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
> >          /* we have interlaced material flagged in container */
> > @@ -194,6 +228,8 @@ AVCodec ff_v210_decoder = {
> >      .priv_data_size = sizeof(V210DecContext),
> >      .init           = decode_init,
> >      .decode         = decode_frame,
> > -    .capabilities   = AV_CODEC_CAP_DR1,
> > +    .capabilities   = AV_CODEC_CAP_DR1 |
> > +                      AV_CODEC_CAP_SLICE_THREADS |
> > +                      AV_CODEC_CAP_FRAME_THREADS,
> 
> I only saw the slice thread acceleration, did not see the frame thread
> acceleration, is it something I missed?

Below is the testing result for comparing:
master:
./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark -f null -
frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x

patch applied:
1. slice threading
./ffmpeg -threads 4 -thread_type slice  -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark -f null -
frame= 1010 fps= 45 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.81x

2. frame threading
./ffmpeg -threads 4 -thread_type frame  -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark -f null -
frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.09x

3. frame+slice threading
./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark -f null -
frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.19

I have no clue why the slice-threading improvement is so small; maybe I'm misunderstanding how to use the thread API,
but the testing does show an improvement.


Thanks,
Limin


> >      .priv_class     = &v210dec_class,
> >  };
> > diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
> > index cfdb29da09..662e266315 100644
> > --- a/libavcodec/v210dec.h
> > +++ b/libavcodec/v210dec.h
> > @@ -27,6 +27,7 @@ typedef struct {
> >      AVClass *av_class;
> >      int custom_stride;
> >      int aligned_input;
> > +    int thread_count;
> >      int stride_warning_shown;
> >      void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
> >  } V210DecContext;
> > --
Lance Wang Oct. 12, 2019, 10:45 p.m. UTC | #4
On Sat, Oct 12, 2019 at 11:46:58PM +0200, Michael Niedermayer wrote:
> On Sat, Oct 12, 2019 at 06:58:21PM +0800, lance.lmwang@gmail.com wrote:
> > From: Limin Wang <lance.lmwang@gmail.com>
> > 
> > The multithread is avoid one core cpu is full with other filter like scale etc.
> > About the performance, the gain is very small, below is my testing for
> > performance.
> > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> > only.
> > 
> > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> > ~/Movies/1.v210
> > 
> > master:
> > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> > -f null -
> > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > overhead: unknown
> > bench: utime=10.082s stime=13.784s rtime=23.889s
> > bench: maxrss=147836928kB
> > 
> > patch applied:
> > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> > ~/Movies/1.v210 -benchmark -f null -
> > 
> > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > overhead: unknown
> > bench: utime=11.407s stime=17.258s rtime=18.279s
> > bench: maxrss=442884096kB
> > 
> > Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> > ---
> >  libavcodec/v210dec.c | 132 +++++++++++++++++++++++++++----------------
> >  libavcodec/v210dec.h |   1 +
> >  2 files changed, 85 insertions(+), 48 deletions(-)
> > 
> > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > index 5a33d8c089..c3ef8051e6 100644
> > --- a/libavcodec/v210dec.c
> > +++ b/libavcodec/v210dec.c
> > @@ -28,6 +28,7 @@
> >  #include "libavutil/internal.h"
> >  #include "libavutil/mem.h"
> >  #include "libavutil/intreadwrite.h"
> > +#include "thread.h"
> >  
> >  #define READ_PIXELS(a, b, c)         \
> >      do {                             \
> > @@ -37,6 +38,12 @@
> >          *c++ = (val >> 20) & 0x3FF;  \
> >      } while (0)
> >  
> > +typedef struct ThreadData {
> > +    AVFrame *frame;
> > +    uint8_t *buf;
> > +    int stride;
> > +} ThreadData;
> > +
> >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
> >  {
> >      uint32_t val;
> > @@ -64,21 +71,87 @@ static av_cold int decode_init(AVCodecContext *avctx)
> >      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
> >      avctx->bits_per_raw_sample = 10;
> >  
> > +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
> >      s->aligned_input = 0;
> >      ff_v210dec_init(s);
> >  
> >      return 0;
> >  }
> >  
> > +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> > +{
> > +    V210DecContext *s = avctx->priv_data;
> > +    int h, w;
> > +    ThreadData *td = arg;
> > +    AVFrame *frame = td->frame;
> > +    int stride = td->stride;
> > +    int slice_h = avctx->height / s->thread_count;
> > +    int slice_m = avctx->height % s->thread_count;
> > +    int slice_start = jobnr * slice_h;
> 
> this is still not correct
> the height of a slice is not the same for all slices if the frame height
> is not divisible by the number of slices
Yes, so the last slice is handled differently by the code that follows. I have tested with
different thread counts to verify the fate-v210 result.

make fate-v210 SAMPLES=../fate-suite thread_type=frame+slice threads=[1-7]

> 
> 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> No snowflake in an avalanche ever feels responsible. -- Voltaire



> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Michael Niedermayer Oct. 14, 2019, 8:12 p.m. UTC | #5
On Sun, Oct 13, 2019 at 06:45:16AM +0800, Limin Wang wrote:
> On Sat, Oct 12, 2019 at 11:46:58PM +0200, Michael Niedermayer wrote:
> > On Sat, Oct 12, 2019 at 06:58:21PM +0800, lance.lmwang@gmail.com wrote:
> > > From: Limin Wang <lance.lmwang@gmail.com>
> > > 
> > > The multithread is avoid one core cpu is full with other filter like scale etc.
> > > About the performance, the gain is very small, below is my testing for
> > > performance.
> > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> > > only.
> > > 
> > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> > > ~/Movies/1.v210
> > > 
> > > master:
> > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> > > -f null -
> > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > > overhead: unknown
> > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > bench: maxrss=147836928kB
> > > 
> > > patch applied:
> > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> > > ~/Movies/1.v210 -benchmark -f null -
> > > 
> > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > > overhead: unknown
> > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > bench: maxrss=442884096kB
> > > 
> > > Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> > > ---
> > >  libavcodec/v210dec.c | 132 +++++++++++++++++++++++++++----------------
> > >  libavcodec/v210dec.h |   1 +
> > >  2 files changed, 85 insertions(+), 48 deletions(-)
> > > 
> > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > index 5a33d8c089..c3ef8051e6 100644
> > > --- a/libavcodec/v210dec.c
> > > +++ b/libavcodec/v210dec.c
> > > @@ -28,6 +28,7 @@
> > >  #include "libavutil/internal.h"
> > >  #include "libavutil/mem.h"
> > >  #include "libavutil/intreadwrite.h"
> > > +#include "thread.h"
> > >  
> > >  #define READ_PIXELS(a, b, c)         \
> > >      do {                             \
> > > @@ -37,6 +38,12 @@
> > >          *c++ = (val >> 20) & 0x3FF;  \
> > >      } while (0)
> > >  
> > > +typedef struct ThreadData {
> > > +    AVFrame *frame;
> > > +    uint8_t *buf;
> > > +    int stride;
> > > +} ThreadData;
> > > +
> > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
> > >  {
> > >      uint32_t val;
> > > @@ -64,21 +71,87 @@ static av_cold int decode_init(AVCodecContext *avctx)
> > >      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
> > >      avctx->bits_per_raw_sample = 10;
> > >  
> > > +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
> > >      s->aligned_input = 0;
> > >      ff_v210dec_init(s);
> > >  
> > >      return 0;
> > >  }
> > >  
> > > +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> > > +{
> > > +    V210DecContext *s = avctx->priv_data;
> > > +    int h, w;
> > > +    ThreadData *td = arg;
> > > +    AVFrame *frame = td->frame;
> > > +    int stride = td->stride;
> > > +    int slice_h = avctx->height / s->thread_count;
> > > +    int slice_m = avctx->height % s->thread_count;
> > > +    int slice_start = jobnr * slice_h;
> > 
> > this is still not correct
> > the height of a slice is not the same for all slices if the frame height
> > is not divisible by the number of slices
> Yes, so the last slice is processed different by the following code. I have tested with
> different thread number to verify the fate-v210 result. 
> 
> make fate-v210 SAMPLES=../fate-suite thread_type=frame+slice threads=[1-7]

What the code should do is split the frame evenly, not
have many small slices and a really large last one.

A really large last slice would take much longer to process, making
the work distribution uneven.

thx

[...]
Lance Wang Oct. 15, 2019, 1:21 a.m. UTC | #6
On Mon, Oct 14, 2019 at 10:12:45PM +0200, Michael Niedermayer wrote:
> On Sun, Oct 13, 2019 at 06:45:16AM +0800, Limin Wang wrote:
> > On Sat, Oct 12, 2019 at 11:46:58PM +0200, Michael Niedermayer wrote:
> > > On Sat, Oct 12, 2019 at 06:58:21PM +0800, lance.lmwang@gmail.com wrote:
> > > > From: Limin Wang <lance.lmwang@gmail.com>
> > > > 
> > > > The multithread is avoid one core cpu is full with other filter like scale etc.
> > > > About the performance, the gain is very small, below is my testing for
> > > > performance.
> > > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> > > > only.
> > > > 
> > > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> > > > ~/Movies/1.v210
> > > > 
> > > > master:
> > > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> > > > -f null -
> > > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > > > overhead: unknown
> > > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > > bench: maxrss=147836928kB
> > > > 
> > > > patch applied:
> > > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> > > > ~/Movies/1.v210 -benchmark -f null -
> > > > 
> > > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > > > overhead: unknown
> > > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > > bench: maxrss=442884096kB
> > > > 
> > > > Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> > > > ---
> > > >  libavcodec/v210dec.c | 132 +++++++++++++++++++++++++++----------------
> > > >  libavcodec/v210dec.h |   1 +
> > > >  2 files changed, 85 insertions(+), 48 deletions(-)
> > > > 
> > > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > > index 5a33d8c089..c3ef8051e6 100644
> > > > --- a/libavcodec/v210dec.c
> > > > +++ b/libavcodec/v210dec.c
> > > > @@ -28,6 +28,7 @@
> > > >  #include "libavutil/internal.h"
> > > >  #include "libavutil/mem.h"
> > > >  #include "libavutil/intreadwrite.h"
> > > > +#include "thread.h"
> > > >  
> > > >  #define READ_PIXELS(a, b, c)         \
> > > >      do {                             \
> > > > @@ -37,6 +38,12 @@
> > > >          *c++ = (val >> 20) & 0x3FF;  \
> > > >      } while (0)
> > > >  
> > > > +typedef struct ThreadData {
> > > > +    AVFrame *frame;
> > > > +    uint8_t *buf;
> > > > +    int stride;
> > > > +} ThreadData;
> > > > +
> > > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
> > > >  {
> > > >      uint32_t val;
> > > > @@ -64,21 +71,87 @@ static av_cold int decode_init(AVCodecContext *avctx)
> > > >      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
> > > >      avctx->bits_per_raw_sample = 10;
> > > >  
> > > > +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
> > > >      s->aligned_input = 0;
> > > >      ff_v210dec_init(s);
> > > >  
> > > >      return 0;
> > > >  }
> > > >  
> > > > +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> > > > +{
> > > > +    V210DecContext *s = avctx->priv_data;
> > > > +    int h, w;
> > > > +    ThreadData *td = arg;
> > > > +    AVFrame *frame = td->frame;
> > > > +    int stride = td->stride;
> > > > +    int slice_h = avctx->height / s->thread_count;
> > > > +    int slice_m = avctx->height % s->thread_count;
> > > > +    int slice_start = jobnr * slice_h;
> > > 
> > > this is still not correct
> > > the height of a slice is not the same for all slices if the frame height
> > > is not divisible by the number of slices
> > Yes, so the last slice is processed different by the following code. I have tested with
> > different thread number to verify the fate-v210 result. 
> > 
> > make fate-v210 SAMPLES=../fate-suite thread_type=frame+slice threads=[1-7]
> 
> What the code should do is split the frame evenly not
> have many small slices and a really large last one
> 
> a really large last slice would need much longer to be processed making
> the work distribution uneven

That's true. Most of my testing was at >=4K resolution; for other
resolutions, the processing is very fast and there is no need to enable
threading. I'll split the frame evenly, as you suggested.


> 
> thx
> 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> If you fake or manipulate statistics in a paper in physics you will never
> get a job again.
> If you fake or manipulate statistics in a paper in medicin you will get
> a job for life at the pharma industry.



> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Lance Wang Oct. 15, 2019, 10:41 a.m. UTC | #7
On Mon, Oct 14, 2019 at 10:12:45PM +0200, Michael Niedermayer wrote:
> On Sun, Oct 13, 2019 at 06:45:16AM +0800, Limin Wang wrote:
> > On Sat, Oct 12, 2019 at 11:46:58PM +0200, Michael Niedermayer wrote:
> > > On Sat, Oct 12, 2019 at 06:58:21PM +0800, lance.lmwang@gmail.com wrote:
> > > > From: Limin Wang <lance.lmwang@gmail.com>
> > > > 
> > > > The multithread is avoid one core cpu is full with other filter like scale etc.
> > > > About the performance, the gain is very small, below is my testing for
> > > > performance.
> > > > In order to avoid the disk bottleneck, I'll use stream_loop mode for 10 frame
> > > > only.
> > > > 
> > > > ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10
> > > > ~/Movies/1.v210
> > > > 
> > > > master:
> > > > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark
> > > > -f null -
> > > > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > > > overhead: unknown
> > > > bench: utime=10.082s stime=13.784s rtime=23.889s
> > > > bench: maxrss=147836928kB
> > > > 
> > > > patch applied:
> > > > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i
> > > > ~/Movies/1.v210 -benchmark -f null -
> > > > 
> > > > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> > > > video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing
> > > > overhead: unknown
> > > > bench: utime=11.407s stime=17.258s rtime=18.279s
> > > > bench: maxrss=442884096kB
> > > > 
> > > > Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> > > > ---
> > > >  libavcodec/v210dec.c | 132 +++++++++++++++++++++++++++----------------
> > > >  libavcodec/v210dec.h |   1 +
> > > >  2 files changed, 85 insertions(+), 48 deletions(-)
> > > > 
> > > > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > > > index 5a33d8c089..c3ef8051e6 100644
> > > > --- a/libavcodec/v210dec.c
> > > > +++ b/libavcodec/v210dec.c
> > > > @@ -28,6 +28,7 @@
> > > >  #include "libavutil/internal.h"
> > > >  #include "libavutil/mem.h"
> > > >  #include "libavutil/intreadwrite.h"
> > > > +#include "thread.h"
> > > >  
> > > >  #define READ_PIXELS(a, b, c)         \
> > > >      do {                             \
> > > > @@ -37,6 +38,12 @@
> > > >          *c++ = (val >> 20) & 0x3FF;  \
> > > >      } while (0)
> > > >  
> > > > +typedef struct ThreadData {
> > > > +    AVFrame *frame;
> > > > +    uint8_t *buf;
> > > > +    int stride;
> > > > +} ThreadData;
> > > > +
> > > >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
> > > >  {
> > > >      uint32_t val;
> > > > @@ -64,21 +71,87 @@ static av_cold int decode_init(AVCodecContext *avctx)
> > > >      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
> > > >      avctx->bits_per_raw_sample = 10;
> > > >  
> > > > +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
> > > >      s->aligned_input = 0;
> > > >      ff_v210dec_init(s);
> > > >  
> > > >      return 0;
> > > >  }
> > > >  
> > > > +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> > > > +{
> > > > +    V210DecContext *s = avctx->priv_data;
> > > > +    int h, w;
> > > > +    ThreadData *td = arg;
> > > > +    AVFrame *frame = td->frame;
> > > > +    int stride = td->stride;
> > > > +    int slice_h = avctx->height / s->thread_count;
> > > > +    int slice_m = avctx->height % s->thread_count;
> > > > +    int slice_start = jobnr * slice_h;
> > > 
> > > this is still not correct
> > > the height of a slice is not the same for all slices if the frame height
> > > is not divisible by the number of slices
> > Yes, so the last slice is processed different by the following code. I have tested with
> > different thread number to verify the fate-v210 result. 
> > 
> > make fate-v210 SAMPLES=../fate-suite thread_type=frame+slice threads=[1-7]
> 
> What the code should do is split the frame evenly not
> have many small slices and a really large last one
> 
> a really large last slice would need much longer to be processed making
> the work distribution uneven

Michael, I have updated the patch to split the frame evenly. Please review it.
I have run FATE with thread counts from 1 to 18 to check the result.

In addition, I updated the commit message with more performance data.


> 
> thx
> 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> If you fake or manipulate statistics in a paper in physics you will never
> get a job again.
> If you fake or manipulate statistics in a paper in medicin you will get
> a job for life at the pharma industry.



> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox

Patch

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index 5a33d8c089..c3ef8051e6 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -28,6 +28,7 @@ 
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/intreadwrite.h"
+#include "thread.h"
 
 #define READ_PIXELS(a, b, c)         \
     do {                             \
@@ -37,6 +38,12 @@ 
         *c++ = (val >> 20) & 0x3FF;  \
     } while (0)
 
+typedef struct ThreadData {
+    AVFrame *frame;
+    uint8_t *buf;
+    int stride;
+} ThreadData;
+
 static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
 {
     uint32_t val;
@@ -64,21 +71,87 @@  static av_cold int decode_init(AVCodecContext *avctx)
     avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
     avctx->bits_per_raw_sample = 10;
 
+    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
     s->aligned_input = 0;
     ff_v210dec_init(s);
 
     return 0;
 }
 
+static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
+{
+    V210DecContext *s = avctx->priv_data;
+    int h, w;
+    ThreadData *td = arg;
+    AVFrame *frame = td->frame;
+    int stride = td->stride;
+    int slice_h = avctx->height / s->thread_count;
+    int slice_m = avctx->height % s->thread_count;
+    int slice_start = jobnr * slice_h;
+    int slice_end = slice_start + slice_h;
+    const uint8_t *psrc = td->buf + stride * slice_start;
+    uint16_t *y, *u, *v;
+
+    /* add the remaining slice for the last job */
+    if (jobnr == s->thread_count - 1)
+        slice_end += slice_m;
+
+    y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
+    u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
+    v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2;
+    for (h = slice_start; h < slice_end; h++) {
+        const uint32_t *src = (const uint32_t*)psrc;
+        uint32_t val;
+
+        w = (avctx->width / 12) * 12;
+        s->unpack_frame(src, y, u, v, w);
+
+        y += w;
+        u += w >> 1;
+        v += w >> 1;
+        src += (w << 1) / 3;
+
+        if (w < avctx->width - 5) {
+            READ_PIXELS(u, y, v);
+            READ_PIXELS(y, u, y);
+            READ_PIXELS(v, y, u);
+            READ_PIXELS(y, v, y);
+            w += 6;
+        }
+
+        if (w < avctx->width - 1) {
+            READ_PIXELS(u, y, v);
+
+            val  = av_le2ne32(*src++);
+            *y++ =  val & 0x3FF;
+            if (w < avctx->width - 3) {
+                *u++ = (val >> 10) & 0x3FF;
+                *y++ = (val >> 20) & 0x3FF;
+
+                val  = av_le2ne32(*src++);
+                *v++ =  val & 0x3FF;
+                *y++ = (val >> 10) & 0x3FF;
+            }
+        }
+
+        psrc += stride;
+        y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1);
+        u += frame->linesize[1] / 2 - avctx->width / 2;
+        v += frame->linesize[2] / 2 - avctx->width / 2;
+    }
+
+    return 0;
+}
+
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
     V210DecContext *s = avctx->priv_data;
-
-    int h, w, ret, stride, aligned_input;
+    ThreadData td;
+    int ret, stride, aligned_input;
+    ThreadFrame frame = { .f = data };
     AVFrame *pic = data;
     const uint8_t *psrc = avpkt->data;
-    uint16_t *y, *u, *v;
 
     if (s->custom_stride )
         stride = s->custom_stride;
@@ -86,6 +159,7 @@  static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         int aligned_width = ((avctx->width + 47) / 48) * 48;
         stride = aligned_width * 8 / 3;
     }
+    td.stride = stride;
 
     if (avpkt->size < stride * avctx->height) {
         if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
@@ -110,55 +184,15 @@  static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         ff_v210dec_init(s);
     }
 
-    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
 
-    y = (uint16_t*)pic->data[0];
-    u = (uint16_t*)pic->data[1];
-    v = (uint16_t*)pic->data[2];
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
 
-    for (h = 0; h < avctx->height; h++) {
-        const uint32_t *src = (const uint32_t*)psrc;
-        uint32_t val;
-
-        w = (avctx->width / 12) * 12;
-        s->unpack_frame(src, y, u, v, w);
-
-        y += w;
-        u += w >> 1;
-        v += w >> 1;
-        src += (w << 1) / 3;
-
-        if (w < avctx->width - 5) {
-            READ_PIXELS(u, y, v);
-            READ_PIXELS(y, u, y);
-            READ_PIXELS(v, y, u);
-            READ_PIXELS(y, v, y);
-            w += 6;
-        }
-
-        if (w < avctx->width - 1) {
-            READ_PIXELS(u, y, v);
-
-            val  = av_le2ne32(*src++);
-            *y++ =  val & 0x3FF;
-            if (w < avctx->width - 3) {
-                *u++ = (val >> 10) & 0x3FF;
-                *y++ = (val >> 20) & 0x3FF;
-
-                val  = av_le2ne32(*src++);
-                *v++ =  val & 0x3FF;
-                *y++ = (val >> 10) & 0x3FF;
-            }
-        }
-
-        psrc += stride;
-        y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
-        u += pic->linesize[1] / 2 - avctx->width / 2;
-        v += pic->linesize[2] / 2 - avctx->width / 2;
-    }
+    td.buf = (uint8_t*)psrc;
+    td.frame = pic;
+    avctx->execute2(avctx, v210_decode_slice, &td, NULL, s->thread_count);
 
     if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
         /* we have interlaced material flagged in container */
@@ -194,6 +228,8 @@  AVCodec ff_v210_decoder = {
     .priv_data_size = sizeof(V210DecContext),
     .init           = decode_init,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 |
+                      AV_CODEC_CAP_SLICE_THREADS |
+                      AV_CODEC_CAP_FRAME_THREADS,
     .priv_class     = &v210dec_class,
 };
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index cfdb29da09..662e266315 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -27,6 +27,7 @@  typedef struct {
     AVClass *av_class;
     int custom_stride;
     int aligned_input;
+    int thread_count;
     int stride_warning_shown;
     void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 } V210DecContext;