[FFmpeg-devel,v6] avcodec/v210dec: add the frame and slice threading support

Submitted by lance.lmwang@gmail.com on Oct. 15, 2019, 10:37 a.m.

Details

Message ID 20191015103710.22099-1-lance.lmwang@gmail.com
State New
Headers show

Commit Message

lance.lmwang@gmail.com Oct. 15, 2019, 10:37 a.m.
From: Limin Wang <lance.lmwang@gmail.com>

Threading avoids a single CPU core being fully occupied when decoding is combined
with other filters such as scale. Regarding performance: if your CPU frequency is very
high, the gain is small, but on CPUs with more cores and lower clock speeds the
improvement is larger.

The following are my performance test results on two different systems:
1. Test results on my old Mac Pro:
./ffmpeg -y -i ./4k_4096_3072.mov -c:v v210 -f rawvideo -frames 10 ./1.v210

./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ./1.v210 -benchmark -f null -
frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x

patch applied:
./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i ./1.v210 -benchmark -f null -
frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x

2. Test results on an x86 server (Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz):
./ffmpeg  -y -i ./4k_3840_2160.ts -c:v v210 -f rawvideo -frames 50 ./2.v210

./ffmpeg -threads 1 -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
frame= 1050 fps= 80 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=3.19x

patch applied:
./ffmpeg -threads 2 -thread_type frame+slice  -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
frame= 1050 fps=111 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=4.45x

./ffmpeg -threads 4 -thread_type frame+slice  -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
frame= 1050 fps=145 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=5.81x

Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
---
 libavcodec/v210dec.c | 135 ++++++++++++++++++++++++++++---------------
 libavcodec/v210dec.h |   1 +
 2 files changed, 88 insertions(+), 48 deletions(-)

Comments

Michael Niedermayer Oct. 17, 2019, 4:43 p.m.
On Tue, Oct 15, 2019 at 06:37:10PM +0800, lance.lmwang@gmail.com wrote:
> From: Limin Wang <lance.lmwang@gmail.com>
> 
> Threading is to avoid a core cpu being occupied fully with other filters like scale,
> regarding performance, if your cpu frequency is very high, the gain is very small, but
> with more cores and fewer cpu MHz cpus, you will get more improvements.
> 
> The following is my testing results of performance on two different system:
> 1, testing result with my old mac pro
> ./ffmpeg -y -i ./4k_4096_3072.mov -c:v v210 -f rawvideo -frames 10 ./1.v210
> 
> ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ./1.v210 -benchmark -f null -
> frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> 
> patch applied:
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i ./1.v210 -benchmark -f null -
> frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> 
> 2, testing result with x86 server (Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz):
> ./ffmpeg  -y -i ./4k_3840_2160.ts -c:v v210 -f rawvideo -frames 50 ./2.v210
> 
> ./ffmpeg -threads 1 -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
> frame= 1050 fps= 80 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=3.19x
> 
> patch applied:
> ./ffmpeg -threads 2 -thread_type frame+slice  -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
> frame= 1050 fps=111 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=4.45x
> 
> ./ffmpeg -threads 4 -thread_type frame+slice  -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
> frame= 1050 fps=145 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=5.81x
> 
> Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> ---
>  libavcodec/v210dec.c | 135 ++++++++++++++++++++++++++++---------------
>  libavcodec/v210dec.h |   1 +
>  2 files changed, 88 insertions(+), 48 deletions(-)
> 
> diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> index 5a33d8c089..91c2fe0d07 100644
> --- a/libavcodec/v210dec.c
> +++ b/libavcodec/v210dec.c
> @@ -28,6 +28,7 @@
>  #include "libavutil/internal.h"
>  #include "libavutil/mem.h"
>  #include "libavutil/intreadwrite.h"
> +#include "thread.h"
>  
>  #define READ_PIXELS(a, b, c)         \
>      do {                             \
> @@ -37,6 +38,12 @@
>          *c++ = (val >> 20) & 0x3FF;  \
>      } while (0)
>  
> +typedef struct ThreadData {
> +    AVFrame *frame;
> +    uint8_t *buf;
> +    int stride;
> +} ThreadData;
> +
>  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
>  {
>      uint32_t val;
> @@ -64,21 +71,90 @@ static av_cold int decode_init(AVCodecContext *avctx)
>      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
>      avctx->bits_per_raw_sample = 10;
>  
> +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
>      s->aligned_input = 0;
>      ff_v210dec_init(s);
>  
>      return 0;
>  }
>  
> +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> +{
> +    V210DecContext *s = avctx->priv_data;
> +    int h, w;
> +    ThreadData *td = arg;
> +    AVFrame *frame = td->frame;
> +    int stride = td->stride;
> +    int slice_h = avctx->height / s->thread_count;
> +    int slice_m = avctx->height % s->thread_count;
> +    int slice_start;
> +    int slice_end;
> +    uint8_t *psrc;
> +    uint16_t *y, *u, *v;
> +
> +    slice_start = jobnr * slice_h;
> +    slice_start += FFMIN(jobnr, slice_m);
> +    slice_end = slice_start + slice_h;
> +    if (jobnr < slice_m)
> +        slice_end ++;

I suggest using code similar to what the filters use, for example yadif:
    int slice_start = (height *  jobnr   ) / s->thread_count;
    int slice_end   = (height * (jobnr+1)) / s->thread_count;
    
This is simpler.

[...]
lance.lmwang@gmail.com Oct. 18, 2019, 1:05 a.m.
On Thu, Oct 17, 2019 at 06:43:33PM +0200, Michael Niedermayer wrote:
> On Tue, Oct 15, 2019 at 06:37:10PM +0800, lance.lmwang@gmail.com wrote:
> > From: Limin Wang <lance.lmwang@gmail.com>
> > 
> > Threading is to avoid a core cpu being occupied fully with other filters like scale,
> > regarding performance, if your cpu frequency is very high, the gain is very small, but
> > with more cores and fewer cpu MHz cpus, you will get more improvements.
> > 
> > The following is my testing results of performance on two different system:
> > 1, testing result with my old mac pro
> > ./ffmpeg -y -i ./4k_4096_3072.mov -c:v v210 -f rawvideo -frames 10 ./1.v210
> > 
> > ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ./1.v210 -benchmark -f null -
> > frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x
> > 
> > patch applied:
> > ./ffmpeg -threads 4 -thread_type frame+slice  -s 4096x3072 -stream_loop 100 -i ./1.v210 -benchmark -f null -
> > frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x
> > 
> > 2, testing result with x86 server (Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz):
> > ./ffmpeg  -y -i ./4k_3840_2160.ts -c:v v210 -f rawvideo -frames 50 ./2.v210
> > 
> > ./ffmpeg -threads 1 -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
> > frame= 1050 fps= 80 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=3.19x
> > 
> > patch applied:
> > ./ffmpeg -threads 2 -thread_type frame+slice  -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
> > frame= 1050 fps=111 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=4.45x
> > 
> > ./ffmpeg -threads 4 -thread_type frame+slice  -s 3840x2160  -stream_loop 20 -i ./2.v210 -benchmark -f null -
> > frame= 1050 fps=145 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=5.81x
> > 
> > Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> > ---
> >  libavcodec/v210dec.c | 135 ++++++++++++++++++++++++++++---------------
> >  libavcodec/v210dec.h |   1 +
> >  2 files changed, 88 insertions(+), 48 deletions(-)
> > 
> > diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
> > index 5a33d8c089..91c2fe0d07 100644
> > --- a/libavcodec/v210dec.c
> > +++ b/libavcodec/v210dec.c
> > @@ -28,6 +28,7 @@
> >  #include "libavutil/internal.h"
> >  #include "libavutil/mem.h"
> >  #include "libavutil/intreadwrite.h"
> > +#include "thread.h"
> >  
> >  #define READ_PIXELS(a, b, c)         \
> >      do {                             \
> > @@ -37,6 +38,12 @@
> >          *c++ = (val >> 20) & 0x3FF;  \
> >      } while (0)
> >  
> > +typedef struct ThreadData {
> > +    AVFrame *frame;
> > +    uint8_t *buf;
> > +    int stride;
> > +} ThreadData;
> > +
> >  static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
> >  {
> >      uint32_t val;
> > @@ -64,21 +71,90 @@ static av_cold int decode_init(AVCodecContext *avctx)
> >      avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
> >      avctx->bits_per_raw_sample = 10;
> >  
> > +    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
> >      s->aligned_input = 0;
> >      ff_v210dec_init(s);
> >  
> >      return 0;
> >  }
> >  
> > +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> > +{
> > +    V210DecContext *s = avctx->priv_data;
> > +    int h, w;
> > +    ThreadData *td = arg;
> > +    AVFrame *frame = td->frame;
> > +    int stride = td->stride;
> > +    int slice_h = avctx->height / s->thread_count;
> > +    int slice_m = avctx->height % s->thread_count;
> > +    int slice_start;
> > +    int slice_end;
> > +    uint8_t *psrc;
> > +    uint16_t *y, *u, *v;
> > +
> > +    slice_start = jobnr * slice_h;
> > +    slice_start += FFMIN(jobnr, slice_m);
> > +    slice_end = slice_start + slice_h;
> > +    if (jobnr < slice_m)
> > +        slice_end ++;
> 
> I suggest to use code similar to what filters use, for example yadif
>     int slice_start = (height *  jobnr   ) / s->thread_count;
>     int slice_end   = (height * (jobnr+1)) / s->thread_count;
>     
> This is simpler
Yes, that is simpler. I'll update the patch to use it.

> 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
> 
> What's the most stupid thing your enemy could do? Blow himself up.
> What's the most stupid thing you could do? Give up your rights and
> freedom because your enemy blew himself up.
> 



> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Patch hide | download patch | download mbox

diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c
index 5a33d8c089..91c2fe0d07 100644
--- a/libavcodec/v210dec.c
+++ b/libavcodec/v210dec.c
@@ -28,6 +28,7 @@ 
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/intreadwrite.h"
+#include "thread.h"
 
 #define READ_PIXELS(a, b, c)         \
     do {                             \
@@ -37,6 +38,12 @@ 
         *c++ = (val >> 20) & 0x3FF;  \
     } while (0)
 
+typedef struct ThreadData {
+    AVFrame *frame;
+    uint8_t *buf;
+    int stride;
+} ThreadData;
+
 static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
 {
     uint32_t val;
@@ -64,21 +71,90 @@  static av_cold int decode_init(AVCodecContext *avctx)
     avctx->pix_fmt             = AV_PIX_FMT_YUV422P10;
     avctx->bits_per_raw_sample = 10;
 
+    s->thread_count  = av_clip(avctx->thread_count, 1, avctx->height/4);
     s->aligned_input = 0;
     ff_v210dec_init(s);
 
     return 0;
 }
 
+static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
+{
+    V210DecContext *s = avctx->priv_data;
+    int h, w;
+    ThreadData *td = arg;
+    AVFrame *frame = td->frame;
+    int stride = td->stride;
+    int slice_h = avctx->height / s->thread_count;
+    int slice_m = avctx->height % s->thread_count;
+    int slice_start;
+    int slice_end;
+    uint8_t *psrc;
+    uint16_t *y, *u, *v;
+
+    slice_start = jobnr * slice_h;
+    slice_start += FFMIN(jobnr, slice_m);
+    slice_end = slice_start + slice_h;
+    if (jobnr < slice_m)
+        slice_end ++;
+
+    psrc = td->buf + stride * slice_start;
+    y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2;
+    u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2;
+    v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2;
+    for (h = slice_start; h < slice_end; h++) {
+        const uint32_t *src = (const uint32_t*)psrc;
+        uint32_t val;
+
+        w = (avctx->width / 12) * 12;
+        s->unpack_frame(src, y, u, v, w);
+
+        y += w;
+        u += w >> 1;
+        v += w >> 1;
+        src += (w << 1) / 3;
+
+        if (w < avctx->width - 5) {
+            READ_PIXELS(u, y, v);
+            READ_PIXELS(y, u, y);
+            READ_PIXELS(v, y, u);
+            READ_PIXELS(y, v, y);
+            w += 6;
+        }
+
+        if (w < avctx->width - 1) {
+            READ_PIXELS(u, y, v);
+
+            val  = av_le2ne32(*src++);
+            *y++ =  val & 0x3FF;
+            if (w < avctx->width - 3) {
+                *u++ = (val >> 10) & 0x3FF;
+                *y++ = (val >> 20) & 0x3FF;
+
+                val  = av_le2ne32(*src++);
+                *v++ =  val & 0x3FF;
+                *y++ = (val >> 10) & 0x3FF;
+            }
+        }
+
+        psrc += stride;
+        y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1);
+        u += frame->linesize[1] / 2 - avctx->width / 2;
+        v += frame->linesize[2] / 2 - avctx->width / 2;
+    }
+
+    return 0;
+}
+
 static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                         AVPacket *avpkt)
 {
     V210DecContext *s = avctx->priv_data;
-
-    int h, w, ret, stride, aligned_input;
+    ThreadData td;
+    int ret, stride, aligned_input;
+    ThreadFrame frame = { .f = data };
     AVFrame *pic = data;
     const uint8_t *psrc = avpkt->data;
-    uint16_t *y, *u, *v;
 
     if (s->custom_stride )
         stride = s->custom_stride;
@@ -86,6 +162,7 @@  static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         int aligned_width = ((avctx->width + 47) / 48) * 48;
         stride = aligned_width * 8 / 3;
     }
+    td.stride = stride;
 
     if (avpkt->size < stride * avctx->height) {
         if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) {
@@ -110,55 +187,15 @@  static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
         ff_v210dec_init(s);
     }
 
-    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
         return ret;
 
-    y = (uint16_t*)pic->data[0];
-    u = (uint16_t*)pic->data[1];
-    v = (uint16_t*)pic->data[2];
     pic->pict_type = AV_PICTURE_TYPE_I;
     pic->key_frame = 1;
 
-    for (h = 0; h < avctx->height; h++) {
-        const uint32_t *src = (const uint32_t*)psrc;
-        uint32_t val;
-
-        w = (avctx->width / 12) * 12;
-        s->unpack_frame(src, y, u, v, w);
-
-        y += w;
-        u += w >> 1;
-        v += w >> 1;
-        src += (w << 1) / 3;
-
-        if (w < avctx->width - 5) {
-            READ_PIXELS(u, y, v);
-            READ_PIXELS(y, u, y);
-            READ_PIXELS(v, y, u);
-            READ_PIXELS(y, v, y);
-            w += 6;
-        }
-
-        if (w < avctx->width - 1) {
-            READ_PIXELS(u, y, v);
-
-            val  = av_le2ne32(*src++);
-            *y++ =  val & 0x3FF;
-            if (w < avctx->width - 3) {
-                *u++ = (val >> 10) & 0x3FF;
-                *y++ = (val >> 20) & 0x3FF;
-
-                val  = av_le2ne32(*src++);
-                *v++ =  val & 0x3FF;
-                *y++ = (val >> 10) & 0x3FF;
-            }
-        }
-
-        psrc += stride;
-        y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1);
-        u += pic->linesize[1] / 2 - avctx->width / 2;
-        v += pic->linesize[2] / 2 - avctx->width / 2;
-    }
+    td.buf = (uint8_t*)psrc;
+    td.frame = pic;
+    avctx->execute2(avctx, v210_decode_slice, &td, NULL, s->thread_count);
 
     if (avctx->field_order > AV_FIELD_PROGRESSIVE) {
         /* we have interlaced material flagged in container */
@@ -194,6 +231,8 @@  AVCodec ff_v210_decoder = {
     .priv_data_size = sizeof(V210DecContext),
     .init           = decode_init,
     .decode         = decode_frame,
-    .capabilities   = AV_CODEC_CAP_DR1,
+    .capabilities   = AV_CODEC_CAP_DR1 |
+                      AV_CODEC_CAP_SLICE_THREADS |
+                      AV_CODEC_CAP_FRAME_THREADS,
     .priv_class     = &v210dec_class,
 };
diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h
index cfdb29da09..662e266315 100644
--- a/libavcodec/v210dec.h
+++ b/libavcodec/v210dec.h
@@ -27,6 +27,7 @@  typedef struct {
     AVClass *av_class;
     int custom_stride;
     int aligned_input;
+    int thread_count;
     int stride_warning_shown;
     void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 } V210DecContext;