[FFmpeg-devel,v1] avcodec/v410dec: add the frame and slice threading support

Submitted by lance.lmwang@gmail.com on Oct. 25, 2019, 3:36 p.m.

Details

Message ID 20191025153644.4904-1-lance.lmwang@gmail.com
State New
Headers show

Commit Message

lance.lmwang@gmail.com Oct. 25, 2019, 3:36 p.m.
From: Limin Wang <lance.lmwang@gmail.com>

1. Test server configuration:
[root@localhost ~]# cat /proc/cpuinfo  |grep "model name"
model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
...

[root@localhost ~]# free -h
              total        used        free      shared  buff/cache   available
Mem:           102G        997M         93G         16M        7.6G        100G

2. Performance profiling:
master:
./ffmpeg -y -stream_loop 100 -i ./test.avi -benchmark -f null -
video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=16.932s stime=9.417s rtime=26.341s
bench: maxrss=271056kB
frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=8.22x

patch applied:
./ffmpeg -y -threads 2 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=11.1x
video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=23.164s stime=10.983s rtime=19.503s
bench: maxrss=338252kB

./ffmpeg -y -threads 4 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
frame= 1010 fps= 71 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=15.3x
video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=21.610s stime=11.603s rtime=14.160s
bench: maxrss=517060kB


Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
---
 libavcodec/v410dec.c | 72 +++++++++++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 21 deletions(-)

Comments

Carl Eugen Hoyos Oct. 25, 2019, 4:39 p.m.
Am Fr., 25. Okt. 2019 um 17:37 Uhr schrieb <lance.lmwang@gmail.com>:

> -    .capabilities = AV_CODEC_CAP_DR1,
> +    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
> +                    AV_CODEC_CAP_FRAME_THREADS

In your tests: Was slice threading or frame threading more effective?

Carl Eugen
lance.lmwang@gmail.com Oct. 26, 2019, 2:07 a.m.
On Fri, Oct 25, 2019 at 06:39:46PM +0200, Carl Eugen Hoyos wrote:
> Am Fr., 25. Okt. 2019 um 17:37 Uhr schrieb <lance.lmwang@gmail.com>:
> 
> > -    .capabilities = AV_CODEC_CAP_DR1,
> > +    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
> > +                    AV_CODEC_CAP_FRAME_THREADS

It depends very much on the CPU and memory. I recall that on one of my Linux
systems slice threading was faster. This patch uses the same approach as
v210dec, although that patch is still pending review.

Below are the test results on my old Mac Pro system with 8 GB of memory:
./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v410 -an -frames:v 10 ~/Movies/1.avi

./ffmpeg -y -stream_loop 100 -i ~/Movies/1.avi  -benchmark -f null -
frame= 1010 fps= 27 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.11x

./ffmpeg -y -threads 4 -thread_type slice -stream_loop 100 -i ~/Movies/1.avi  -benchmark -f null -
frame= 1010 fps= 32 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.34x

./ffmpeg -y -threads 4 -thread_type frame -stream_loop 100 -i ~/Movies/1.avi  -benchmark -f null -
frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.58x

./ffmpeg -y -threads 4 -thread_type frame+slice -stream_loop 100 -i ~/Movies/1.avi  -benchmark -f null -
frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.59x


> 
> In your tests: Was slice threading or frame threading more effective?
> 
> Carl Eugen
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Patch hide | download patch | download mbox

diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c
index 48fab68273..7ad5eb8fb5 100644
--- a/libavcodec/v410dec.c
+++ b/libavcodec/v410dec.c
@@ -24,6 +24,13 @@ 
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
+#include "thread.h"
+
+typedef struct ThreadData {
+    AVFrame *frame;
+    uint8_t *buf;
+    int stride;
+} ThreadData;
 
 static av_cold int v410_decode_init(AVCodecContext *avctx)
 {
@@ -42,31 +49,24 @@  static av_cold int v410_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int v410_decode_frame(AVCodecContext *avctx, void *data,
-                             int *got_frame, AVPacket *avpkt)
+static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
 {
-    AVFrame *pic = data;
-    uint8_t *src = avpkt->data;
+    ThreadData *td = arg;
+    AVFrame *pic = td->frame;
+    int stride = td->stride;
+    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
+    int slice_start = (avctx->height *  jobnr) / thread_count;
+    int slice_end = (avctx->height * (jobnr+1)) / thread_count;
+    const uint8_t *src = td->buf + stride * slice_start;
     uint16_t *y, *u, *v;
     uint32_t val;
-    int i, j, ret;
-
-    if (avpkt->size < 4 * avctx->height * avctx->width) {
-        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
-        return AVERROR(EINVAL);
-    }
-
-    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
-        return ret;
+    int i, j;
 
-    pic->key_frame = 1;
-    pic->pict_type = AV_PICTURE_TYPE_I;
+    y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1);
+    u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1);
+    v = (uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1);
 
-    y = (uint16_t *)pic->data[0];
-    u = (uint16_t *)pic->data[1];
-    v = (uint16_t *)pic->data[2];
-
-    for (i = 0; i < avctx->height; i++) {
+    for (i = slice_start; i < slice_end; i++) {
         for (j = 0; j < avctx->width; j++) {
             val = AV_RL32(src);
 
@@ -82,6 +82,35 @@  static int v410_decode_frame(AVCodecContext *avctx, void *data,
         v += pic->linesize[2] >> 1;
     }
 
+    return 0;
+}
+
+static int v410_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    ThreadData td;
+    ThreadFrame frame = { .f = data };
+    AVFrame *pic = data;
+    uint8_t *src = avpkt->data;
+    int ret;
+    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
+
+    td.stride = avctx->width * 4;
+    if (avpkt->size < 4 * avctx->height * avctx->width) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    td.buf = src;
+    td.frame = pic;
+    avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count);
+
     *got_frame = 1;
 
     return avpkt->size;
@@ -94,5 +123,6 @@  AVCodec ff_v410_decoder = {
     .id           = AV_CODEC_ID_V410,
     .init         = v410_decode_init,
     .decode       = v410_decode_frame,
-    .capabilities = AV_CODEC_CAP_DR1,
+    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
+                    AV_CODEC_CAP_FRAME_THREADS
 };