diff mbox

[FFmpeg-devel,v1] avcodec/v410dec: add the frame and slice threading support

Message ID 20191025153644.4904-1-lance.lmwang@gmail.com
State Accepted
Commit f0dbeb5eaa41fa508560ddaee51efa875a602bfc
Headers show

Commit Message

Lance Wang Oct. 25, 2019, 3:36 p.m. UTC
From: Limin Wang <lance.lmwang@gmail.com>

1. Test server configuration:
[root@localhost ~]# cat /proc/cpuinfo  |grep "model name"
model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
...

[root@localhost ~]# free -h
              total        used        free      shared  buff/cache   available
Mem:           102G        997M         93G         16M        7.6G        100G

2. Performance profiling:
master:
./ffmpeg -y -stream_loop 100 -i ./test.avi -benchmark -f null -
video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=16.932s stime=9.417s rtime=26.341s
bench: maxrss=271056kB
frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=8.22x

patch applied:
./ffmpeg -y -threads 2 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=11.1x
video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=23.164s stime=10.983s rtime=19.503s
bench: maxrss=338252kB

./ffmpeg -y -threads 4 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
frame= 1010 fps= 71 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=15.3x
video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
bench: utime=21.610s stime=11.603s rtime=14.160s
bench: maxrss=517060kB


Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
---
 libavcodec/v410dec.c | 72 +++++++++++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 21 deletions(-)

Comments

Carl Eugen Hoyos Oct. 25, 2019, 4:39 p.m. UTC | #1
Am Fr., 25. Okt. 2019 um 17:37 Uhr schrieb <lance.lmwang@gmail.com>:

> -    .capabilities = AV_CODEC_CAP_DR1,
> +    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
> +                    AV_CODEC_CAP_FRAME_THREADS

In your tests: Was slice threading or frame threading more effective?

Carl Eugen
Lance Wang Oct. 26, 2019, 2:07 a.m. UTC | #2
On Fri, Oct 25, 2019 at 06:39:46PM +0200, Carl Eugen Hoyos wrote:
> Am Fr., 25. Okt. 2019 um 17:37 Uhr schrieb <lance.lmwang@gmail.com>:
> 
> > -    .capabilities = AV_CODEC_CAP_DR1,
> > +    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
> > +                    AV_CODEC_CAP_FRAME_THREADS

It depends very much on the CPU and memory; I recall that on one of my Linux
systems, slice threading is faster. It uses the same approach as v210dec,
although that patch is still pending review.

Below are the test results on my old Mac Pro system with 8G of memory:
./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v410 -an -frames:v 10 ~/Movies/1.avi

./ffmpeg -y -stream_loop 100 -i ~/Movies/1.avi  -benchmark -f null -
frame= 1010 fps= 27 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.11x

./ffmpeg -y -threads 4 -thread_type slice -stream_loop 100 -i ~/Movies/1.avi  -benchmark -f null -
frame= 1010 fps= 32 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.34x

./ffmpeg -y -threads 4 -thread_type frame -stream_loop 100 -i ~/Movies/1.avi  -benchmark -f null -
frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.58x

./ffmpeg -y -threads 4 -thread_type frame+slice -stream_loop 100 -i ~/Movies/1.avi  -benchmark -f null -
frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.59x


> 
> In your tests: Was slice threading or frame threading more effective?
> 
> Carl Eugen
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Lance Wang Nov. 22, 2019, 4:09 p.m. UTC | #3
Ping. Also pinging the v210dec thread support patch, which has been reviewed by Michael.
https://patchwork.ffmpeg.org/patch/15836/

If no developer is interested in the module, I'd be glad to maintain it;
I think that's better than nobody. Please give feedback.


On Fri, Oct 25, 2019 at 11:36:44PM +0800, lance.lmwang@gmail.com wrote:
> From: Limin Wang <lance.lmwang@gmail.com>
> 
> 1, Test server configure:
> [root@localhost ~]# cat /proc/cpuinfo  |grep "model name"
> model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
> model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
> ...
> 
> [root@localhost ~]# free -h
>               total        used        free      shared  buff/cache   available
> Mem:           102G        997M         93G         16M        7.6G        100G
> 
> 2, performance profiling
> master:
> ./ffmpeg -y -stream_loop 100 -i ./test.avi -benchmark -f null -
> video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> bench: utime=16.932s stime=9.417s rtime=26.341s
> bench: maxrss=271056kB
> frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=8.22x
> 
> patch applied:
> ./ffmpeg -y -threads 2 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
> frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=11.1x
> video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> bench: utime=23.164s stime=10.983s rtime=19.503s
> bench: maxrss=338252kB
> 
> ./ffmpeg -y -threads 4 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
> frame= 1010 fps= 71 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=15.3x
> video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> bench: utime=21.610s stime=11.603s rtime=14.160s
> bench: maxrss=517060kB
> 
> 
> Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> ---
>  libavcodec/v410dec.c | 72 +++++++++++++++++++++++++++++++-------------
>  1 file changed, 51 insertions(+), 21 deletions(-)
> 
> diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c
> index 48fab68273..7ad5eb8fb5 100644
> --- a/libavcodec/v410dec.c
> +++ b/libavcodec/v410dec.c
> @@ -24,6 +24,13 @@
>  #include "libavutil/intreadwrite.h"
>  #include "avcodec.h"
>  #include "internal.h"
> +#include "thread.h"
> +
> +typedef struct ThreadData {
> +    AVFrame *frame;
> +    uint8_t *buf;
> +    int stride;
> +} ThreadData;
>  
>  static av_cold int v410_decode_init(AVCodecContext *avctx)
>  {
> @@ -42,31 +49,24 @@ static av_cold int v410_decode_init(AVCodecContext *avctx)
>      return 0;
>  }
>  
> -static int v410_decode_frame(AVCodecContext *avctx, void *data,
> -                             int *got_frame, AVPacket *avpkt)
> +static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
>  {
> -    AVFrame *pic = data;
> -    uint8_t *src = avpkt->data;
> +    ThreadData *td = arg;
> +    AVFrame *pic = td->frame;
> +    int stride = td->stride;
> +    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
> +    int slice_start = (avctx->height *  jobnr) / thread_count;
> +    int slice_end = (avctx->height * (jobnr+1)) / thread_count;
> +    const uint8_t *src = td->buf + stride * slice_start;
>      uint16_t *y, *u, *v;
>      uint32_t val;
> -    int i, j, ret;
> -
> -    if (avpkt->size < 4 * avctx->height * avctx->width) {
> -        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
> -        return AVERROR(EINVAL);
> -    }
> -
> -    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> -        return ret;
> +    int i, j;
>  
> -    pic->key_frame = 1;
> -    pic->pict_type = AV_PICTURE_TYPE_I;
> +    y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1);
> +    u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1);
> +    v = (uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1);
>  
> -    y = (uint16_t *)pic->data[0];
> -    u = (uint16_t *)pic->data[1];
> -    v = (uint16_t *)pic->data[2];
> -
> -    for (i = 0; i < avctx->height; i++) {
> +    for (i = slice_start; i < slice_end; i++) {
>          for (j = 0; j < avctx->width; j++) {
>              val = AV_RL32(src);
>  
> @@ -82,6 +82,35 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data,
>          v += pic->linesize[2] >> 1;
>      }
>  
> +    return 0;
> +}
> +
> +static int v410_decode_frame(AVCodecContext *avctx, void *data,
> +                             int *got_frame, AVPacket *avpkt)
> +{
> +    ThreadData td;
> +    ThreadFrame frame = { .f = data };
> +    AVFrame *pic = data;
> +    uint8_t *src = avpkt->data;
> +    int ret;
> +    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
> +
> +    td.stride = avctx->width * 4;
> +    if (avpkt->size < 4 * avctx->height * avctx->width) {
> +        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
> +        return ret;
> +
> +    pic->key_frame = 1;
> +    pic->pict_type = AV_PICTURE_TYPE_I;
> +
> +    td.buf = src;
> +    td.frame = pic;
> +    avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count);
> +
>      *got_frame = 1;
>  
>      return avpkt->size;
> @@ -94,5 +123,6 @@ AVCodec ff_v410_decoder = {
>      .id           = AV_CODEC_ID_V410,
>      .init         = v410_decode_init,
>      .decode       = v410_decode_frame,
> -    .capabilities = AV_CODEC_CAP_DR1,
> +    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
> +                    AV_CODEC_CAP_FRAME_THREADS
>  };
> -- 
> 2.21.0
>
Michael Niedermayer Nov. 24, 2019, 3 p.m. UTC | #4
On Sat, Nov 23, 2019 at 12:09:09AM +0800, Limin Wang wrote:
> 
> ping, also ping with v210dec thread support which its reviewed by Michael.
> https://patchwork.ffmpeg.org/patch/15836/
> 
> If no developer is interested in the module, I'm glad to maintain it,
> I think it's better than nobody. Please feedback.

will apply the v410 patch after correcting an English grammar typo

thx

[...]
James Almer Nov. 24, 2019, 4:05 p.m. UTC | #5
On 10/25/2019 12:36 PM, lance.lmwang@gmail.com wrote:
> From: Limin Wang <lance.lmwang@gmail.com>
> 
> 1, Test server configure:
> [root@localhost ~]# cat /proc/cpuinfo  |grep "model name"
> model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
> model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
> ...
> 
> [root@localhost ~]# free -h
>               total        used        free      shared  buff/cache   available
> Mem:           102G        997M         93G         16M        7.6G        100G
> 
> 2, performance profiling
> master:
> ./ffmpeg -y -stream_loop 100 -i ./test.avi -benchmark -f null -
> video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> bench: utime=16.932s stime=9.417s rtime=26.341s
> bench: maxrss=271056kB
> frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=8.22x
> 
> patch applied:
> ./ffmpeg -y -threads 2 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
> frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=11.1x
> video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> bench: utime=23.164s stime=10.983s rtime=19.503s
> bench: maxrss=338252kB
> 
> ./ffmpeg -y -threads 4 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
> frame= 1010 fps= 71 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=15.3x
> video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> bench: utime=21.610s stime=11.603s rtime=14.160s
> bench: maxrss=517060kB

Can you try with slice threading? -thread_type slice+frame will default
to frame, since afaik both can't run at the same time.

> 
> 
> Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> ---
>  libavcodec/v410dec.c | 72 +++++++++++++++++++++++++++++++-------------
>  1 file changed, 51 insertions(+), 21 deletions(-)
> 
> diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c
> index 48fab68273..7ad5eb8fb5 100644
> --- a/libavcodec/v410dec.c
> +++ b/libavcodec/v410dec.c
> @@ -24,6 +24,13 @@
>  #include "libavutil/intreadwrite.h"
>  #include "avcodec.h"
>  #include "internal.h"
> +#include "thread.h"
> +
> +typedef struct ThreadData {
> +    AVFrame *frame;
> +    uint8_t *buf;
> +    int stride;
> +} ThreadData;
>  
>  static av_cold int v410_decode_init(AVCodecContext *avctx)
>  {
> @@ -42,31 +49,24 @@ static av_cold int v410_decode_init(AVCodecContext *avctx)
>      return 0;
>  }
>  
> -static int v410_decode_frame(AVCodecContext *avctx, void *data,
> -                             int *got_frame, AVPacket *avpkt)
> +static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
>  {
> -    AVFrame *pic = data;
> -    uint8_t *src = avpkt->data;
> +    ThreadData *td = arg;
> +    AVFrame *pic = td->frame;
> +    int stride = td->stride;
> +    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
> +    int slice_start = (avctx->height *  jobnr) / thread_count;
> +    int slice_end = (avctx->height * (jobnr+1)) / thread_count;
> +    const uint8_t *src = td->buf + stride * slice_start;
>      uint16_t *y, *u, *v;
>      uint32_t val;
> -    int i, j, ret;
> -
> -    if (avpkt->size < 4 * avctx->height * avctx->width) {
> -        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
> -        return AVERROR(EINVAL);
> -    }
> -
> -    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> -        return ret;
> +    int i, j;
>  
> -    pic->key_frame = 1;
> -    pic->pict_type = AV_PICTURE_TYPE_I;
> +    y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1);
> +    u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1);
> +    v = (uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1);
>  
> -    y = (uint16_t *)pic->data[0];
> -    u = (uint16_t *)pic->data[1];
> -    v = (uint16_t *)pic->data[2];
> -
> -    for (i = 0; i < avctx->height; i++) {
> +    for (i = slice_start; i < slice_end; i++) {
>          for (j = 0; j < avctx->width; j++) {
>              val = AV_RL32(src);
>  
> @@ -82,6 +82,35 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data,
>          v += pic->linesize[2] >> 1;
>      }
>  
> +    return 0;
> +}
> +
> +static int v410_decode_frame(AVCodecContext *avctx, void *data,
> +                             int *got_frame, AVPacket *avpkt)
> +{
> +    ThreadData td;
> +    ThreadFrame frame = { .f = data };
> +    AVFrame *pic = data;
> +    uint8_t *src = avpkt->data;
> +    int ret;
> +    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
> +
> +    td.stride = avctx->width * 4;
> +    if (avpkt->size < 4 * avctx->height * avctx->width) {
> +        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
> +        return AVERROR(EINVAL);
> +    }
> +
> +    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
> +        return ret;
> +
> +    pic->key_frame = 1;
> +    pic->pict_type = AV_PICTURE_TYPE_I;
> +
> +    td.buf = src;
> +    td.frame = pic;
> +    avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count);
> +
>      *got_frame = 1;
>  
>      return avpkt->size;
> @@ -94,5 +123,6 @@ AVCodec ff_v410_decoder = {
>      .id           = AV_CODEC_ID_V410,
>      .init         = v410_decode_init,
>      .decode       = v410_decode_frame,
> -    .capabilities = AV_CODEC_CAP_DR1,
> +    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
> +                    AV_CODEC_CAP_FRAME_THREADS
>  };
>
Michael Niedermayer Nov. 24, 2019, 7:54 p.m. UTC | #6
On Sun, Nov 24, 2019 at 04:00:07PM +0100, Michael Niedermayer wrote:
> On Sat, Nov 23, 2019 at 12:09:09AM +0800, Limin Wang wrote:
> > 
> > ping, also ping with v210dec thread support which its reviewed by Michael.
> > https://patchwork.ffmpeg.org/patch/15836/
> > 
> > If no developer is interested in the module, I'm glad to maintain it,
> > I think it's better than nobody. Please feedback.
> 
> will apply the v410 patch after correcting a english grammer typo

As James asked for more benchmarks, I'll wait with applying so the
commit can contain benchmarks for both cases.

thx

[...]
James Almer Nov. 24, 2019, 8:02 p.m. UTC | #7
On 11/24/2019 4:54 PM, Michael Niedermayer wrote:
> On Sun, Nov 24, 2019 at 04:00:07PM +0100, Michael Niedermayer wrote:
>> On Sat, Nov 23, 2019 at 12:09:09AM +0800, Limin Wang wrote:
>>>
>>> ping, also ping with v210dec thread support which its reviewed by Michael.
>>> https://patchwork.ffmpeg.org/patch/15836/
>>>
>>> If no developer is interested in the module, I'm glad to maintain it,
>>> I think it's better than nobody. Please feedback.
>>
>> will apply the v410 patch after correcting a english grammer typo
> 
> as james asked for more benchmarks, ill wait with applying so the
> commit can contain benchmarks for both cases
> 
> thx

Actually, I wrote that reply too soon. I missed the fact that Carl asked the
same thing and the benchmarks were provided.
Lance Wang Nov. 25, 2019, 1:01 a.m. UTC | #8
On Sun, Nov 24, 2019 at 01:05:53PM -0300, James Almer wrote:
> On 10/25/2019 12:36 PM, lance.lmwang@gmail.com wrote:
> > From: Limin Wang <lance.lmwang@gmail.com>
> > 
> > 1, Test server configure:
> > [root@localhost ~]# cat /proc/cpuinfo  |grep "model name"
> > model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
> > model name	: Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
> > ...
> > 
> > [root@localhost ~]# free -h
> >               total        used        free      shared  buff/cache   available
> > Mem:           102G        997M         93G         16M        7.6G        100G
> > 
> > 2, performance profiling
> > master:
> > ./ffmpeg -y -stream_loop 100 -i ./test.avi -benchmark -f null -
> > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> > bench: utime=16.932s stime=9.417s rtime=26.341s
> > bench: maxrss=271056kB
> > frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=8.22x
> > 
> > patch applied:
> > ./ffmpeg -y -threads 2 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
> > frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=11.1x
> > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> > bench: utime=23.164s stime=10.983s rtime=19.503s
> > bench: maxrss=338252kB
> > 
> > ./ffmpeg -y -threads 4 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null -
> > frame= 1010 fps= 71 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=15.3x
> > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
> > bench: utime=21.610s stime=11.603s rtime=14.160s
> > bench: maxrss=517060kB
> 
> Can you try with slice threading? -thread_type slice+frame will default
> to frame, since afaik both can't run at the same time.
No problem, I'll update with slice threading results for both v410 and v210. So
that's why I couldn't get the expected result when trying frame+slice; I
expected better performance.

> 
> > 
> > 
> > Signed-off-by: Limin Wang <lance.lmwang@gmail.com>
> > ---
> >  libavcodec/v410dec.c | 72 +++++++++++++++++++++++++++++++-------------
> >  1 file changed, 51 insertions(+), 21 deletions(-)
> > 
> > diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c
> > index 48fab68273..7ad5eb8fb5 100644
> > --- a/libavcodec/v410dec.c
> > +++ b/libavcodec/v410dec.c
> > @@ -24,6 +24,13 @@
> >  #include "libavutil/intreadwrite.h"
> >  #include "avcodec.h"
> >  #include "internal.h"
> > +#include "thread.h"
> > +
> > +typedef struct ThreadData {
> > +    AVFrame *frame;
> > +    uint8_t *buf;
> > +    int stride;
> > +} ThreadData;
> >  
> >  static av_cold int v410_decode_init(AVCodecContext *avctx)
> >  {
> > @@ -42,31 +49,24 @@ static av_cold int v410_decode_init(AVCodecContext *avctx)
> >      return 0;
> >  }
> >  
> > -static int v410_decode_frame(AVCodecContext *avctx, void *data,
> > -                             int *got_frame, AVPacket *avpkt)
> > +static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
> >  {
> > -    AVFrame *pic = data;
> > -    uint8_t *src = avpkt->data;
> > +    ThreadData *td = arg;
> > +    AVFrame *pic = td->frame;
> > +    int stride = td->stride;
> > +    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
> > +    int slice_start = (avctx->height *  jobnr) / thread_count;
> > +    int slice_end = (avctx->height * (jobnr+1)) / thread_count;
> > +    const uint8_t *src = td->buf + stride * slice_start;
> >      uint16_t *y, *u, *v;
> >      uint32_t val;
> > -    int i, j, ret;
> > -
> > -    if (avpkt->size < 4 * avctx->height * avctx->width) {
> > -        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
> > -        return AVERROR(EINVAL);
> > -    }
> > -
> > -    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
> > -        return ret;
> > +    int i, j;
> >  
> > -    pic->key_frame = 1;
> > -    pic->pict_type = AV_PICTURE_TYPE_I;
> > +    y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1);
> > +    u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1);
> > +    v = (uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1);
> >  
> > -    y = (uint16_t *)pic->data[0];
> > -    u = (uint16_t *)pic->data[1];
> > -    v = (uint16_t *)pic->data[2];
> > -
> > -    for (i = 0; i < avctx->height; i++) {
> > +    for (i = slice_start; i < slice_end; i++) {
> >          for (j = 0; j < avctx->width; j++) {
> >              val = AV_RL32(src);
> >  
> > @@ -82,6 +82,35 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data,
> >          v += pic->linesize[2] >> 1;
> >      }
> >  
> > +    return 0;
> > +}
> > +
> > +static int v410_decode_frame(AVCodecContext *avctx, void *data,
> > +                             int *got_frame, AVPacket *avpkt)
> > +{
> > +    ThreadData td;
> > +    ThreadFrame frame = { .f = data };
> > +    AVFrame *pic = data;
> > +    uint8_t *src = avpkt->data;
> > +    int ret;
> > +    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
> > +
> > +    td.stride = avctx->width * 4;
> > +    if (avpkt->size < 4 * avctx->height * avctx->width) {
> > +        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
> > +        return AVERROR(EINVAL);
> > +    }
> > +
> > +    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
> > +        return ret;
> > +
> > +    pic->key_frame = 1;
> > +    pic->pict_type = AV_PICTURE_TYPE_I;
> > +
> > +    td.buf = src;
> > +    td.frame = pic;
> > +    avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count);
> > +
> >      *got_frame = 1;
> >  
> >      return avpkt->size;
> > @@ -94,5 +123,6 @@ AVCodec ff_v410_decoder = {
> >      .id           = AV_CODEC_ID_V410,
> >      .init         = v410_decode_init,
> >      .decode       = v410_decode_frame,
> > -    .capabilities = AV_CODEC_CAP_DR1,
> > +    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
> > +                    AV_CODEC_CAP_FRAME_THREADS
> >  };
> > 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Lance Wang Nov. 25, 2019, 3:50 a.m. UTC | #9
On Sun, Nov 24, 2019 at 05:02:55PM -0300, James Almer wrote:
> On 11/24/2019 4:54 PM, Michael Niedermayer wrote:
> > On Sun, Nov 24, 2019 at 04:00:07PM +0100, Michael Niedermayer wrote:
> >> On Sat, Nov 23, 2019 at 12:09:09AM +0800, Limin Wang wrote:
> >>>
> >>> ping, also ping with v210dec thread support which its reviewed by Michael.
> >>> https://patchwork.ffmpeg.org/patch/15836/
> >>>
> >>> If no developer is interested in the module, I'm glad to maintain it,
> >>> I think it's better than nobody. Please feedback.
> >>
> >> will apply the v410 patch after correcting a english grammer typo
> > 
> > as james asked for more benchmarks, ill wait with applying so the
> > commit can contain benchmarks for both cases

I have updated the benchmark results for both v410 and v210. The v210 patch is
being sent out in its own thread.



> > 
> > thx
> 
> Actually, i wrote that reply too soon. I missed the fact Carl asked the
> same thing and the benchmarks were provided.
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox

Patch

diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c
index 48fab68273..7ad5eb8fb5 100644
--- a/libavcodec/v410dec.c
+++ b/libavcodec/v410dec.c
@@ -24,6 +24,13 @@ 
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "internal.h"
+#include "thread.h"
+
+typedef struct ThreadData {
+    AVFrame *frame;
+    uint8_t *buf;
+    int stride;
+} ThreadData;
 
 static av_cold int v410_decode_init(AVCodecContext *avctx)
 {
@@ -42,31 +49,24 @@  static av_cold int v410_decode_init(AVCodecContext *avctx)
     return 0;
 }
 
-static int v410_decode_frame(AVCodecContext *avctx, void *data,
-                             int *got_frame, AVPacket *avpkt)
+static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
 {
-    AVFrame *pic = data;
-    uint8_t *src = avpkt->data;
+    ThreadData *td = arg;
+    AVFrame *pic = td->frame;
+    int stride = td->stride;
+    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
+    int slice_start = (avctx->height *  jobnr) / thread_count;
+    int slice_end = (avctx->height * (jobnr+1)) / thread_count;
+    const uint8_t *src = td->buf + stride * slice_start;
     uint16_t *y, *u, *v;
     uint32_t val;
-    int i, j, ret;
-
-    if (avpkt->size < 4 * avctx->height * avctx->width) {
-        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
-        return AVERROR(EINVAL);
-    }
-
-    if ((ret = ff_get_buffer(avctx, pic, 0)) < 0)
-        return ret;
+    int i, j;
 
-    pic->key_frame = 1;
-    pic->pict_type = AV_PICTURE_TYPE_I;
+    y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1);
+    u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1);
+    v = (uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1);
 
-    y = (uint16_t *)pic->data[0];
-    u = (uint16_t *)pic->data[1];
-    v = (uint16_t *)pic->data[2];
-
-    for (i = 0; i < avctx->height; i++) {
+    for (i = slice_start; i < slice_end; i++) {
         for (j = 0; j < avctx->width; j++) {
             val = AV_RL32(src);
 
@@ -82,6 +82,35 @@  static int v410_decode_frame(AVCodecContext *avctx, void *data,
         v += pic->linesize[2] >> 1;
     }
 
+    return 0;
+}
+
+static int v410_decode_frame(AVCodecContext *avctx, void *data,
+                             int *got_frame, AVPacket *avpkt)
+{
+    ThreadData td;
+    ThreadFrame frame = { .f = data };
+    AVFrame *pic = data;
+    uint8_t *src = avpkt->data;
+    int ret;
+    int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4);
+
+    td.stride = avctx->width * 4;
+    if (avpkt->size < 4 * avctx->height * avctx->width) {
+        av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n");
+        return AVERROR(EINVAL);
+    }
+
+    if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0)
+        return ret;
+
+    pic->key_frame = 1;
+    pic->pict_type = AV_PICTURE_TYPE_I;
+
+    td.buf = src;
+    td.frame = pic;
+    avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count);
+
     *got_frame = 1;
 
     return avpkt->size;
@@ -94,5 +123,6 @@  AVCodec ff_v410_decoder = {
     .id           = AV_CODEC_ID_V410,
     .init         = v410_decode_init,
     .decode       = v410_decode_frame,
-    .capabilities = AV_CODEC_CAP_DR1,
+    .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS |
+                    AV_CODEC_CAP_FRAME_THREADS
 };