Message ID | 20191025153644.4904-1-lance.lmwang@gmail.com |
---|---|
State | Accepted |
Commit | f0dbeb5eaa41fa508560ddaee51efa875a602bfc |
Headers | show |
Am Fr., 25. Okt. 2019 um 17:37 Uhr schrieb <lance.lmwang@gmail.com>: > - .capabilities = AV_CODEC_CAP_DR1, > + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS | > + AV_CODEC_CAP_FRAME_THREADS In your tests: Was slice threading or frame threading more effective? Carl Eugen
On Fri, Oct 25, 2019 at 06:39:46PM +0200, Carl Eugen Hoyos wrote: > Am Fr., 25. Okt. 2019 um 17:37 Uhr schrieb <lance.lmwang@gmail.com>: > > > - .capabilities = AV_CODEC_CAP_DR1, > > + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS | > > + AV_CODEC_CAP_FRAME_THREADS It's related to cpu and memory very much, I recall on one of my linux system, slice thread is faster. It's using the same process with v210dec, although the patch is pending for review yet. Below is testing result on my old mac pro system with 8G memory: ./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v410 -an -frames:v 10 ~/Movies/1.avi ./ffmpeg -y -stream_loop 100 -i ~/Movies/1.avi -benchmark -f null - frame= 1010 fps= 27 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.11x ./ffmpeg -y -threads 4 -thread_type slice -stream_loop 100 -i ~/Movies/1.avi -benchmark -f null - frame= 1010 fps= 32 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.34x ./ffmpeg -y -threads 4 -thread_type frame -stream_loop 100 -i ~/Movies/1.avi -benchmark -f null - frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.58x ./ffmpeg -y -threads 4 -thread_type frame+slice -stream_loop 100 -i ~/Movies/1.avi -benchmark -f null - frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:00:42.08 bitrate=N/A speed=1.59x > > In your tests: Was slice threading or frame threading more effective? > > Carl Eugen > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
ping, also ping with v210dec thread support which its reviewed by Michael. https://patchwork.ffmpeg.org/patch/15836/ If no developer is interested in the module, I'm glad to maintain it, I think it's better than nobody. Please feedback. On Fri, Oct 25, 2019 at 11:36:44PM +0800, lance.lmwang@gmail.com wrote: > From: Limin Wang <lance.lmwang@gmail.com> > > 1, Test server configure: > [root@localhost ~]# cat /proc/cpuinfo |grep "model name" > model name : Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz > model name : Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz > ... > > [root@localhost ~]# free -h > total used free shared buff/cache available > Mem: 102G 997M 93G 16M 7.6G 100G > > 2, performance profiling > master: > ./ffmpeg -y -stream_loop 100 -i ./test.avi -benchmark -f null - > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > bench: utime=16.932s stime=9.417s rtime=26.341s > bench: maxrss=271056kB > frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=8.22x > > patch applied: > ./ffmpeg -y -threads 2 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null - > frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=11.1x > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > bench: utime=23.164s stime=10.983s rtime=19.503s > bench: maxrss=338252kB > > ./ffmpeg -y -threads 4 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null - > frame= 1010 fps= 71 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=15.3x > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > bench: utime=21.610s stime=11.603s rtime=14.160s > bench: maxrss=517060kB > > > Signed-off-by: Limin Wang <lance.lmwang@gmail.com> > --- > libavcodec/v410dec.c | 72 +++++++++++++++++++++++++++++++------------- > 1 file changed, 51 insertions(+), 21 deletions(-) > > diff --git 
a/libavcodec/v410dec.c b/libavcodec/v410dec.c > index 48fab68273..7ad5eb8fb5 100644 > --- a/libavcodec/v410dec.c > +++ b/libavcodec/v410dec.c > @@ -24,6 +24,13 @@ > #include "libavutil/intreadwrite.h" > #include "avcodec.h" > #include "internal.h" > +#include "thread.h" > + > +typedef struct ThreadData { > + AVFrame *frame; > + uint8_t *buf; > + int stride; > +} ThreadData; > > static av_cold int v410_decode_init(AVCodecContext *avctx) > { > @@ -42,31 +49,24 @@ static av_cold int v410_decode_init(AVCodecContext *avctx) > return 0; > } > > -static int v410_decode_frame(AVCodecContext *avctx, void *data, > - int *got_frame, AVPacket *avpkt) > +static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) > { > - AVFrame *pic = data; > - uint8_t *src = avpkt->data; > + ThreadData *td = arg; > + AVFrame *pic = td->frame; > + int stride = td->stride; > + int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); > + int slice_start = (avctx->height * jobnr) / thread_count; > + int slice_end = (avctx->height * (jobnr+1)) / thread_count; > + const uint8_t *src = td->buf + stride * slice_start; > uint16_t *y, *u, *v; > uint32_t val; > - int i, j, ret; > - > - if (avpkt->size < 4 * avctx->height * avctx->width) { > - av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n"); > - return AVERROR(EINVAL); > - } > - > - if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) > - return ret; > + int i, j; > > - pic->key_frame = 1; > - pic->pict_type = AV_PICTURE_TYPE_I; > + y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1); > + u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1); > + v = (uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1); > > - y = (uint16_t *)pic->data[0]; > - u = (uint16_t *)pic->data[1]; > - v = (uint16_t *)pic->data[2]; > - > - for (i = 0; i < avctx->height; i++) { > + for (i = slice_start; i < slice_end; i++) { > for (j = 0; j < avctx->width; j++) { > val = AV_RL32(src); > > @@ -82,6 
+82,35 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data, > v += pic->linesize[2] >> 1; > } > > + return 0; > +} > + > +static int v410_decode_frame(AVCodecContext *avctx, void *data, > + int *got_frame, AVPacket *avpkt) > +{ > + ThreadData td; > + ThreadFrame frame = { .f = data }; > + AVFrame *pic = data; > + uint8_t *src = avpkt->data; > + int ret; > + int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); > + > + td.stride = avctx->width * 4; > + if (avpkt->size < 4 * avctx->height * avctx->width) { > + av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n"); > + return AVERROR(EINVAL); > + } > + > + if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) > + return ret; > + > + pic->key_frame = 1; > + pic->pict_type = AV_PICTURE_TYPE_I; > + > + td.buf = src; > + td.frame = pic; > + avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count); > + > *got_frame = 1; > > return avpkt->size; > @@ -94,5 +123,6 @@ AVCodec ff_v410_decoder = { > .id = AV_CODEC_ID_V410, > .init = v410_decode_init, > .decode = v410_decode_frame, > - .capabilities = AV_CODEC_CAP_DR1, > + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS | > + AV_CODEC_CAP_FRAME_THREADS > }; > -- > 2.21.0 >
On Sat, Nov 23, 2019 at 12:09:09AM +0800, Limin Wang wrote: > > ping, also ping with v210dec thread support which its reviewed by Michael. > https://patchwork.ffmpeg.org/patch/15836/ > > If no developer is interested in the module, I'm glad to maintain it, > I think it's better than nobody. Please feedback. Will apply the v410 patch after correcting an English grammar typo. thx [...]
On 10/25/2019 12:36 PM, lance.lmwang@gmail.com wrote: > From: Limin Wang <lance.lmwang@gmail.com> > > 1, Test server configure: > [root@localhost ~]# cat /proc/cpuinfo |grep "model name" > model name : Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz > model name : Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz > ... > > [root@localhost ~]# free -h > total used free shared buff/cache available > Mem: 102G 997M 93G 16M 7.6G 100G > > 2, performance profiling > master: > ./ffmpeg -y -stream_loop 100 -i ./test.avi -benchmark -f null - > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > bench: utime=16.932s stime=9.417s rtime=26.341s > bench: maxrss=271056kB > frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=8.22x > > patch applied: > ./ffmpeg -y -threads 2 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null - > frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=11.1x > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > bench: utime=23.164s stime=10.983s rtime=19.503s > bench: maxrss=338252kB > > ./ffmpeg -y -threads 4 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null - > frame= 1010 fps= 71 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=15.3x > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > bench: utime=21.610s stime=11.603s rtime=14.160s > bench: maxrss=517060kB Can you try with slice threading? -thread_type slice+frame will default to frame, since afaik both can't run at the same time. 
> > > Signed-off-by: Limin Wang <lance.lmwang@gmail.com> > --- > libavcodec/v410dec.c | 72 +++++++++++++++++++++++++++++++------------- > 1 file changed, 51 insertions(+), 21 deletions(-) > > diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c > index 48fab68273..7ad5eb8fb5 100644 > --- a/libavcodec/v410dec.c > +++ b/libavcodec/v410dec.c > @@ -24,6 +24,13 @@ > #include "libavutil/intreadwrite.h" > #include "avcodec.h" > #include "internal.h" > +#include "thread.h" > + > +typedef struct ThreadData { > + AVFrame *frame; > + uint8_t *buf; > + int stride; > +} ThreadData; > > static av_cold int v410_decode_init(AVCodecContext *avctx) > { > @@ -42,31 +49,24 @@ static av_cold int v410_decode_init(AVCodecContext *avctx) > return 0; > } > > -static int v410_decode_frame(AVCodecContext *avctx, void *data, > - int *got_frame, AVPacket *avpkt) > +static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) > { > - AVFrame *pic = data; > - uint8_t *src = avpkt->data; > + ThreadData *td = arg; > + AVFrame *pic = td->frame; > + int stride = td->stride; > + int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); > + int slice_start = (avctx->height * jobnr) / thread_count; > + int slice_end = (avctx->height * (jobnr+1)) / thread_count; > + const uint8_t *src = td->buf + stride * slice_start; > uint16_t *y, *u, *v; > uint32_t val; > - int i, j, ret; > - > - if (avpkt->size < 4 * avctx->height * avctx->width) { > - av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n"); > - return AVERROR(EINVAL); > - } > - > - if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) > - return ret; > + int i, j; > > - pic->key_frame = 1; > - pic->pict_type = AV_PICTURE_TYPE_I; > + y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1); > + u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1); > + v = (uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1); > > - y = (uint16_t *)pic->data[0]; > - u = (uint16_t *)pic->data[1]; 
> - v = (uint16_t *)pic->data[2]; > - > - for (i = 0; i < avctx->height; i++) { > + for (i = slice_start; i < slice_end; i++) { > for (j = 0; j < avctx->width; j++) { > val = AV_RL32(src); > > @@ -82,6 +82,35 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data, > v += pic->linesize[2] >> 1; > } > > + return 0; > +} > + > +static int v410_decode_frame(AVCodecContext *avctx, void *data, > + int *got_frame, AVPacket *avpkt) > +{ > + ThreadData td; > + ThreadFrame frame = { .f = data }; > + AVFrame *pic = data; > + uint8_t *src = avpkt->data; > + int ret; > + int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); > + > + td.stride = avctx->width * 4; > + if (avpkt->size < 4 * avctx->height * avctx->width) { > + av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n"); > + return AVERROR(EINVAL); > + } > + > + if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) > + return ret; > + > + pic->key_frame = 1; > + pic->pict_type = AV_PICTURE_TYPE_I; > + > + td.buf = src; > + td.frame = pic; > + avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count); > + > *got_frame = 1; > > return avpkt->size; > @@ -94,5 +123,6 @@ AVCodec ff_v410_decoder = { > .id = AV_CODEC_ID_V410, > .init = v410_decode_init, > .decode = v410_decode_frame, > - .capabilities = AV_CODEC_CAP_DR1, > + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS | > + AV_CODEC_CAP_FRAME_THREADS > }; >
On Sun, Nov 24, 2019 at 04:00:07PM +0100, Michael Niedermayer wrote: > On Sat, Nov 23, 2019 at 12:09:09AM +0800, Limin Wang wrote: > > > > ping, also ping with v210dec thread support which its reviewed by Michael. > > https://patchwork.ffmpeg.org/patch/15836/ > > > > If no developer is interested in the module, I'm glad to maintain it, > > I think it's better than nobody. Please feedback. > > will apply the v410 patch after correcting a english grammer typo As James asked for more benchmarks, I'll wait with applying so the commit can contain benchmarks for both cases. thx [...]
On 11/24/2019 4:54 PM, Michael Niedermayer wrote: > On Sun, Nov 24, 2019 at 04:00:07PM +0100, Michael Niedermayer wrote: >> On Sat, Nov 23, 2019 at 12:09:09AM +0800, Limin Wang wrote: >>> >>> ping, also ping with v210dec thread support which its reviewed by Michael. >>> https://patchwork.ffmpeg.org/patch/15836/ >>> >>> If no developer is interested in the module, I'm glad to maintain it, >>> I think it's better than nobody. Please feedback. >> >> will apply the v410 patch after correcting a english grammer typo > > as james asked for more benchmarks, ill wait with applying so the > commit can contain benchmarks for both cases > > thx Actually, I wrote that reply too soon. I missed the fact that Carl asked the same thing and the benchmarks were provided.
On Sun, Nov 24, 2019 at 01:05:53PM -0300, James Almer wrote: > On 10/25/2019 12:36 PM, lance.lmwang@gmail.com wrote: > > From: Limin Wang <lance.lmwang@gmail.com> > > > > 1, Test server configure: > > [root@localhost ~]# cat /proc/cpuinfo |grep "model name" > > model name : Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz > > model name : Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz > > ... > > > > [root@localhost ~]# free -h > > total used free shared buff/cache available > > Mem: 102G 997M 93G 16M 7.6G 100G > > > > 2, performance profiling > > master: > > ./ffmpeg -y -stream_loop 100 -i ./test.avi -benchmark -f null - > > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > > bench: utime=16.932s stime=9.417s rtime=26.341s > > bench: maxrss=271056kB > > frame= 1010 fps= 38 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=8.22x > > > > patch applied: > > ./ffmpeg -y -threads 2 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null - > > frame= 1010 fps= 52 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=11.1x > > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > > bench: utime=23.164s stime=10.983s rtime=19.503s > > bench: maxrss=338252kB > > > > ./ffmpeg -y -threads 4 -thread_type slice+frame -stream_loop 100 -i ./test.avi -benchmark -f null - > > frame= 1010 fps= 71 q=-0.0 Lsize=N/A time=00:03:36.54 bitrate=N/A speed=15.3x > > video:529kB audio:40602kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown > > bench: utime=21.610s stime=11.603s rtime=14.160s > > bench: maxrss=517060kB > > Can you try with slice threading? -thread_type slice+frame will default > to frame, since afaik both can't run at the same time. No problem, I'll update with slice threading for both v410 and v210. So that's why I can't get expected result when try with frame+slice, I expect with better performance. 
> > > > > > > Signed-off-by: Limin Wang <lance.lmwang@gmail.com> > > --- > > libavcodec/v410dec.c | 72 +++++++++++++++++++++++++++++++------------- > > 1 file changed, 51 insertions(+), 21 deletions(-) > > > > diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c > > index 48fab68273..7ad5eb8fb5 100644 > > --- a/libavcodec/v410dec.c > > +++ b/libavcodec/v410dec.c > > @@ -24,6 +24,13 @@ > > #include "libavutil/intreadwrite.h" > > #include "avcodec.h" > > #include "internal.h" > > +#include "thread.h" > > + > > +typedef struct ThreadData { > > + AVFrame *frame; > > + uint8_t *buf; > > + int stride; > > +} ThreadData; > > > > static av_cold int v410_decode_init(AVCodecContext *avctx) > > { > > @@ -42,31 +49,24 @@ static av_cold int v410_decode_init(AVCodecContext *avctx) > > return 0; > > } > > > > -static int v410_decode_frame(AVCodecContext *avctx, void *data, > > - int *got_frame, AVPacket *avpkt) > > +static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) > > { > > - AVFrame *pic = data; > > - uint8_t *src = avpkt->data; > > + ThreadData *td = arg; > > + AVFrame *pic = td->frame; > > + int stride = td->stride; > > + int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); > > + int slice_start = (avctx->height * jobnr) / thread_count; > > + int slice_end = (avctx->height * (jobnr+1)) / thread_count; > > + const uint8_t *src = td->buf + stride * slice_start; > > uint16_t *y, *u, *v; > > uint32_t val; > > - int i, j, ret; > > - > > - if (avpkt->size < 4 * avctx->height * avctx->width) { > > - av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n"); > > - return AVERROR(EINVAL); > > - } > > - > > - if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) > > - return ret; > > + int i, j; > > > > - pic->key_frame = 1; > > - pic->pict_type = AV_PICTURE_TYPE_I; > > + y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1); > > + u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1); > > + v = 
(uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1); > > > > - y = (uint16_t *)pic->data[0]; > > - u = (uint16_t *)pic->data[1]; > > - v = (uint16_t *)pic->data[2]; > > - > > - for (i = 0; i < avctx->height; i++) { > > + for (i = slice_start; i < slice_end; i++) { > > for (j = 0; j < avctx->width; j++) { > > val = AV_RL32(src); > > > > @@ -82,6 +82,35 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data, > > v += pic->linesize[2] >> 1; > > } > > > > + return 0; > > +} > > + > > +static int v410_decode_frame(AVCodecContext *avctx, void *data, > > + int *got_frame, AVPacket *avpkt) > > +{ > > + ThreadData td; > > + ThreadFrame frame = { .f = data }; > > + AVFrame *pic = data; > > + uint8_t *src = avpkt->data; > > + int ret; > > + int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); > > + > > + td.stride = avctx->width * 4; > > + if (avpkt->size < 4 * avctx->height * avctx->width) { > > + av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n"); > > + return AVERROR(EINVAL); > > + } > > + > > + if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) > > + return ret; > > + > > + pic->key_frame = 1; > > + pic->pict_type = AV_PICTURE_TYPE_I; > > + > > + td.buf = src; > > + td.frame = pic; > > + avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count); > > + > > *got_frame = 1; > > > > return avpkt->size; > > @@ -94,5 +123,6 @@ AVCodec ff_v410_decoder = { > > .id = AV_CODEC_ID_V410, > > .init = v410_decode_init, > > .decode = v410_decode_frame, > > - .capabilities = AV_CODEC_CAP_DR1, > > + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS | > > + AV_CODEC_CAP_FRAME_THREADS > > }; > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
On Sun, Nov 24, 2019 at 05:02:55PM -0300, James Almer wrote: > On 11/24/2019 4:54 PM, Michael Niedermayer wrote: > > On Sun, Nov 24, 2019 at 04:00:07PM +0100, Michael Niedermayer wrote: > >> On Sat, Nov 23, 2019 at 12:09:09AM +0800, Limin Wang wrote: > >>> > >>> ping, also ping with v210dec thread support which its reviewed by Michael. > >>> https://patchwork.ffmpeg.org/patch/15836/ > >>> > >>> If no developer is interested in the module, I'm glad to maintain it, > >>> I think it's better than nobody. Please feedback. > >> > >> will apply the v410 patch after correcting a english grammer typo > > > > as james asked for more benchmarks, ill wait with applying so the > > commit can contain benchmarks for both cases I have updated the benchmark results for both v410 and v210. The v210 patch is being sent out in its own thread. > > > > thx > > Actually, i wrote that reply too soon. I missed the fact Carl asked the > same thing and the benchmarks were provided. > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff --git a/libavcodec/v410dec.c b/libavcodec/v410dec.c index 48fab68273..7ad5eb8fb5 100644 --- a/libavcodec/v410dec.c +++ b/libavcodec/v410dec.c @@ -24,6 +24,13 @@ #include "libavutil/intreadwrite.h" #include "avcodec.h" #include "internal.h" +#include "thread.h" + +typedef struct ThreadData { + AVFrame *frame; + uint8_t *buf; + int stride; +} ThreadData; static av_cold int v410_decode_init(AVCodecContext *avctx) { @@ -42,31 +49,24 @@ static av_cold int v410_decode_init(AVCodecContext *avctx) return 0; } -static int v410_decode_frame(AVCodecContext *avctx, void *data, - int *got_frame, AVPacket *avpkt) +static int v410_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) { - AVFrame *pic = data; - uint8_t *src = avpkt->data; + ThreadData *td = arg; + AVFrame *pic = td->frame; + int stride = td->stride; + int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); + int slice_start = (avctx->height * jobnr) / thread_count; + int slice_end = (avctx->height * (jobnr+1)) / thread_count; + const uint8_t *src = td->buf + stride * slice_start; uint16_t *y, *u, *v; uint32_t val; - int i, j, ret; - - if (avpkt->size < 4 * avctx->height * avctx->width) { - av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n"); - return AVERROR(EINVAL); - } - - if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) - return ret; + int i, j; - pic->key_frame = 1; - pic->pict_type = AV_PICTURE_TYPE_I; + y = (uint16_t*)pic->data[0] + slice_start * (pic->linesize[0] >> 1); + u = (uint16_t*)pic->data[1] + slice_start * (pic->linesize[1] >> 1); + v = (uint16_t*)pic->data[2] + slice_start * (pic->linesize[2] >> 1); - y = (uint16_t *)pic->data[0]; - u = (uint16_t *)pic->data[1]; - v = (uint16_t *)pic->data[2]; - - for (i = 0; i < avctx->height; i++) { + for (i = slice_start; i < slice_end; i++) { for (j = 0; j < avctx->width; j++) { val = AV_RL32(src); @@ -82,6 +82,35 @@ static int v410_decode_frame(AVCodecContext *avctx, void *data, v += pic->linesize[2] >> 1; } + return 0; 
+} + +static int v410_decode_frame(AVCodecContext *avctx, void *data, + int *got_frame, AVPacket *avpkt) +{ + ThreadData td; + ThreadFrame frame = { .f = data }; + AVFrame *pic = data; + uint8_t *src = avpkt->data; + int ret; + int thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); + + td.stride = avctx->width * 4; + if (avpkt->size < 4 * avctx->height * avctx->width) { + av_log(avctx, AV_LOG_ERROR, "Insufficient input data.\n"); + return AVERROR(EINVAL); + } + + if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) + return ret; + + pic->key_frame = 1; + pic->pict_type = AV_PICTURE_TYPE_I; + + td.buf = src; + td.frame = pic; + avctx->execute2(avctx, v410_decode_slice, &td, NULL, thread_count); + *got_frame = 1; return avpkt->size; @@ -94,5 +123,6 @@ AVCodec ff_v410_decoder = { .id = AV_CODEC_ID_V410, .init = v410_decode_init, .decode = v410_decode_frame, - .capabilities = AV_CODEC_CAP_DR1, + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS | + AV_CODEC_CAP_FRAME_THREADS };