From patchwork Sun Sep 1 13:29:07 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Lance Wang X-Patchwork-Id: 14834 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 4C699449D28 for ; Sun, 1 Sep 2019 16:29:21 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 2DD956805FE; Sun, 1 Sep 2019 16:29:21 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-pg1-f182.google.com (mail-pg1-f182.google.com [209.85.215.182]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 98AC6680468 for ; Sun, 1 Sep 2019 16:29:14 +0300 (EEST) Received: by mail-pg1-f182.google.com with SMTP id i18so5878617pgl.11 for ; Sun, 01 Sep 2019 06:29:14 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025; h=from:to:cc:subject:date:message-id:in-reply-to:references; bh=6Ul16eWvcycm8/j6udRxEgI0xoJN/CBw4LzrgPqnvpM=; b=Re7Y+K5W6lawx5gP2sdp78us7Yf5I77eK/CEQYfAFqakuIL1DRA3C0ZVj7n5PYGcTj ueHQrHs5iqNHGblNvyxUEuQ5K1P1VejHZhlXaonn7eAicMLO1KZXOg6KlzgM0p52TKWk nbeRbX3UGqtDxlTtML9biCr5i+SfVfcvdrwp1idkf9J6r20LpY5d2SC9h+sEcBJ0848K RiVaq+klGC2BkEOy0zM9tJD/qzigXarAha2XBHOYaQzVnzpdf0AzQKB1tSPBwb13fmLe njlWXbmFF/cVnmMxFqINpl0OPCuD4UdUnE26YmDj+bfrrUsvMVjlezJUDKYxnUdsUSYI PozA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references; bh=6Ul16eWvcycm8/j6udRxEgI0xoJN/CBw4LzrgPqnvpM=; b=o3eFh7mlCjlWqCBGuQLmQjk0h6qQZ1SWf1/YTqyAzsS4UK5RyIGvTEURQGYyCUFj77 zPRQEjOScBhfFrWRJjkrAzf/x7nTerWJGUaJsuY1znTmbsBITOz9iqzSHu24T1Qp+Ely oTjSM/BIwZwLwLHZ4CURBbwSE0n8N4Rg2v2TESO7gThn1LlJxMEOCsHDXj3hrdufTEoq 
8qMwPZXf4j30GuQlqh3FMT87VD3VRVzS63It1rjdWcTqhK90voo9d5yRU6lwmmN/15Ll rsCdyJgTA4hJZpOZg2hSyKouuOL54GabEuoJS5pUMTC1h/VSY7yjnHbaFD62oUXbjdKg 1k4Q== X-Gm-Message-State: APjAAAUThvskRqbiGzhUO09HEblg+Ma4VPD9OAD0ZTnri6K1oOaHHXvW 5ccDiU4RI5LLbL7qAw8FvVVb9ls4 X-Google-Smtp-Source: APXvYqw18OAWJAqHskUpKemxZvhaeeXfX7SsEJBXVt1R8/R5SPoJqg9NnZe2G6J+iGkhfZr/qOz6Ng== X-Received: by 2002:a62:52d0:: with SMTP id g199mr12741259pfb.120.1567344552727; Sun, 01 Sep 2019 06:29:12 -0700 (PDT) Received: from vpn.localdomain ([47.90.99.151]) by smtp.gmail.com with ESMTPSA id s5sm11648100pjo.26.2019.09.01.06.29.11 (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Sun, 01 Sep 2019 06:29:12 -0700 (PDT) From: lance.lmwang@gmail.com To: ffmpeg-devel@ffmpeg.org Date: Sun, 1 Sep 2019 21:29:07 +0800 Message-Id: <20190901132907.28620-1-lance.lmwang@gmail.com> X-Mailer: git-send-email 2.9.5 In-Reply-To: <20190830033752.26454-1-lance.lmwang@gmail.com> References: <20190830033752.26454-1-lance.lmwang@gmail.com> Subject: [FFmpeg-devel] [PATCH v1 2/4] avcodec/v210dec: add the slice threading support X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Limin Wang MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" From: Limin Wang Slice threading avoids saturating a single CPU core when the decoder runs together with other filters such as scale. The performance gain itself is very small; my test results are below. To avoid the disk bottleneck, I use stream_loop mode with only 10 frames.
./ffmpeg -y -i ~/Movies/4k_Rec709_ProResHQ.mov -c:v v210 -f rawvideo -frames 10 ~/Movies/1.v210 ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 1000 -i ~/Movies/1.v210 -benchmark -f null - master: frame= 1010 fps= 40 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.59x video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown bench: utime=10.724s stime=14.580s rtime=25.405s bench: maxrss=147800064kB ./ffmpeg -threads 4 -s 4096x3072 -stream_loop 100 -i ~/Movies/1.v210 -benchmark -f null - patched: frame= 1010 fps= 45 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.78x video:529kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown bench: utime=27.463s stime=14.760s rtime=22.699s bench: maxrss=147828736kB Signed-off-by: Limin Wang --- libavcodec/v210dec.c | 136 ++++++++++++++++++++++++++++++++------------------- libavcodec/v210dec.h | 1 + 2 files changed, 87 insertions(+), 50 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index 6ce18aa..d3add8b 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -28,6 +28,7 @@ #include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/intreadwrite.h" +#include "thread.h" #define READ_PIXELS(a, b, c) \ do { \ @@ -37,6 +38,13 @@ *c++ = (val >> 20) & 0x3FF; \ } while (0) +#define MAX_SLICES 32 +typedef struct ThreadData { + AVFrame *frame; + uint8_t *buf; + int stride; +} ThreadData; + static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) { uint32_t val; @@ -67,58 +75,32 @@ static av_cold int decode_init(AVCodecContext *avctx) s->aligned_input = 0; ff_v210dec_init(s); + s->slice_count = av_clip(avctx->thread_count, 1, MAX_SLICES); return 0; } -static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, - AVPacket *avpkt) +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int nb_jobs) { V210DecContext *s = 
avctx->priv_data; - - int h, w, ret, stride, aligned_input; - AVFrame *pic = data; - const uint8_t *psrc = avpkt->data; + int h, w; + ThreadData *td = arg; + AVFrame *frame = td->frame; + int stride = td->stride; + int slice_h = avctx->height / s->slice_count; + int slice_m = avctx->height % s->slice_count; + int slice_start = jobnr * slice_h; + int slice_end = slice_start + slice_h; + const uint8_t *psrc = td->buf + stride * slice_start; uint16_t *y, *u, *v; - if (s->custom_stride ) - stride = s->custom_stride; - else { - int aligned_width = ((avctx->width + 47) / 48) * 48; - stride = aligned_width * 8 / 3; - } + /* add the remaining slice for the last job */ + if (jobnr == s->slice_count - 1) + slice_end += slice_m; - if (avpkt->size < stride * avctx->height) { - if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) { - stride = avpkt->size / avctx->height; - if (!s->stride_warning_shown) - av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n"); - s->stride_warning_shown = 1; - } else { - av_log(avctx, AV_LOG_ERROR, "packet too small\n"); - return AVERROR_INVALIDDATA; - } - } - if (avctx->codec_tag == MKTAG('C', '2', '1', '0') - && AV_RN32(psrc) == AV_RN32("INFO") - && avpkt->size - 64 >= stride * avctx->height) - psrc += 64; - - aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f); - if (aligned_input != s->aligned_input) { - s->aligned_input = aligned_input; - ff_v210dec_init(s); - } - - if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) - return ret; - - y = (uint16_t*)pic->data[0]; - u = (uint16_t*)pic->data[1]; - v = (uint16_t*)pic->data[2]; - pic->pict_type = AV_PICTURE_TYPE_I; - pic->key_frame = 1; - - for (h = 0; h < avctx->height; h++) { + y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2; + u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2; + v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2; + for (h = slice_start; h < slice_end; h++) 
{ const uint32_t *src = (const uint32_t*)psrc; uint32_t val; @@ -154,16 +136,68 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, } psrc += stride; - y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1); - u += pic->linesize[1] / 2 - avctx->width / 2; - v += pic->linesize[2] / 2 - avctx->width / 2; + y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1); + u += frame->linesize[1] / 2 - avctx->width / 2; + v += frame->linesize[2] / 2 - avctx->width / 2; + } + + return 0; +} + +static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, + AVPacket *avpkt) +{ + V210DecContext *s = avctx->priv_data; + ThreadData td; + int ret, stride, aligned_input; + AVFrame *frame = data; + const uint8_t *psrc = avpkt->data; + + if (s->custom_stride ) + stride = s->custom_stride; + else { + int aligned_width = ((avctx->width + 47) / 48) * 48; + stride = aligned_width * 8 / 3; } + td.stride = stride; + + if (avpkt->size < stride * avctx->height) { + if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) { + stride = avpkt->size / avctx->height; + if (!s->stride_warning_shown) + av_log(avctx, AV_LOG_WARNING, "Broken v210 with too small padding (64 byte) detected\n"); + s->stride_warning_shown = 1; + } else { + av_log(avctx, AV_LOG_ERROR, "packet too small\n"); + return AVERROR_INVALIDDATA; + } + } + if (avctx->codec_tag == MKTAG('C', '2', '1', '0') + && AV_RN32(psrc) == AV_RN32("INFO") + && avpkt->size - 64 >= stride * avctx->height) + psrc += 64; + + aligned_input = !((uintptr_t)psrc & 0x1f) && !(stride & 0x1f); + if (aligned_input != s->aligned_input) { + s->aligned_input = aligned_input; + ff_v210dec_init(s); + } + + if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) + return ret; + + frame->pict_type = AV_PICTURE_TYPE_I; + frame->key_frame = 1; + + td.buf = (uint8_t*)psrc; + td.frame = frame; + avctx->execute2(avctx, v210_decode_slice, &td, NULL, s->slice_count); if (avctx->field_order > 
AV_FIELD_PROGRESSIVE) { /* we have interlaced material flagged in container */ - pic->interlaced_frame = 1; + frame->interlaced_frame = 1; if (avctx->field_order == AV_FIELD_TT || avctx->field_order == AV_FIELD_TB) - pic->top_field_first = 1; + frame->top_field_first = 1; } *got_frame = 1; @@ -193,6 +227,8 @@ AVCodec ff_v210_decoder = { .priv_data_size = sizeof(V210DecContext), .init = decode_init, .decode = decode_frame, - .capabilities = AV_CODEC_CAP_DR1, + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_SLICE_THREADS, .priv_class = &v210dec_class, + .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | + FF_CODEC_CAP_INIT_CLEANUP, }; diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h index cfdb29d..3581943 100644 --- a/libavcodec/v210dec.h +++ b/libavcodec/v210dec.h @@ -26,6 +26,7 @@ typedef struct { AVClass *av_class; int custom_stride; + int slice_count; // Number of slices for threaded operations int aligned_input; int stride_warning_shown; void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);