From patchwork Fri Oct 18 01:15:09 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Lance Wang X-Patchwork-Id: 15835 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id C3B8B4498C8 for ; Fri, 18 Oct 2019 04:15:22 +0300 (EEST) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 9C61768AC46; Fri, 18 Oct 2019 04:15:22 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-pl1-f196.google.com (mail-pl1-f196.google.com [209.85.214.196]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id E0F5D68ABB1 for ; Fri, 18 Oct 2019 04:15:15 +0300 (EEST) Received: by mail-pl1-f196.google.com with SMTP id d22so1998207pll.7 for ; Thu, 17 Oct 2019 18:15:15 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025; h=from:to:cc:subject:date:message-id:in-reply-to:references; bh=xJJgKsfHBLhVew+iLqly5tv9i/etcKKaHv1DeYx46wQ=; b=euHvR2XvJNdjzB2atf4Y3SJkjIVkRx1odOVtVMkckAA2dvPF+byieXZ49YxF04nAa1 c6nVuzLklp0n1ZH3Be/xNsKcuGZDAYrtgLn6WimBqQpU7Wgh0uD82yg2sx/mNaDSG4gk 2qbqC7Kc991KkpnLrZw/xa66X0+oqvMAJ0llJDlF8R8Km37cmTjNbZuA+NvQ+adz8O9w YSUJPg8HOYYO5i5Vn/N5iGNh7KQtGyx2XrDYiGCXvROHFbFQsQMnB6YNBfomJ4i5GEmv 7eM22CFpvyfCQG6HYbAWnpebWEUSi2M3X/hHQbaIpGEArVvL3FXTVa/LsQNxPgEmo7s+ ZHVQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references; bh=xJJgKsfHBLhVew+iLqly5tv9i/etcKKaHv1DeYx46wQ=; b=qROcScr7+S+ilrLdEU/wD6woMH/xChdp/rT10epOzWczteQR0KBG58xcua2PXlspaD q+mmW02ZG9Dw/L624iAZYgYsTfAuj6F1VILbwBZSxDJIDpaGWordsWJ3SB9xAfYEIMZZ 59xkmC9Kh2lQDQ0AZD+PLn2CL5O1FmndMqb6mXFzRD+MSxT+fHpEuyEpKvNqQiwOjwhP 
zqOOr/V8r7eoyIYMt+tb0YhhUUOWVvZ/rhjSab3c2BUjMXlYDHXv3HTPmPuPymx7AzZo 4Z4QfROykRmOhJIe9AC3pRhGvX3/q2p3jc1beHOe9irqU6HwsL9KC8L+dBrpPNBx1XFX 4r8w== X-Gm-Message-State: APjAAAXS571ieST1EyQrPyFbTgGp500nEFa+PnM3he6gk+79/JNRRqqf 8bqB09sHX7XFMlIi1h3eykllaNf+9XQ= X-Google-Smtp-Source: APXvYqxjwm/MP4tm7by4CDsqdjQN/Tgqx+plOcBtYG9mtk84RN5IkA9afah2UwyJR+5mZDsUQMcKcQ== X-Received: by 2002:a17:902:9a93:: with SMTP id w19mr6693782plp.316.1571361313895; Thu, 17 Oct 2019 18:15:13 -0700 (PDT) Received: from vpn.localdomain ([47.90.99.151]) by smtp.gmail.com with ESMTPSA id h14sm3911759pfo.15.2019.10.17.18.15.12 (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Thu, 17 Oct 2019 18:15:13 -0700 (PDT) From: lance.lmwang@gmail.com To: ffmpeg-devel@ffmpeg.org Date: Fri, 18 Oct 2019 09:15:09 +0800 Message-Id: <20191018011509.26915-1-lance.lmwang@gmail.com> X-Mailer: git-send-email 2.9.5 In-Reply-To: <20190830033752.26454-2-lance.lmwang@gmail.com> References: <20190830033752.26454-2-lance.lmwang@gmail.com> Subject: [FFmpeg-devel] [PATCH v7] avcodec/v210dec: add the frame and slice threading support X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Limin Wang MIME-Version: 1.0 Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" From: Limin Wang Threading avoids a single CPU core being fully occupied by the decoder when it runs alongside other filters such as scale. Regarding performance: if your CPU frequency is very high, the gain is very small, but on CPUs with more cores and a lower clock frequency you will see a larger improvement. 
The following are my performance test results on two different systems: 1. Test results on my old Mac Pro: ./ffmpeg -y -i ./4k_4096_3072.mov -c:v v210 -f rawvideo -frames 10 ./1.v210 ./ffmpeg -threads 1 -s 4096x3072 -stream_loop 100 -i ./1.v210 -benchmark -f null - frame= 1010 fps= 42 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=1.69x With the patch applied: ./ffmpeg -threads 4 -thread_type frame+slice -s 4096x3072 -stream_loop 100 -i ./1.v210 -benchmark -f null - frame= 1010 fps= 55 q=-0.0 Lsize=N/A time=00:00:40.40 bitrate=N/A speed=2.22x 2. Test results on an x86 server (Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz): ./ffmpeg -y -i ./4k_3840_2160.ts -c:v v210 -f rawvideo -frames 50 ./2.v210 ./ffmpeg -threads 1 -s 3840x2160 -stream_loop 20 -i ./2.v210 -benchmark -f null - frame= 1050 fps= 80 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=3.19x With the patch applied: ./ffmpeg -threads 2 -thread_type frame+slice -s 3840x2160 -stream_loop 20 -i ./2.v210 -benchmark -f null - frame= 1050 fps=111 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=4.45x ./ffmpeg -threads 4 -thread_type frame+slice -s 3840x2160 -stream_loop 20 -i ./2.v210 -benchmark -f null - frame= 1050 fps=145 q=-0.0 Lsize=N/A time=00:00:42.00 bitrate=N/A speed=5.81x Signed-off-by: Limin Wang --- libavcodec/v210dec.c | 126 ++++++++++++++++++++++++++----------------- libavcodec/v210dec.h | 1 + 2 files changed, 79 insertions(+), 48 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index 5a33d8c089..4b436d2fa0 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -28,6 +28,7 @@ #include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/intreadwrite.h" +#include "thread.h" #define READ_PIXELS(a, b, c) \ do { \ @@ -37,6 +38,12 @@ *c++ = (val >> 20) & 0x3FF; \ } while (0) +typedef struct ThreadData { + AVFrame *frame; + uint8_t *buf; + int stride; +} ThreadData; + static void v210_planar_unpack_c(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, 
int width) { uint32_t val; @@ -64,21 +71,81 @@ static av_cold int decode_init(AVCodecContext *avctx) avctx->pix_fmt = AV_PIX_FMT_YUV422P10; avctx->bits_per_raw_sample = 10; + s->thread_count = av_clip(avctx->thread_count, 1, avctx->height/4); s->aligned_input = 0; ff_v210dec_init(s); return 0; } +static int v210_decode_slice(AVCodecContext *avctx, void *arg, int jobnr, int threadnr) +{ + V210DecContext *s = avctx->priv_data; + int h, w; + ThreadData *td = arg; + AVFrame *frame = td->frame; + int stride = td->stride; + int slice_start = (avctx->height * jobnr ) / s->thread_count; + int slice_end = (avctx->height * (jobnr+1)) / s->thread_count; + uint8_t *psrc = td->buf + stride * slice_start; + uint16_t *y, *u, *v; + + y = (uint16_t*)frame->data[0] + slice_start * frame->linesize[0] / 2; + u = (uint16_t*)frame->data[1] + slice_start * frame->linesize[1] / 2; + v = (uint16_t*)frame->data[2] + slice_start * frame->linesize[2] / 2; + for (h = slice_start; h < slice_end; h++) { + const uint32_t *src = (const uint32_t*)psrc; + uint32_t val; + + w = (avctx->width / 12) * 12; + s->unpack_frame(src, y, u, v, w); + + y += w; + u += w >> 1; + v += w >> 1; + src += (w << 1) / 3; + + if (w < avctx->width - 5) { + READ_PIXELS(u, y, v); + READ_PIXELS(y, u, y); + READ_PIXELS(v, y, u); + READ_PIXELS(y, v, y); + w += 6; + } + + if (w < avctx->width - 1) { + READ_PIXELS(u, y, v); + + val = av_le2ne32(*src++); + *y++ = val & 0x3FF; + if (w < avctx->width - 3) { + *u++ = (val >> 10) & 0x3FF; + *y++ = (val >> 20) & 0x3FF; + + val = av_le2ne32(*src++); + *v++ = val & 0x3FF; + *y++ = (val >> 10) & 0x3FF; + } + } + + psrc += stride; + y += frame->linesize[0] / 2 - avctx->width + (avctx->width & 1); + u += frame->linesize[1] / 2 - avctx->width / 2; + v += frame->linesize[2] / 2 - avctx->width / 2; + } + + return 0; +} + static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt) { V210DecContext *s = avctx->priv_data; - - int h, w, ret, stride, aligned_input; 
+ ThreadData td; + int ret, stride, aligned_input; + ThreadFrame frame = { .f = data }; AVFrame *pic = data; const uint8_t *psrc = avpkt->data; - uint16_t *y, *u, *v; if (s->custom_stride ) stride = s->custom_stride; @@ -86,6 +153,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, int aligned_width = ((avctx->width + 47) / 48) * 48; stride = aligned_width * 8 / 3; } + td.stride = stride; if (avpkt->size < stride * avctx->height) { if ((((avctx->width + 23) / 24) * 24 * 8) / 3 * avctx->height == avpkt->size) { @@ -110,55 +178,15 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, ff_v210dec_init(s); } - if ((ret = ff_get_buffer(avctx, pic, 0)) < 0) + if ((ret = ff_thread_get_buffer(avctx, &frame, 0)) < 0) return ret; - y = (uint16_t*)pic->data[0]; - u = (uint16_t*)pic->data[1]; - v = (uint16_t*)pic->data[2]; pic->pict_type = AV_PICTURE_TYPE_I; pic->key_frame = 1; - for (h = 0; h < avctx->height; h++) { - const uint32_t *src = (const uint32_t*)psrc; - uint32_t val; - - w = (avctx->width / 12) * 12; - s->unpack_frame(src, y, u, v, w); - - y += w; - u += w >> 1; - v += w >> 1; - src += (w << 1) / 3; - - if (w < avctx->width - 5) { - READ_PIXELS(u, y, v); - READ_PIXELS(y, u, y); - READ_PIXELS(v, y, u); - READ_PIXELS(y, v, y); - w += 6; - } - - if (w < avctx->width - 1) { - READ_PIXELS(u, y, v); - - val = av_le2ne32(*src++); - *y++ = val & 0x3FF; - if (w < avctx->width - 3) { - *u++ = (val >> 10) & 0x3FF; - *y++ = (val >> 20) & 0x3FF; - - val = av_le2ne32(*src++); - *v++ = val & 0x3FF; - *y++ = (val >> 10) & 0x3FF; - } - } - - psrc += stride; - y += pic->linesize[0] / 2 - avctx->width + (avctx->width & 1); - u += pic->linesize[1] / 2 - avctx->width / 2; - v += pic->linesize[2] / 2 - avctx->width / 2; - } + td.buf = (uint8_t*)psrc; + td.frame = pic; + avctx->execute2(avctx, v210_decode_slice, &td, NULL, s->thread_count); if (avctx->field_order > AV_FIELD_PROGRESSIVE) { /* we have interlaced material flagged in 
container */ @@ -194,6 +222,8 @@ AVCodec ff_v210_decoder = { .priv_data_size = sizeof(V210DecContext), .init = decode_init, .decode = decode_frame, - .capabilities = AV_CODEC_CAP_DR1, + .capabilities = AV_CODEC_CAP_DR1 | + AV_CODEC_CAP_SLICE_THREADS | + AV_CODEC_CAP_FRAME_THREADS, .priv_class = &v210dec_class, }; diff --git a/libavcodec/v210dec.h b/libavcodec/v210dec.h index cfdb29da09..662e266315 100644 --- a/libavcodec/v210dec.h +++ b/libavcodec/v210dec.h @@ -27,6 +27,7 @@ typedef struct { AVClass *av_class; int custom_stride; int aligned_input; + int thread_count; int stride_warning_shown; void (*unpack_frame)(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); } V210DecContext;