From patchwork Sun Dec 30 20:57:23 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alex Mogurenko X-Patchwork-Id: 11594 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 0742344CDE1 for ; Sun, 30 Dec 2018 23:04:49 +0200 (EET) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id AD87C68A357; Sun, 30 Dec 2018 23:04:45 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-lf1-f68.google.com (mail-lf1-f68.google.com [209.85.167.68]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 04DD7689A72 for ; Sun, 30 Dec 2018 23:04:38 +0200 (EET) Received: by mail-lf1-f68.google.com with SMTP id i26so17495210lfc.0 for ; Sun, 30 Dec 2018 13:04:46 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20161025; h=sender:from:to:subject:date:message-id:mime-version :content-transfer-encoding; bh=zshMXvnCt/TIMz8B5Nro9n0pu48RXhNiW1cdfLh5asI=; b=tfYFsQH/WMg2/GjaV+m+74r0S/RdhBJiK4xIml7sDPwjXPpS1hKpRq7lP8Iz1hR+Pd eCK8fLnr2TXVtQxAPXVFiz6TW+8D8UisvAp4TiosiKVkEjm/RtYAjBgzvYYcUBqrFkJw ADUUTOQNWJfupt+w4KXlN9hRHU7avdnVBccX59wCX4ehUvu+JQYfsOLVVt1qa1U852MH gKnVEaVx/iXuPixQPpyUEyjdmgsDaoRLziCoGBaBX5GjJLyHno28CNigYNiIMcWVEP4E qGRpuFbg2Ap25XIcdgPlPIsBiitalJU5WgrJL4o9t6T4e0RhOPgEiF4z39L2Xs/iUThJ jmfQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:sender:from:to:subject:date:message-id :mime-version:content-transfer-encoding; bh=zshMXvnCt/TIMz8B5Nro9n0pu48RXhNiW1cdfLh5asI=; b=G/KjSm1L3seHJdoFiaLWLv3u3/8X0pcsjxRBEwxjVv3t3x2VHbAU6KKd4EfUUNMMa1 HKpwQu59qvce3c5VzWKzmGnMyMKX572MFqWwAaMH9BgCZrYGWvVTIYwMkzKrcoJCAnTg 
3Xp52Uu6Br2w8pJKratIJ11MRPm11AzQhJ0UygnTOUbvjB68zzmWUzkfj18PL9WnC7+9 OvBIROQmAKwfw3V7X6S6W9/RoGed9XW7AG4O2JysjcNTBV9G9fufKW0ejj7xVA7/4ZOU qJk/lndRfJyHbWgNh+iFHwETd66AkhjLDQCyTeuoQ2vDx47WVsBJFdqbiVpqMvIUXeM1 1JMg== X-Gm-Message-State: AA+aEWbUsV+mAnBvS/I3fCfmdXwGi1/CMnphPEbrZQCCby+1DeM1ofBH 1gCSfZm23qOgO3230IFl1cz9Gf03 X-Google-Smtp-Source: AFSGD/WvAdKaSxLATUTEZPrkSUO4YX/sJfVEtV8gxLFA6s9m6PKRhD5FOMi/DHGyas1LQdnfFBV5MQ== X-Received: by 2002:a19:4948:: with SMTP id l8mr18371653lfj.156.1546203456702; Sun, 30 Dec 2018 12:57:36 -0800 (PST) Received: from localhost.localdomain ([77.121.132.178]) by smtp.gmail.com with ESMTPSA id g12-v6sm9654274lja.74.2018.12.30.12.57.35 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Sun, 30 Dec 2018 12:57:36 -0800 (PST) From: Alex Mogurenko To: ffmpeg-devel@ffmpeg.org Date: Sun, 30 Dec 2018 22:57:23 +0200 Message-Id: <20181230205723.2596-1-alex@mogurenko.com> X-Mailer: git-send-email 2.19.0 MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] avcodec/prores_ks reduce twice fdct calls X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" The forward DCT (fdct) is currently performed twice for each block: the first time during quantiser calculation, and the second time during slice encoding. If we save the DCT coefficients computed in the first pass, there is no need to perform the fdct a second time. 
disadvantages: requires more memory advantages: improves performance ~4-5% --- libavcodec/proresenc_kostya.c | 74 ++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c index e045a972f1..4d49d6521a 100644 --- a/libavcodec/proresenc_kostya.c +++ b/libavcodec/proresenc_kostya.c @@ -219,7 +219,6 @@ struct TrellisNode { #define MAX_STORED_Q 16 typedef struct ProresThreadData { - DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE]; DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16]; int16_t custom_q[64]; int16_t custom_chroma_q[64]; @@ -228,7 +227,6 @@ typedef struct ProresThreadData { typedef struct ProresContext { AVClass *class; - DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE]; DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16]; int16_t quants[MAX_STORED_Q][64]; int16_t quants_chroma[MAX_STORED_Q][64]; @@ -237,6 +235,7 @@ typedef struct ProresContext { const uint8_t *quant_mat; const uint8_t *quant_chroma_mat; const uint8_t *scantable; + int16_t *blocks[MAX_PLANES]; void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src, ptrdiff_t linesize, int16_t *block); @@ -562,6 +561,8 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, int plane_factor, is_chroma; uint16_t *qmat; uint16_t *qmat_chroma; + int16_t *blocks; + DECLARE_ALIGNED(16, int16_t, dct_blocks)[16 * 16 * MAX_MBS_PER_SLICE]; if (ctx->pictures_per_frame == 1) line_add = 0; @@ -604,28 +605,38 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic, src = (const uint16_t*)(pic->data[i] + yp * linesize + line_add * pic->linesize[i]) + xp; + if (!ctx->force_quant) { + blocks = ctx->blocks[i] + (y * ctx->slices_width + x / ctx->mbs_per_slice) * 16 * 16 * ctx->mbs_per_slice; + } else { + blocks = dct_blocks; + } + if (i < 3) { - get_slice_data(ctx, src, linesize, xp, yp, - pwidth, avctx->height / ctx->pictures_per_frame, - 
ctx->blocks[0], ctx->emu_buf, - mbs_per_slice, num_cblocks, is_chroma); + if (ctx->force_quant) { + get_slice_data(ctx, src, linesize, xp, yp, + pwidth, avctx->height / ctx->pictures_per_frame, + blocks, ctx->emu_buf, + mbs_per_slice, num_cblocks, is_chroma); + } if (!is_chroma) {/* luma quant */ sizes[i] = encode_slice_plane(ctx, pb, src, linesize, - mbs_per_slice, ctx->blocks[0], + mbs_per_slice, blocks, num_cblocks, plane_factor, qmat); } else { /* chroma plane */ sizes[i] = encode_slice_plane(ctx, pb, src, linesize, - mbs_per_slice, ctx->blocks[0], + mbs_per_slice, blocks, num_cblocks, plane_factor, qmat_chroma); } } else { - get_alpha_data(ctx, src, linesize, xp, yp, - pwidth, avctx->height / ctx->pictures_per_frame, - ctx->blocks[0], mbs_per_slice, ctx->alpha_bits); + if (ctx->force_quant) { + get_alpha_data(ctx, src, linesize, xp, yp, + pwidth, avctx->height / ctx->pictures_per_frame, + blocks, mbs_per_slice, ctx->alpha_bits); + } sizes[i] = encode_alpha_plane(ctx, pb, mbs_per_slice, - ctx->blocks[0], quant); + blocks, quant); } total_size += sizes[i]; if (put_bits_left(pb) < 0) { @@ -730,15 +741,15 @@ static int estimate_slice_plane(ProresContext *ctx, int *error, int plane, const uint16_t *src, ptrdiff_t linesize, int mbs_per_slice, int blocks_per_mb, int plane_size_factor, - const int16_t *qmat, ProresThreadData *td) + const int16_t *qmat, int16_t *blocks) { int blocks_per_slice; int bits; blocks_per_slice = mbs_per_slice * blocks_per_mb; - bits = estimate_dcs(error, td->blocks[plane], blocks_per_slice, qmat[0]); - bits += estimate_acs(error, td->blocks[plane], blocks_per_slice, + bits = estimate_dcs(error, blocks, blocks_per_slice, qmat[0]); + bits += estimate_acs(error, blocks, blocks_per_slice, plane_size_factor, ctx->scantable, qmat); return FFALIGN(bits, 8); @@ -819,6 +830,7 @@ static int find_slice_quant(AVCodecContext *avctx, int overquant; uint16_t *qmat; uint16_t *qmat_chroma; + int16_t *blocks[MAX_PLANES]; int linesize[4], line_add; int 
alpha_bits = 0; @@ -848,16 +860,17 @@ static int find_slice_quant(AVCodecContext *avctx, linesize[i] = ctx->pic->linesize[i] * ctx->pictures_per_frame; src = (const uint16_t *)(ctx->pic->data[i] + yp * linesize[i] + line_add * ctx->pic->linesize[i]) + xp; + blocks[i] = ctx->blocks[i] + (y * ctx->slices_width + x / ctx->mbs_per_slice) * 16 * 16 * ctx->mbs_per_slice; if (i < 3) { get_slice_data(ctx, src, linesize[i], xp, yp, pwidth, avctx->height / ctx->pictures_per_frame, - td->blocks[i], td->emu_buf, + blocks[i], td->emu_buf, mbs_per_slice, num_cblocks[i], is_chroma[i]); } else { get_alpha_data(ctx, src, linesize[i], xp, yp, pwidth, avctx->height / ctx->pictures_per_frame, - td->blocks[i], mbs_per_slice, ctx->alpha_bits); + blocks[i], mbs_per_slice, ctx->alpha_bits); } } @@ -868,7 +881,7 @@ static int find_slice_quant(AVCodecContext *avctx, if (ctx->alpha_bits) alpha_bits = estimate_alpha_plane(ctx, src, linesize[3], - mbs_per_slice, td->blocks[3]); + mbs_per_slice, blocks[3]); // todo: maybe perform coarser quantising to fit into frame size when needed for (q = min_quant; q <= max_quant; q++) { bits = alpha_bits; @@ -877,13 +890,13 @@ static int find_slice_quant(AVCodecContext *avctx, src, linesize[0], mbs_per_slice, num_cblocks[0], plane_factor[0], - ctx->quants[q], td); /* estimate luma plane */ + ctx->quants[q], blocks[0]); /* estimate luma plane */ for (i = 1; i < ctx->num_planes - !!ctx->alpha_bits; i++) { /* estimate chroma plane */ bits += estimate_slice_plane(ctx, &error, i, src, linesize[i], mbs_per_slice, num_cblocks[i], plane_factor[i], - ctx->quants_chroma[q], td); + ctx->quants_chroma[q], blocks[i]); } if (bits > 65000 * 8) error = SCORE_LIMIT; @@ -914,13 +927,13 @@ static int find_slice_quant(AVCodecContext *avctx, src, linesize[0], mbs_per_slice, num_cblocks[0], plane_factor[0], - qmat, td);/* estimate luma plane */ + qmat, blocks[0]);/* estimate luma plane */ for (i = 1; i < ctx->num_planes - !!ctx->alpha_bits; i++) { /* estimate chroma plane */ 
bits += estimate_slice_plane(ctx, &error, i, src, linesize[i], mbs_per_slice, num_cblocks[i], plane_factor[i], - qmat_chroma, td); + qmat_chroma, blocks[i]); } if (bits <= ctx->bits_per_mb * mbs_per_slice) break; @@ -1167,6 +1180,10 @@ static av_cold int encode_close(AVCodecContext *avctx) av_freep(&ctx->tdata); av_freep(&ctx->slice_q); + for (i = 0; i < MAX_PLANES; i++) { + av_freep(&ctx->blocks[i]); + } + return 0; } @@ -1319,6 +1336,19 @@ FF_ENABLE_DEPRECATION_WARNINGS ctx->tdata[j].nodes[i].score = 0; } } + + for (j = 0; j < MAX_PLANES; j++) { + ctx->blocks[j] = av_malloc(16 * 16 + * ctx -> slices_width + * ctx -> mb_height + * ctx -> mbs_per_slice + * sizeof(*ctx->blocks[0])); + + if (!ctx->blocks[j]) { + encode_close(avctx); + return AVERROR(ENOMEM); + } + } } else { int ls = 0; int ls_chroma = 0;