From patchwork Mon Oct 9 12:18:45 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: kaustubh.raste@imgtec.com X-Patchwork-Id: 5505 Delivered-To: ffmpegpatchwork@gmail.com Received: by 10.2.161.90 with SMTP id m26csp2581881jah; Mon, 9 Oct 2017 05:17:23 -0700 (PDT) X-Google-Smtp-Source: AOwi7QCeFiFnb8izNWS5kBEmBq2uKXgWOwdEldU5YLSO8gldG4Pax22BM3tA5EvTp9/izoI4pZyW X-Received: by 10.28.26.11 with SMTP id a11mr9294954wma.90.1507551443258; Mon, 09 Oct 2017 05:17:23 -0700 (PDT) ARC-Seal: i=1; a=rsa-sha256; t=1507551443; cv=none; d=google.com; s=arc-20160816; b=aDVvzxKH6IyXtK/McaLh04GA+Qg7Mj4UL/ECrHeTScBzobZqiWtpLJMhHT9UNpD7w7 h0/DhAJFzJtwmi4+Qgxg4DmXNiJcmQnYMZWEIec3Fl9wzTB3C97CBekUOJkLSPffNwC5 k7o1hv2n6xpo7bNGbbru+Xmqa0ryoh49XfvViNQFJeq/7OGb4h4pAyYJrPTAr1nME3pN BUcUbQbzEQb/gGzQrAQN1r8mcFWyOlXnUQmIzhjWAiyBMCKe+SQR42bQkruB8NyegD54 pk4c5KDcWC8h4fV8X7O9iYfxdcjHHmrvztrjm4qqSFrJnNkTGQXJXoEM6gSHizd9Sq/5 Hzqg== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816; h=sender:errors-to:content-transfer-encoding:cc:reply-to :list-subscribe:list-help:list-post:list-archive:list-unsubscribe :list-id:precedence:subject:mime-version:message-id:date:to:from :delivered-to:arc-authentication-results; bh=bDK2UCR0+aHqOKVhmT6uZdmXOcc2i5FCPVHgmO2Xnog=; b=fi1GaIy16IqTDyr/vOxcFYgh6DXhdmDQlY+WWyXS/HsEGghMvsNbqb1NS+7227gJ2n Rj4qcPFThJMQjsQlSRnQggGqzr5ltaet+jzHQZpyX6hNdl1yC+yqxh8bbOe9bTH9OUQ6 atso481Q+0hZFwvqRyA4wPtu8RWexgUB0XDXZFXYIRm124U4U30tx8BRZIyAw5k7V8wY ZFAlvXbYC7K2FxsugLGpWz/hCWmtp3z7rg+YjVpY4VLzFLquQEV9sBGFsO78l+LtOp6U 86scWNcygl2IkC6LEb/rBLOXRXST18XfU+mISkyIfE1nKYVXgu865xqERe/fmJ5xnTPb m8Dg== ARC-Authentication-Results: i=1; mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Return-Path: Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org. [79.124.17.100]) by mx.google.com with ESMTP id t3si6495845wme.171.2017.10.09.05.17.22; Mon, 09 Oct 2017 05:17:23 -0700 (PDT) Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Authentication-Results: mx.google.com; spf=pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) smtp.mailfrom=ffmpeg-devel-bounces@ffmpeg.org Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 704E86883D4; Mon, 9 Oct 2017 15:17:20 +0300 (EEST) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mailapp01.imgtec.com (mailapp01.imgtec.com [195.59.15.196]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id CD162680774 for ; Mon, 9 Oct 2017 15:17:13 +0300 (EEST) Received: from hhmail02.hh.imgtec.org (unknown [10.100.10.20]) by Forcepoint Email with ESMTPS id E0080ACABC38F for ; Mon, 9 Oct 2017 13:17:10 +0100 (IST) Received: from pudesk204.pu.imgtec.org (192.168.91.13) by hhmail02.hh.imgtec.org (10.100.10.20) with Microsoft SMTP Server (TLS) id 14.3.361.1; Mon, 9 Oct 2017 13:17:13 +0100 From: To: Date: Mon, 9 Oct 2017 17:48:45 +0530 Message-ID: <1507551525-9406-1-git-send-email-kaustubh.raste@imgtec.com> X-Mailer: git-send-email 1.7.9.5 MIME-Version: 1.0 X-Originating-IP: [192.168.91.13] Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc uni copy mc msa functions X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Kaustubh Raste Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" From: Kaustubh Raste Load the specific bytes instead of MSA load. Signed-off-by: Kaustubh Raste --- libavcodec/mips/hevc_mc_uni_msa.c | 245 +++++++++++++++---------------------- 1 file changed, 100 insertions(+), 145 deletions(-) diff --git a/libavcodec/mips/hevc_mc_uni_msa.c b/libavcodec/mips/hevc_mc_uni_msa.c index cf22e7f..eead591 100644 --- a/libavcodec/mips/hevc_mc_uni_msa.c +++ b/libavcodec/mips/hevc_mc_uni_msa.c @@ -28,83 +28,39 @@ static void copy_width8_msa(uint8_t *src, int32_t src_stride, { int32_t cnt; uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - if (0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB8(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - out0 = __msa_copy_u_d((v2i64) src0, 0); - out1 = __msa_copy_u_d((v2i64) src1, 0); - out2 = __msa_copy_u_d((v2i64) src2, 0); - out3 = __msa_copy_u_d((v2i64) src3, 0); - out4 = __msa_copy_u_d((v2i64) src4, 0); - out5 = __msa_copy_u_d((v2i64) src5, 0); - out6 = __msa_copy_u_d((v2i64) src6, 0); - out7 = __msa_copy_u_d((v2i64) src7, 0); - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out4, out5, out6, out7, dst, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); + if (2 == height) { + LD2(src, src_stride, out0, out1); + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + } else if (6 == height) { + LD4(src, src_stride, out0, out1, out2, out3); + src += (4 * src_stride); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + LD2(src, src_stride, out0, out1); + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + } else if (0 == (height % 8)) { + for (cnt = (height >> 3); cnt--;) { + LD4(src, src_stride, out0, out1, out2, out3); + src += (4 * src_stride); + LD4(src, src_stride, out4, out5, out6, out7); src += (4 * src_stride); - - out0 = __msa_copy_u_d((v2i64) src0, 0); - out1 = __msa_copy_u_d((v2i64) src1, 0); - out2 = __msa_copy_u_d((v2i64) src2, 0); - out3 = __msa_copy_u_d((v2i64) src3, 0); - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 8) { - for (cnt = height >> 3; cnt--;) { - LD_UB8(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - out0 = __msa_copy_u_d((v2i64) src0, 0); - out1 = __msa_copy_u_d((v2i64) src1, 0); - out2 = __msa_copy_u_d((v2i64) src2, 0); - out3 = __msa_copy_u_d((v2i64) src3, 0); - out4 = __msa_copy_u_d((v2i64) src4, 0); - out5 = __msa_copy_u_d((v2i64) src5, 0); - out6 = __msa_copy_u_d((v2i64) src6, 0); - out7 = __msa_copy_u_d((v2i64) src7, 0); - SD4(out0, out1, out2, out3, dst, dst_stride); dst += (4 * dst_stride); SD4(out4, out5, out6, out7, dst, dst_stride); dst += (4 * dst_stride); } - } else if (0 == height % 4) { - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); + } else if (0 == (height % 4)) { + for (cnt = (height >> 2); cnt--;) { + LD4(src, src_stride, out0, out1, out2, out3); src += (4 * src_stride); - out0 = __msa_copy_u_d((v2i64) src0, 0); - out1 = __msa_copy_u_d((v2i64) src1, 0); - out2 = __msa_copy_u_d((v2i64) src2, 0); - out3 = __msa_copy_u_d((v2i64) src3, 0); - SD4(out0, out1, out2, out3, dst, dst_stride); dst += (4 * dst_stride); } - } else if (0 == height % 2) { - for (cnt = (height / 2); cnt--;) { - LD_UB2(src, src_stride, src0, src1); - src += (2 * src_stride); - out0 = __msa_copy_u_d((v2i64) src0, 0); - out1 = __msa_copy_u_d((v2i64) src1, 0); - - SD(out0, dst); - dst += dst_stride; - SD(out1, dst); - dst += dst_stride; - } } } @@ -122,33 +78,6 @@ static void copy_width12_msa(uint8_t *src, int32_t src_stride, ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); } -static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int32_t height, int32_t width) -{ - int32_t cnt, loop_cnt; - uint8_t *src_tmp, *dst_tmp; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - for (cnt = (width >> 4); cnt--;) { - src_tmp = src; - dst_tmp = dst; - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src_tmp, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); - src_tmp += (8 * src_stride); - - ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, - dst_tmp, dst_stride); - dst_tmp += (8 * dst_stride); - } - - src += 16; - dst += 16; - } -} - static void copy_width16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) @@ -156,23 +85,25 @@ static void copy_width16_msa(uint8_t *src, int32_t src_stride, int32_t cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - if (0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB8(src, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + if (12 == height) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } else if (0 == (height % 8)) { + for (cnt = (height >> 3); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); src += (8 * src_stride); - ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, - dst, dst_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, + dst_stride); dst += (8 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - dst += (4 * dst_stride); } - } else if (0 == height % 8) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); - } else if (0 == height % 4) { + } else if (0 == (height % 4)) { for (cnt = (height >> 2); cnt--;) { LD_UB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); @@ -187,8 +118,23 @@ static void copy_width24_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); - copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height); + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + for (cnt = 4; cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD4(src + 16, src_stride, out0, out1, out2, out3); + src += (4 * src_stride); + LD4(src + 16, src_stride, out4, out5, out6, out7); + src += (4 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + SD4(out0, out1, out2, out3, dst + 16, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } } static void copy_width32_msa(uint8_t *src, int32_t src_stride, @@ -198,40 +144,13 @@ static void copy_width32_msa(uint8_t *src, int32_t src_stride, int32_t cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - if (0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 8) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); - } else if (0 == height % 4) { - for (cnt = (height >> 2); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - } + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); } } @@ -239,14 +158,50 @@ static void copy_width48_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48); + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11; + + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + LD_UB4(src + 32, src_stride, src8, src9, src10, src11); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride); + dst += (4 * dst_stride); + } } static void copy_width64_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(src, 16, src4, src5, src6, src7); + src += src_stride; + LD_UB4(src, 16, src8, src9, src10, src11); + src += src_stride; + LD_UB4(src, 16, src12, src13, src14, src15); + src += src_stride; + + ST_UB4(src0, src1, src2, src3, dst, 16); + dst += dst_stride; + ST_UB4(src4, src5, src6, src7, dst, 16); + dst += dst_stride; + ST_UB4(src8, src9, src10, src11, dst, 16); + dst += dst_stride; + ST_UB4(src12, src13, src14, src15, dst, 16); + dst += dst_stride; + } } static const uint8_t mc_filt_mask_arr[16 * 3] = {