From patchwork Thu Dec 27 06:16:22 2018
X-Patchwork-Submitter: guxiwei
X-Patchwork-Id: 11561
From: gxw
To: ffmpeg-devel@ffmpeg.org
Cc: gxw
Date: Thu, 27 Dec 2018 14:16:22 +0800
Message-Id: <1545891382-24809-1-git-send-email-guxiwei-hf@loongson.cn>
Subject: [FFmpeg-devel] [PATCH] avcodec/mips: [loongson] optimize theora decoding in vp3dsp.

Optimize Theora decoding with MSA in the following functions:
1. ff_vp3_idct_add_msa
2. ff_vp3_idct_put_msa
3. ff_vp3_idct_dc_add_msa
4. ff_vp3_v_loop_filter_msa
5. ff_vp3_h_loop_filter_msa
6. ff_put_no_rnd_pixels_l2_msa

Theora decoding speed improved by about 36% (from 22 fps to 30 fps, tested on a Loongson 2K1000).
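Reviewer note (not part of the commit message): the 32-bit constants in idct_msa() below -- 64277, 60547, 54491, 46341, 36410, 25080 and 12785 -- match the usual Theora 16.16 fixed-point cosine table, i.e. round(cos(k*pi/16) * 65536) for k = 1..7. A small stand-alone C sketch that reproduces them; nothing below is part of the patch and all names are illustrative:

/* Stand-alone sketch: reproduce the 16.16 fixed-point cosine constants
 * used by idct_msa(). Illustration only, not part of the patch. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    /* values as they appear in vp3dsp_idct_msa.c, for k = 1..7 */
    static const int table[7] = {
        64277, 60547, 54491, 46341, 36410, 25080, 12785
    };
    const double pi = 3.14159265358979323846;

    for (int k = 1; k <= 7; k++) {
        int c = (int) lrint(cos(k * pi / 16.0) * 65536.0);
        printf("round(cos(%d*pi/16) * 65536) = %5d, table value = %5d\n",
               k, c, table[k - 1]);
    }
    return 0;
}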
--- libavcodec/mips/Makefile | 2 + libavcodec/mips/vp3dsp_idct_msa.c | 662 +++++++++++++++++++++++++++++++++++++ libavcodec/mips/vp3dsp_init_mips.c | 46 +++ libavcodec/mips/vp3dsp_mips.h | 37 +++ libavcodec/vp3dsp.c | 2 + libavcodec/vp3dsp.h | 1 + 6 files changed, 750 insertions(+) create mode 100644 libavcodec/mips/vp3dsp_idct_msa.c create mode 100644 libavcodec/mips/vp3dsp_init_mips.c create mode 100644 libavcodec/mips/vp3dsp_mips.h diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index 1f659a0..3571207 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -22,6 +22,7 @@ OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_init_mips.o \ mips/hevcpred_init_mips.o OBJS-$(CONFIG_VP9_DECODER) += mips/vp9dsp_init_mips.o OBJS-$(CONFIG_VP8_DECODER) += mips/vp8dsp_init_mips.o +OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_init_mips.o OBJS-$(CONFIG_H264DSP) += mips/h264dsp_init_mips.o OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_init_mips.o OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_init_mips.o @@ -54,6 +55,7 @@ MSA-OBJS-$(CONFIG_VP9_DECODER) += mips/vp9_mc_msa.o \ MSA-OBJS-$(CONFIG_VP8_DECODER) += mips/vp8_mc_msa.o \ mips/vp8_idct_msa.o \ mips/vp8_lpf_msa.o +MSA-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_msa.o MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o \ mips/h264idct_msa.o MSA-OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_msa.o diff --git a/libavcodec/mips/vp3dsp_idct_msa.c b/libavcodec/mips/vp3dsp_idct_msa.c new file mode 100644 index 0000000..5427ac5 --- /dev/null +++ b/libavcodec/mips/vp3dsp_idct_msa.c @@ -0,0 +1,662 @@ +/* + * Copyright (c) 2018 gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vp3dsp_mips.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "libavutil/intreadwrite.h" +#include "libavcodec/rnd_avg.h" + +static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) +{ + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign; + v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l, + r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l; + v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H; + v4i32 Ed, Gd, Add, Bdd, Fd, Hd; + v16u8 sign_l; + v16i8 d0, d1, d2, d3, d4, d5, d6, d7; + v4i32 c0, c1, c2, c3, c4, c5, c6, c7; + v4i32 f0, f1, f2, f3, f4, f5, f6, f7; + v4i32 sign_t; + v16i8 zero = {0}; + v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; + v4i32 cnst64277w = {64277, 64277, 64277, 64277}; + v4i32 cnst60547w = {60547, 60547, 60547, 60547}; + v4i32 cnst54491w = {54491, 54491, 54491, 54491}; + v4i32 cnst46341w = {46341, 46341, 46341, 46341}; + v4i32 cnst36410w = {36410, 36410, 36410, 36410}; + v4i32 cnst25080w = {25080, 25080, 25080, 25080}; + v4i32 cnst12785w = {12785, 12785, 12785, 12785}; + v4i32 cnst8w = {8, 8, 8, 8}; + v4i32 cnst2048w = {2048, 2048, 2048, 2048}; + v4i32 cnst128w = {128, 128, 128, 128}; + int nstride = stride; + + /* Extended input data */ + LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7); + sign = __msa_clti_s_h(r0, 0); + r0_r = (v4i32) __msa_ilvr_h(sign, r0); + r0_l = (v4i32) __msa_ilvl_h(sign, r0); + sign = __msa_clti_s_h(r1, 0); + r1_r = (v4i32) __msa_ilvr_h(sign, r1); + r1_l = (v4i32) __msa_ilvl_h(sign, r1); + sign = __msa_clti_s_h(r2, 0); + r2_r = (v4i32) __msa_ilvr_h(sign, r2); + r2_l = (v4i32) __msa_ilvl_h(sign, r2); + sign = __msa_clti_s_h(r3, 0); + r3_r = (v4i32) __msa_ilvr_h(sign, r3); + r3_l = (v4i32) __msa_ilvl_h(sign, r3); + sign = __msa_clti_s_h(r4, 0); + r4_r = (v4i32) __msa_ilvr_h(sign, r4); + r4_l = (v4i32) __msa_ilvl_h(sign, r4); + sign = __msa_clti_s_h(r5, 0); + r5_r = (v4i32) __msa_ilvr_h(sign, r5); + r5_l = (v4i32) __msa_ilvl_h(sign, r5); + sign = __msa_clti_s_h(r6, 0); + r6_r = (v4i32) __msa_ilvr_h(sign, r6); + r6_l = (v4i32) __msa_ilvl_h(sign, r6); + sign = __msa_clti_s_h(r7, 0); + r7_r = (v4i32) __msa_ilvr_h(sign, r7); + r7_l = (v4i32) __msa_ilvl_h(sign, r7); + + /* Right part */ + A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16); + B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16); + C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16); + D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16); + Ad = ((A - C) * cnst46341w) >> 16; + Bd = ((B - D) * cnst46341w) >> 16; + Cd = A + C; + Dd = B + D; + E = ((r0_r + r4_r) * cnst46341w) >> 16; + F = ((r0_r - r4_r) * cnst46341w) >> 16; + G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16); + H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16); + Ed = E - G; + Gd = E + G; + Add = F + Ad; + Bdd = Bd - H; + Fd = F - Ad; + Hd = Bd + H; + r0_r = Gd + Cd; + r7_r = Gd - Cd; + r1_r = Add + Hd; + r2_r = Add - Hd; + r3_r = Ed + Dd; + r4_r = Ed - Dd; + r5_r = Fd + Bdd; + r6_r = Fd - Bdd; + + /* Left part */ + A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16); + B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16); + C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16); + D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16); + Ad = ((A - C) * 
cnst46341w) >> 16; + Bd = ((B - D) * cnst46341w) >> 16; + Cd = A + C; + Dd = B + D; + E = ((r0_l + r4_l) * cnst46341w) >> 16; + F = ((r0_l - r4_l) * cnst46341w) >> 16; + G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16); + H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16); + Ed = E - G; + Gd = E + G; + Add = F + Ad; + Bdd = Bd - H; + Fd = F - Ad; + Hd = Bd + H; + r0_l = Gd + Cd; + r7_l = Gd - Cd; + r1_l = Add + Hd; + r2_l = Add - Hd; + r3_l = Ed + Dd; + r4_l = Ed - Dd; + r5_l = Fd + Bdd; + r6_l = Fd - Bdd; + + /* Row 0 to 3 */ + TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r, + r0_r, r1_r, r2_r, r3_r); + TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l, + r0_l, r1_l, r2_l, r3_l); + A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16); + B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16); + C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16); + D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16); + Ad = ((A - C) * cnst46341w) >> 16; + Bd = ((B - D) * cnst46341w) >> 16; + Cd = A + C; + Dd = B + D; + E = ((r0_r + r0_l) * cnst46341w) >> 16; + E += cnst8w; + F = ((r0_r - r0_l) * cnst46341w) >> 16; + F += cnst8w; + if (type == 1) { // HACK + E += cnst2048w; + F += cnst2048w; + } + G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16); + H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16); + Ed = E - G; + Gd = E + G; + Add = F + Ad; + Bdd = Bd - H; + Fd = F - Ad; + Hd = Bd + H; + A = (Gd + Cd) >> 4; + B = (Gd - Cd) >> 4; + C = (Add + Hd) >> 4; + D = (Add - Hd) >> 4; + E = (Ed + Dd) >> 4; + F = (Ed - Dd) >> 4; + G = (Fd + Bdd) >> 4; + H = (Fd - Bdd) >> 4; + if (type != 1) { + LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7); + ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3, + f0, f1, f2, f3); + ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7, + f4, f5, f6, f7); + ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3, + c0, c1, c2, c3); + ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7, + c4, c5, c6, c7); + A += c0; + B += c7; + C += c1; + D += c2; + E += c3; + F += c4; + G += c5; + H += c6; + } + A = CLIP_SW_0_255(A); + B = CLIP_SW_0_255(B); + C = CLIP_SW_0_255(C); + D = CLIP_SW_0_255(D); + E = CLIP_SW_0_255(E); + F = CLIP_SW_0_255(F); + G = CLIP_SW_0_255(G); + H = CLIP_SW_0_255(H); + sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r); + sign_l = __msa_or_v(sign_l, (v16u8)r3_r); + sign_l = __msa_or_v(sign_l, (v16u8)r0_l); + sign_l = __msa_or_v(sign_l, (v16u8)r1_l); + sign_l = __msa_or_v(sign_l, (v16u8)r2_l); + sign_l = __msa_or_v(sign_l, (v16u8)r3_l); + sign_t = __msa_ceqi_w((v4i32)sign_l, 0); + Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20; + if (type == 1) { + Bdd = Add + cnst128w; + Bdd = CLIP_SW_0_255(Bdd); + Ad = Bdd; + Bd = Bdd; + Cd = Bdd; + Dd = Bdd; + Ed = Bdd; + Fd = Bdd; + Gd = Bdd; + Hd = Bdd; + } else { + Ad = Add + c0; + Bd = Add + c1; + Cd = Add + c2; + Dd = Add + c3; + Ed = Add + c4; + Fd = Add + c5; + Gd = Add + c6; + Hd = Add + c7; + Ad = CLIP_SW_0_255(Ad); + Bd = CLIP_SW_0_255(Bd); + Cd = CLIP_SW_0_255(Cd); + Dd = CLIP_SW_0_255(Dd); + Ed = CLIP_SW_0_255(Ed); + Fd = CLIP_SW_0_255(Fd); + Gd = CLIP_SW_0_255(Gd); + Hd = CLIP_SW_0_255(Hd); + } + Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); + Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t); + Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t); + Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t); + Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t); + Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t); + Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t); + Hd = 
(v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t); + sign_t = __msa_ceqi_w(sign_t, 0); + A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t); + B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t); + C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t); + D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t); + E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t); + F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t); + G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t); + H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t); + r0_r = Ad + A; + r1_r = Bd + C; + r2_r = Cd + D; + r3_r = Dd + E; + r0_l = Ed + F; + r1_l = Fd + G; + r2_l = Gd + H; + r3_l = Hd + B; + + /* Row 4 to 7 */ + TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r, + r4_r, r5_r, r6_r, r7_r); + TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l, + r4_l, r5_l, r6_l, r7_l); + A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16); + B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16); + C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16); + D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16); + Ad = ((A - C) * cnst46341w) >> 16; + Bd = ((B - D) * cnst46341w) >> 16; + Cd = A + C; + Dd = B + D; + E = ((r4_r + r4_l) * cnst46341w) >> 16; + E += cnst8w; + F = ((r4_r - r4_l) * cnst46341w) >> 16; + F += cnst8w; + if (type == 1) { // HACK + E += cnst2048w; + F += cnst2048w; + } + G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16); + H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16); + Ed = E - G; + Gd = E + G; + Add = F + Ad; + Bdd = Bd - H; + Fd = F - Ad; + Hd = Bd + H; + A = (Gd + Cd) >> 4; + B = (Gd - Cd) >> 4; + C = (Add + Hd) >> 4; + D = (Add - Hd) >> 4; + E = (Ed + Dd) >> 4; + F = (Ed - Dd) >> 4; + G = (Fd + Bdd) >> 4; + H = (Fd - Bdd) >> 4; + if (type != 1) { + ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3, + c0, c1, c2, c3); + ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7, + c4, c5, c6, c7); + A += c0; + B += c7; + C += c1; + D += c2; + E += c3; + F += c4; + G += c5; + H += c6; + } + A = CLIP_SW_0_255(A); + B = CLIP_SW_0_255(B); + C = CLIP_SW_0_255(C); + D = CLIP_SW_0_255(D); + E = CLIP_SW_0_255(E); + F = CLIP_SW_0_255(F); + G = CLIP_SW_0_255(G); + H = CLIP_SW_0_255(H); + sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r); + sign_l = __msa_or_v(sign_l, (v16u8)r7_r); + sign_l = __msa_or_v(sign_l, (v16u8)r4_l); + sign_l = __msa_or_v(sign_l, (v16u8)r5_l); + sign_l = __msa_or_v(sign_l, (v16u8)r6_l); + sign_l = __msa_or_v(sign_l, (v16u8)r7_l); + sign_t = __msa_ceqi_w((v4i32)sign_l, 0); + Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20; + if (type == 1) { + Bdd = Add + cnst128w; + Bdd = CLIP_SW_0_255(Bdd); + Ad = Bdd; + Bd = Bdd; + Cd = Bdd; + Dd = Bdd; + Ed = Bdd; + Fd = Bdd; + Gd = Bdd; + Hd = Bdd; + } else { + Ad = Add + c0; + Bd = Add + c1; + Cd = Add + c2; + Dd = Add + c3; + Ed = Add + c4; + Fd = Add + c5; + Gd = Add + c6; + Hd = Add + c7; + Ad = CLIP_SW_0_255(Ad); + Bd = CLIP_SW_0_255(Bd); + Cd = CLIP_SW_0_255(Cd); + Dd = CLIP_SW_0_255(Dd); + Ed = CLIP_SW_0_255(Ed); + Fd = CLIP_SW_0_255(Fd); + Gd = CLIP_SW_0_255(Gd); + Hd = CLIP_SW_0_255(Hd); + } + Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); + Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t); + Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t); + Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t); + Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t); + Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t); + Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t); + Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t); + sign_t = __msa_ceqi_w(sign_t, 0); + A = 
(v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t); + B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t); + C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t); + D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t); + E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t); + F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t); + G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t); + H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t); + r4_r = Ad + A; + r5_r = Bd + C; + r6_r = Cd + D; + r7_r = Dd + E; + r4_l = Ed + F; + r5_l = Fd + G; + r6_l = Gd + H; + r7_l = Hd + B; + VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1); + VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3); + VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5); + VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7); + + /* Final sequence of operations over-write original dst */ + ST8x1_UB(d0, dst); + ST8x1_UB(d1, dst + nstride); + nstride += stride; + ST8x1_UB(d2, dst + nstride); + nstride += stride; + ST8x1_UB(d3, dst + nstride); + nstride += stride; + ST8x1_UB(d4, dst + nstride); + nstride += stride; + ST8x1_UB(d5, dst + nstride); + nstride += stride; + ST8x1_UB(d6, dst + nstride); + nstride += stride; + ST8x1_UB(d7, dst + nstride); +} + +void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + idct_msa(dest, line_size, block, 1); + memset(block, 0, sizeof(*block) * 64); +} + +void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + idct_msa(dest, line_size, block, 2); + memset(block, 0, sizeof(*block) * 64); +} + +void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + int i = (block[0] + 15) >> 5; + v4i32 dc = {i, i, i, i}; + v16i8 d0, d1, d2, d3, d4, d5, d6, d7; + v4i32 c0, c1, c2, c3, c4, c5, c6, c7; + v4i32 e0, e1, e2, e3, e4, e5, e6, e7; + v4i32 r0, r1, r2, r3, r4, r5, r6, r7; + v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; + v16i8 zero = {0}; + int nstride = line_size; + + LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7); + ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3, + c0, c1, c2, c3); + ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7, + c4, c5, c6, c7); + /* Right part */ + ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3, + e0, e1, e2, e3); + ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7, + e4, e5, e6, e7); + e0 += dc; + e1 += dc; + e2 += dc; + e3 += dc; + e4 += dc; + e5 += dc; + e6 += dc; + e7 += dc; + e0 = CLIP_SW_0_255(e0); + e1 = CLIP_SW_0_255(e1); + e2 = CLIP_SW_0_255(e2); + e3 = CLIP_SW_0_255(e3); + e4 = CLIP_SW_0_255(e4); + e5 = CLIP_SW_0_255(e5); + e6 = CLIP_SW_0_255(e6); + e7 = CLIP_SW_0_255(e7); + + /* Left part */ + ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3, + r0, r1, r2, r3); + ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7, + r4, r5, r6, r7); + r0 += dc; + r1 += dc; + r2 += dc; + r3 += dc; + r4 += dc; + r5 += dc; + r6 += dc; + r7 += dc; + r0 = CLIP_SW_0_255(r0); + r1 = CLIP_SW_0_255(r1); + r2 = CLIP_SW_0_255(r2); + r3 = CLIP_SW_0_255(r3); + r4 = CLIP_SW_0_255(r4); + r5 = CLIP_SW_0_255(r5); + r6 = CLIP_SW_0_255(r6); + r7 = CLIP_SW_0_255(r7); + VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1); + VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3); + VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5); + VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7); + + /* Final sequence of operations over-write original dst */ + ST8x1_UB(d0, dest); + ST8x1_UB(d1, dest + nstride); + nstride += line_size; + ST8x1_UB(d2, dest + nstride); + nstride += line_size; + ST8x1_UB(d3, dest + nstride); + nstride += line_size; + 
ST8x1_UB(d4, dest + nstride); + nstride += line_size; + ST8x1_UB(d5, dest + nstride); + nstride += line_size; + ST8x1_UB(d6, dest + nstride); + nstride += line_size; + ST8x1_UB(d7, dest + nstride); + + block[0] = 0; +} + +void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, + int *bounding_values) +{ + int nstride = -stride; + v4i32 e0, e1, f0, f1, g0, g1; + v16i8 zero = {0}; + v16i8 d0, d1, d2, d3; + v8i16 c0, c1, c2, c3; + v8i16 r0; + v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3}, + cnst4h = {4, 4, 4, 4, 4, 4, 4, 4}; + v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; + int16_t temp_16[8]; + int temp_32[8]; + + LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3); + ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3, + c0, c1, c2, c3); + r0 = (c0 - c3) + (c2 - c1) * cnst3h; + r0 += cnst4h; + r0 = r0 >> 3; + /* Get filter_value from bounding_values one by one */ + ST_SH(r0, temp_16); + for (int i = 0; i < 8; i++) + temp_32[i] = bounding_values[temp_16[i]]; + LD_SW2(temp_32, 4, e0, e1); + ILVR_H2_SW(zero, c1, zero, c2, f0, g0); + ILVL_H2_SW(zero, c1, zero, c2, f1, g1); + f0 += e0; + f1 += e1; + g0 -= e0; + g1 -= e1; + f0 = CLIP_SW_0_255(f0); + f1 = CLIP_SW_0_255(f1); + g0 = CLIP_SW_0_255(g0); + g1 = CLIP_SW_0_255(g1); + VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2); + + /* Final move to first_pixel */ + ST8x1_UB(d1, first_pixel + nstride); + ST8x1_UB(d2, first_pixel); +} + +void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, + int *bounding_values) +{ + v16i8 d0, d1, d2, d3, d4, d5, d6, d7; + v8i16 c0, c1, c2, c3, c4, c5, c6, c7; + v8i16 r0; + v4i32 e0, e1, f0, f1, g0, g1; + v16i8 zero = {0}; + v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3}, + cnst4h = {4, 4, 4, 4, 4, 4, 4, 4}; + v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0}; + int16_t temp_16[8]; + int temp_32[8]; + + LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7); + ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3, + c0, c1, c2, c3); + ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7, + c4, c5, c6, c7); + TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7, + c0, c1, c2, c3, c4, c5, c6, c7); + r0 = (c0 - c3) + (c2 - c1) * cnst3h; + r0 += cnst4h; + r0 = r0 >> 3; + + /* Get filter_value from bounding_values one by one */ + ST_SH(r0, temp_16); + for (int i = 0; i < 8; i++) + temp_32[i] = bounding_values[temp_16[i]]; + LD_SW2(temp_32, 4, e0, e1); + ILVR_H2_SW(zero, c1, zero, c2, f0, g0); + ILVL_H2_SW(zero, c1, zero, c2, f1, g1); + f0 += e0; + f1 += e1; + g0 -= e0; + g1 -= e1; + f0 = CLIP_SW_0_255(f0); + f1 = CLIP_SW_0_255(f1); + g0 = CLIP_SW_0_255(g0); + g1 = CLIP_SW_0_255(g1); + VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2); + /* Final move to first_pixel */ + ST2x4_UB(d1, 0, first_pixel - 1, stride); + ST2x4_UB(d2, 0, first_pixel - 1 + 4 * stride, stride); +} + +void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, ptrdiff_t stride, int h) +{ + if (h == 8) { + v16i8 d0, d1, d2, d3, d4, d5, d6, d7; + v16i8 c0, c1, c2, c3; + v4i32 a0, a1, a2, a3, b0, b1, b2, b3; + v4i32 e0, e1, e2; + v4i32 f0, f1, f2; + v4u32 t0, t1, t2, t3; + v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; + int32_t value = 0xfefefefe; + v4i32 fmask = {value, value, value, value}; + + LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7); + VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1); + VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3); + a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0); + a2 = (v4i32) __msa_pckod_d((v2i64)c1, 
(v2i64)c0); + a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2); + a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2); + + LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7); + VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1); + VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3); + b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0); + b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0); + b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2); + b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2); + + e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0); + e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask); + t0 = ((v4u32)e0) >> 1; + e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0); + t0 = t0 + (v4u32)e2; + + e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1); + e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask); + t1 = ((v4u32)e1) >> 1; + e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1); + t1 = t1 + (v4u32)e2; + + f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2); + f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask); + t2 = ((v4u32)f0) >> 1; + f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2); + t2 = t2 + (v4u32)f2; + + f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3); + f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask); + t3 = ((v4u32)f1) >> 1; + f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3); + t3 = t3 + (v4u32)f2; + + ST4x4_UB(t0, t0, 0, 1, 2, 3, dst, stride); + ST4x4_UB(t1, t1, 0, 1, 2, 3, dst + 4 * stride, stride); + ST4x4_UB(t2, t2, 0, 1, 2, 3, dst + 4, stride); + ST4x4_UB(t3, t3, 0, 1, 2, 3, dst + 4 + 4 * stride, stride); + } else { + int i; + + for (i = 0; i < h; i++) { + uint32_t a, b; + + a = AV_RN32(&src1[i * stride]); + b = AV_RN32(&src2[i * stride]); + AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b)); + a = AV_RN32(&src1[i * stride + 4]); + b = AV_RN32(&src2[i * stride + 4]); + AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b)); + } + } +} diff --git a/libavcodec/mips/vp3dsp_init_mips.c b/libavcodec/mips/vp3dsp_init_mips.c new file mode 100644 index 0000000..d72e8ec --- /dev/null +++ b/libavcodec/mips/vp3dsp_init_mips.c @@ -0,0 +1,46 @@ + +/* + * Copyright (c) 2018 gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/vp3dsp.h" +#include "vp3dsp_mips.h" + +#if HAVE_MSA +static av_cold void vp3dsp_init_msa(VP3DSPContext *c, int flags) +{ + c->put_no_rnd_pixels_l2 = ff_put_no_rnd_pixels_l2_msa; + + c->idct_add = ff_vp3_idct_add_msa; + c->idct_put = ff_vp3_idct_put_msa; + c->idct_dc_add = ff_vp3_idct_dc_add_msa; + c->v_loop_filter = ff_vp3_v_loop_filter_msa; + c->h_loop_filter = ff_vp3_h_loop_filter_msa; +} +#endif /* HAVE_MSA */ + +av_cold void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags) +{ +#if HAVE_MSA + vp3dsp_init_msa(c, flags); +#endif /* HAVE_MSA */ +} diff --git a/libavcodec/mips/vp3dsp_mips.h b/libavcodec/mips/vp3dsp_mips.h new file mode 100644 index 0000000..7750d5c --- /dev/null +++ b/libavcodec/mips/vp3dsp_mips.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2018 gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MIPS_VP3DSP_MIPS_H +#define AVCODEC_MIPS_VP3DSP_MIPS_H + +#include "libavcodec/vp3dsp.h" +#include + +void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, + int *bounding_values); +void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, ptrdiff_t stride, int h); +void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride, + int *bounding_values); + +#endif /* #ifndef AVCODEC_MIPS_VP3DSP_MIPS_H */ diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c index fdaa292..cdf7d64 100644 --- a/libavcodec/vp3dsp.c +++ b/libavcodec/vp3dsp.c @@ -293,4 +293,6 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags) ff_vp3dsp_init_ppc(c, flags); if (ARCH_X86) ff_vp3dsp_init_x86(c, flags); + if (ARCH_MIPS) + ff_vp3dsp_init_mips(c, flags); } diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h index 2fdad16..f5f042d 100644 --- a/libavcodec/vp3dsp.h +++ b/libavcodec/vp3dsp.h @@ -49,5 +49,6 @@ void ff_vp3dsp_init(VP3DSPContext *c, int flags); void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags); void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags); void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags); +void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags); #endif /* AVCODEC_VP3DSP_H */
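Note on ff_put_no_rnd_pixels_l2_msa above: the xor / and-with-0xfefefefe / shift / add sequence in the MSA path is the standard branch-free bytewise average-rounding-down identity, the same one the scalar fallback uses via no_rnd_avg32() from rnd_avg.h. A minimal scalar sketch of the identity (illustration only, not part of the patch; the helper name below is made up):

/* Stand-alone sketch of the bytewise "average rounding down" identity
 * vectorized by ff_put_no_rnd_pixels_l2_msa():
 *     (a & b) + (((a ^ b) & 0xfefefefe) >> 1)
 * The 0xfefefefe mask clears the low bit of each byte before the shift so
 * no carry can leak into the neighbouring byte. Illustration only; the
 * reference helper name is made up. */
#include <stdint.h>
#include <stdio.h>

static uint32_t no_rnd_avg32_ref(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & 0xfefefefeU) >> 1);
}

int main(void)
{
    uint32_t a = 0x10FF7F01, b = 0x11007E03;

    /* each result byte should equal (a_byte + b_byte) >> 1 */
    printf("%08X\n", (unsigned) no_rnd_avg32_ref(a, b)); /* expected 107F7E02 */
    return 0;
}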