
[FFmpeg-devel] avcodec/mips: Improve avc mc copy msa functions

Message ID 70293ACCC3BA6A4E81FFCA024C7A86E1E0591B27@PUMAIL01.pu.imgtec.org
State New

Commit Message

Manojkumar Bhosale Sept. 15, 2017, 12:04 p.m. UTC
LGTM

-----Original Message-----
From: ffmpeg-devel [mailto:ffmpeg-devel-bounces@ffmpeg.org] On Behalf Of kaustubh.raste@imgtec.com

Sent: Friday, September 15, 2017 11:43 AM
To: ffmpeg-devel@ffmpeg.org
Cc: Kaustubh Raste
Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve avc mc copy msa functions

From: Kaustubh Raste <kaustubh.raste@imgtec.com>

Remove the loops and unroll, as the block sizes are known.

Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>

---
 libavcodec/mips/h264qpel_msa.c |   81 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 6 deletions(-)
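
For context, the change is plain loop unrolling: the generic copy_width8_msa()/copy_width16_msa()/avg_width*_msa() helpers take the block height as a runtime parameter and loop over it, while the mc00 entry points always operate on fixed 4x4, 8x8 and 16x16 blocks. A minimal scalar sketch of the same transformation (portable C with hypothetical names, not the MSA intrinsics from the patch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Generic helper: height is a runtime parameter, so every row pays
 * for a loop counter update and a branch. */
static void copy_width8_generic(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                int32_t height)
{
    for (int32_t row = 0; row < height; row++) {
        memcpy(dst, src, 8);
        src += src_stride;
        dst += dst_stride;
    }
}

/* Unrolled equivalent for the fixed 8x8 mc00 case: eight 64-bit rows
 * are loaded, then stored, with no loop overhead. memcpy is used for
 * the 8-byte accesses to stay alignment-safe. */
static void copy_8x8_unrolled(const uint8_t *src, ptrdiff_t stride,
                              uint8_t *dst)
{
    uint64_t r0, r1, r2, r3, r4, r5, r6, r7;

    memcpy(&r0, src + 0 * stride, 8);
    memcpy(&r1, src + 1 * stride, 8);
    memcpy(&r2, src + 2 * stride, 8);
    memcpy(&r3, src + 3 * stride, 8);
    memcpy(&r4, src + 4 * stride, 8);
    memcpy(&r5, src + 5 * stride, 8);
    memcpy(&r6, src + 6 * stride, 8);
    memcpy(&r7, src + 7 * stride, 8);

    memcpy(dst + 0 * stride, &r0, 8);
    memcpy(dst + 1 * stride, &r1, 8);
    memcpy(dst + 2 * stride, &r2, 8);
    memcpy(dst + 3 * stride, &r3, 8);
    memcpy(dst + 4 * stride, &r4, 8);
    memcpy(dst + 5 * stride, &r5, 8);
    memcpy(dst + 6 * stride, &r6, 8);
    memcpy(dst + 7 * stride, &r7, 8);
}

With the row count fixed, all eight loads can be issued before the first store, which is the same load/store grouping the LD4/SD4 and LD_UB8/ST_UB8 pairs in the patch achieve with MSA registers.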


Comments

Michael Niedermayer Sept. 15, 2017, 8:35 p.m. UTC | #1
On Fri, Sep 15, 2017 at 12:04:58PM +0000, Manojkumar Bhosale wrote:
> LGTM

will apply

thx

[...]

Patch

diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c
index 43d21f7..05dffea 100644
--- a/libavcodec/mips/h264qpel_msa.c
+++ b/libavcodec/mips/h264qpel_msa.c
@@ -1,5 +1,5 @@ 
 /*
- * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -2966,31 +2966,100 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
 void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
 {
-    copy_width16_msa(src, stride, dst, stride, 16);
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * stride);
+    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
+    dst += (8 * stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
 }
 
 void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
 {
-    copy_width8_msa(src, stride, dst, stride, 8);
+    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
+
+    LD4(src, stride, src0, src1, src2, src3);
+    src += 4 * stride;
+    LD4(src, stride, src4, src5, src6, src7);
+    SD4(src0, src1, src2, src3, dst, stride);
+    dst += 4 * stride;
+    SD4(src4, src5, src6, src7, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
 {
-    avg_width16_msa(src, stride, dst, stride, 16);
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * stride);
+    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+    dst += (8 * stride);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
 }
 
 void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
 {
-    avg_width8_msa(src, stride, dst, stride, 8);
+    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+
+    LD4(src, stride, tp0, tp1, tp2, tp3);
+    src += 4 * stride;
+    LD4(src, stride, tp4, tp5, tp6, tp7);
+    INSERT_D2_UB(tp0, tp1, src0);
+    INSERT_D2_UB(tp2, tp3, src1);
+    INSERT_D2_UB(tp4, tp5, src2);
+    INSERT_D2_UB(tp6, tp7, src3);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    INSERT_D2_UB(tp4, tp5, dst2);
+    INSERT_D2_UB(tp6, tp7, dst3);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
 {
-    avg_width4_msa(src, stride, dst, stride, 4);
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0 = { 0 }, dst0 = { 0 };
+
+    LW4(src, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+
+    dst0 = __msa_aver_u_b(src0, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
--
1.7.9.5
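
As a side note on the avg paths: AVER_UB4_UB and __msa_aver_u_b compute a rounded unsigned average per byte lane, so the avg mc00 functions blend the new prediction into what is already stored at dst. A scalar sketch of the same operation (hypothetical names; assumes the (a + b + 1) >> 1 rounding of the MSA aver_u.b instruction):

#include <stddef.h>
#include <stdint.h>

/* Scalar analogue of one MSA aver_u.b lane: rounded unsigned average,
 * computed in int to avoid intermediate uint8_t overflow. */
static inline uint8_t avg_round(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);
}

/* Scalar sketch of the 4x4 avg mc00 case: average the source block
 * with the block already stored at dst, row by row. */
static void avg_4x4_scalar(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 4; row++)
        for (int col = 0; col < 4; col++)
            dst[row * stride + col] = avg_round(src[row * stride + col],
                                                dst[row * stride + col]);
}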
