diff mbox

[FFmpeg-devel,4/9] avcodec/mips: [loongson] optimize memset in h264dsp.

Message ID 1531399490-14682-1-git-send-email-yinshiyou-hf@loongson.cn
State Superseded
Headers show

Commit Message

Shiyou Yin July 12, 2018, 12:44 p.m. UTC
Optimized memset with MMI in the following functions:
1. ff_h264_add_pixels4_8_mmi.
2. ff_h264_idct_add_8_mmi.
3. ff_h264_idct8_add_8_mmi.

This optimization improves H.264 decoding performance by about 1 fps (tested on Loongson 3A3000).

Change-Id: I4e8b75510e6a34b4c80f84ad784b00377570c4ec
Signed-off-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
---
 libavcodec/mips/h264dsp_mmi.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)
diff mbox

Patch

diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
index ac6fa99..8cc632c 100644
--- a/libavcodec/mips/h264dsp_mmi.c
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -59,6 +59,17 @@  void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
+
+        /* memset(src, 0, 32); */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        "gssdlc1    %[ftmp0],   0x0f(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x08(%[src])                            \n\t"
+        "gssdlc1    %[ftmp0],   0x17(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x10(%[src])                            \n\t"
+        "gssdlc1    %[ftmp0],   0x1f(%[src])                            \n\t"
+        "gssdrc1    %[ftmp0],   0x18(%[src])                            \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -72,7 +83,6 @@  void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         : "memory"
     );
 
-    memset(src, 0, 32);
 }
 
 void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
@@ -152,6 +162,17 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         MMI_SWC1(%[ftmp2], %[dst], 0x00)
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
         MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+
+        /* memset(block, 0, 32) */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdlc1    %[ftmp0],   0x0f(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x08(%[block])                          \n\t"
+        "gssdlc1    %[ftmp0],   0x17(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x10(%[block])                          \n\t"
+        "gssdlc1    %[ftmp0],   0x1f(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x18(%[block])                          \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -167,7 +188,6 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         : "memory"
     );
 
-    memset(block, 0, 32);
 }
 
 void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
@@ -617,6 +637,17 @@  void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         MMI_SWC1(%[ftmp6], %[addr0], 0x00)
         MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
         PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
+
+        /* memset(block, 0, 32) */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdlc1    %[ftmp0],   0x0f(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x08(%[block])                          \n\t"
+        "gssdlc1    %[ftmp0],   0x17(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x10(%[block])                          \n\t"
+        "gssdlc1    %[ftmp0],   0x1f(%[block])                          \n\t"
+        "gssdrc1    %[ftmp0],   0x18(%[block])                          \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -638,7 +669,6 @@  void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         : "$29","memory"
     );
 
-    memset(block, 0, 128);
 }
 
 void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)