diff mbox

[FFmpeg-devel,8/9] avcodec/mips: [loongson] reoptimize put&add pixels with mmi v2.

Message ID 1531399526-15262-1-git-send-email-yinshiyou-hf@loongson.cn
State Superseded
Headers show

Commit Message

Shiyou Yin July 12, 2018, 12:45 p.m. UTC
Reoptimized the following functions with MMI:
1. ff_put_pixels4_8_mmi.
2. ff_put_pixels8_8_mmi.
3. ff_put_pixels16_8_mmi.
4. ff_add_pixels_clamped_mmi.

This optimization improves MPEG-4 decoding performance by about 6 fps (from 158 fps to 164 fps, tested on a Loongson 3A3000).

Change-Id: I6baff96587c5441cd8b74c9362dc01ca1d20c1a0
Signed-off-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
---
 libavcodec/mips/hpeldsp_mmi.c | 106 +++++++++++++++++++++---------------------
 libavcodec/mips/idctdsp_mmi.c |  62 +++++++++++-------------
 2 files changed, 80 insertions(+), 88 deletions(-)
diff mbox

Patch

diff --git a/libavcodec/mips/hpeldsp_mmi.c b/libavcodec/mips/hpeldsp_mmi.c
index 1bba70e..a0597c1 100644
--- a/libavcodec/mips/hpeldsp_mmi.c
+++ b/libavcodec/mips/hpeldsp_mmi.c
@@ -29,36 +29,35 @@ 
 void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
     ptrdiff_t line_size, int h)
 {
-    double ftmp[2];
-    mips_reg addr[2];
+    double ftmp[4];
     DECLARE_VAR_LOW32;
-    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
-        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
-        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
         MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
-        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
-        MMI_SWC1(%[ftmp0], %[block], 0x00)
-        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULWC1(%[ftmp1], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULWC1(%[ftmp2], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULWC1(%[ftmp3], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
 
-        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
-        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
         MMI_SWC1(%[ftmp0], %[block], 0x00)
-        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SWC1(%[ftmp1], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SWC1(%[ftmp2], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SWC1(%[ftmp3], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
 
-        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           RESTRICT_ASM_LOW32
-          RESTRICT_ASM_ADDRT
-          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
         : [line_size]"r"((mips_reg)line_size)
@@ -69,37 +68,35 @@  void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
 void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
     ptrdiff_t line_size, int h)
 {
-    double ftmp[2];
-    mips_reg addr[3];
+    double ftmp[4];
     DECLARE_VAR_ALL64;
 
     __asm__ volatile (
-        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
         MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
-        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
-        MMI_SDC1(%[ftmp0], %[block], 0x00)
-        PTR_ADDU   "%[addr2],   %[block],       %[line_size]            \n\t"
-        MMI_SDC1(%[ftmp1], %[addr2], 0x00)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp2], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1(%[ftmp3], %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
 
-        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
-        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
         MMI_SDC1(%[ftmp0], %[block], 0x00)
-        PTR_ADDU   "%[addr2],   %[block],       %[line_size]            \n\t"
-        MMI_SDC1(%[ftmp1], %[addr2], 0x00)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp1], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp2], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
 
-        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           RESTRICT_ASM_ALL64
-          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-          [addr2]"=&r"(addr[2]),
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
         : [line_size]"r"((mips_reg)line_size)
@@ -110,7 +107,7 @@  void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
 void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
     ptrdiff_t line_size, int h)
 {
-    double ftmp[4];
+    double ftmp[8];
     DECLARE_VAR_ALL64;
 
     __asm__ volatile (
@@ -120,31 +117,34 @@  void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
         MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
         MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
-        MMI_SDC1(%[ftmp0], %[block], 0x00)
-        MMI_SDC1(%[ftmp2], %[block], 0x08)
-        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
-        MMI_SDC1(%[ftmp1], %[block], 0x00)
-        MMI_SDC1(%[ftmp3], %[block], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
-
-        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
-        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
+        MMI_ULDC1(%[ftmp4], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp6], %[pixels], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
-        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
-        MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
+        MMI_ULDC1(%[ftmp5], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp7], %[pixels], 0x08)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                  \n\t"
+
         MMI_SDC1(%[ftmp0], %[block], 0x00)
         MMI_SDC1(%[ftmp2], %[block], 0x08)
         PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
         MMI_SDC1(%[ftmp1], %[block], 0x00)
         MMI_SDC1(%[ftmp3], %[block], 0x08)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+        MMI_SDC1(%[ftmp4], %[block], 0x00)
+        MMI_SDC1(%[ftmp6], %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+        MMI_SDC1(%[ftmp5], %[block], 0x00)
+        MMI_SDC1(%[ftmp7], %[block], 0x08)
         PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
 
-        PTR_ADDI   "%[h],       %[h],           -0x04                  \n\t"
         "bnez       %[h],       1b                                     \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           RESTRICT_ASM_ALL64
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
diff --git a/libavcodec/mips/idctdsp_mmi.c b/libavcodec/mips/idctdsp_mmi.c
index b797965..a022e18 100644
--- a/libavcodec/mips/idctdsp_mmi.c
+++ b/libavcodec/mips/idctdsp_mmi.c
@@ -174,49 +174,41 @@  void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
 void ff_add_pixels_clamped_mmi(const int16_t *block,
         uint8_t *av_restrict pixels, ptrdiff_t line_size)
 {
-    double ftmp[8];
+    double ftmp[9];
     uint64_t tmp[1];
-    mips_reg addr[1];
-    DECLARE_VAR_ALL64;
-    DECLARE_VAR_ADDRT;
-
     __asm__ volatile (
-        "li         %[tmp0],    0x04                                    \n\t"
-        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "1:                                                             \n\t"
-        MMI_LDC1(%[ftmp1], %[block], 0x00)
-        MMI_LDC1(%[ftmp2], %[block], 0x08)
-        MMI_LDC1(%[ftmp3], %[block], 0x10)
-        MMI_LDC1(%[ftmp4], %[block], 0x18)
+        "li         %[tmp0],    0x04                           \n\t"
+        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]           \n\t"
+        "1:                                                    \n\t"
+
+        "gslqc1     %[ftmp2],   %[ftmp1],   0x00(%[block])     \n\t"
+        "gslqc1     %[ftmp4],   %[ftmp3],   0x10(%[block])     \n\t"
+        PTR_ADDIU  "%[block],   %[block],   0x20               \n\t"
         MMI_LDC1(%[ftmp5], %[pixels], 0x00)
-        MMI_LDXC1(%[ftmp6], %[pixels], %[line_size], 0x00)
-        "mov.d      %[ftmp7],   %[ftmp5]                                \n\t"
-        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-        "punpckhbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
-        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
-        "mov.d      %[ftmp7],   %[ftmp6]                                \n\t"
-        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
-        "punpckhbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
-        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
-        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
-        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        MMI_LDC1(%[ftmp6], %[pixels], 0x00)
+        PTR_SUBU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp5],   %[ftmp0]           \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]           \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp6],   %[ftmp0]           \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]           \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp5]           \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp7]           \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp6]           \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp8]           \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp2]           \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp4]           \n\t"
         MMI_SDC1(%[ftmp1], %[pixels], 0x00)
-        MMI_SDXC1(%[ftmp3], %[pixels], %[line_size], 0x00)
-        "addi       %[tmp0],    %[tmp0],        -0x01                   \n\t"
-        PTR_ADDIU  "%[block],   %[block],       0x20                    \n\t"
-        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
-        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
-        "bnez       %[tmp0],    1b"
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        MMI_SDC1(%[ftmp3], %[pixels], 0x00)
+        "addi       %[tmp0],    %[tmp0],    -0x01              \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        "bnez       %[tmp0],    1b                             \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
-          [tmp0]"=&r"(tmp[0]),
-          RESTRICT_ASM_ALL64
-          RESTRICT_ASM_ADDRT
-          [addr0]"=&r"(addr[0]),
+          [ftmp8]"=&f"(ftmp[8]),            [tmp0]"=&r"(tmp[0]),
           [pixels]"+&r"(pixels),            [block]"+&r"(block)
         : [line_size]"r"((mips_reg)line_size)
         : "memory"