diff mbox

[FFmpeg-devel,8/9] avcodec/mips: [loongson] reoptimize put&add pixels with mmi v2.

Message ID 1531302380-16733-1-git-send-email-yinshiyou-hf@loongson.cn
State Superseded
Headers show

Commit Message

Shiyou Yin July 11, 2018, 9:46 a.m. UTC
Reoptimize the following functions with MMI:
1. ff_put_pixels4_8_mmi.
2. ff_put_pixels8_8_mmi.
3. ff_put_pixels16_8_mmi.
4. ff_add_pixels_clamped_mmi.

Change-Id: I80be1891f52942e432a72e96c135bc9ead92972f
Signed-off-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
---
 libavcodec/mips/hpeldsp_mmi.c | 133 +++++++++++++++++++-----------------------
 libavcodec/mips/idctdsp_mmi.c |  71 +++++++++-------------
 2 files changed, 89 insertions(+), 115 deletions(-)

Comments

Carl Eugen Hoyos July 11, 2018, 12:09 p.m. UTC | #1
2018-07-11 11:46 GMT+02:00, Shiyou Yin <yinshiyou-hf@loongson.cn>:
> Reoptimized following functions with mmi:
> 1. ff_put_pixels4_8_mmi.
> 2. ff_put_pixels8_8_mmi.
> 3. ff_put_pixels16_8_mmi.
> 4. ff_add_pixels_clamped_mmi.

Same for this and some other patches:
Please explain why the change was done.

Carl Eugen
diff mbox

Patch

diff --git a/libavcodec/mips/hpeldsp_mmi.c b/libavcodec/mips/hpeldsp_mmi.c
index 1bba70e..3272367 100644
--- a/libavcodec/mips/hpeldsp_mmi.c
+++ b/libavcodec/mips/hpeldsp_mmi.c
@@ -29,36 +29,32 @@ 
 void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
     ptrdiff_t line_size, int h)
 {
-    double ftmp[2];
-    mips_reg addr[2];
     DECLARE_VAR_LOW32;
-    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
-        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
-        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
-        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
-        MMI_SWC1(%[ftmp0], %[block], 0x00)
-        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
-
-        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
-        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
-        MMI_SWC1(%[ftmp0], %[block], 0x00)
-        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        MMI_ULWC1($f0, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULWC1($f1, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULWC1($f2, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULWC1($f3, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
 
         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+
+        MMI_SWC1($f0, %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SWC1($f1, %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SWC1($f2, %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SWC1($f3, %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+
         "bnez       %[h],       1b                                      \n\t"
-        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
-          RESTRICT_ASM_LOW32
-          RESTRICT_ASM_ADDRT
-          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+        : RESTRICT_ASM_LOW32
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
         : [line_size]"r"((mips_reg)line_size)
@@ -69,37 +65,32 @@  void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
 void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
     ptrdiff_t line_size, int h)
 {
-    double ftmp[2];
-    mips_reg addr[3];
     DECLARE_VAR_ALL64;
 
     __asm__ volatile (
-        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
-        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
-        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
-        MMI_SDC1(%[ftmp0], %[block], 0x00)
-        PTR_ADDU   "%[addr2],   %[block],       %[line_size]            \n\t"
-        MMI_SDC1(%[ftmp1], %[addr2], 0x00)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
-
-        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
-        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
-        MMI_SDC1(%[ftmp0], %[block], 0x00)
-        PTR_ADDU   "%[addr2],   %[block],       %[line_size]            \n\t"
-        MMI_SDC1(%[ftmp1], %[addr2], 0x00)
-        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+        MMI_ULDC1($f0, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1($f1, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1($f2, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
+        MMI_ULDC1($f3, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],   %[pixels],      %[line_size]           \n\t"
 
         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
+
+        MMI_SDC1($f0, %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1($f1, %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1($f2, %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1($f3, %[block], 0x00)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
+
         "bnez       %[h],       1b                                      \n\t"
-        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
-          RESTRICT_ASM_ALL64
-          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-          [addr2]"=&r"(addr[2]),
+        : RESTRICT_ASM_ALL64
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
         : [line_size]"r"((mips_reg)line_size)
@@ -110,42 +101,40 @@  void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
 void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
     ptrdiff_t line_size, int h)
 {
-    double ftmp[4];
     DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "1:                                                            \n\t"
-        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
-        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
+        MMI_ULDC1($f0, %[pixels], 0x00)
+        MMI_ULDC1($f2, %[pixels], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
-        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
-        MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
-        MMI_SDC1(%[ftmp0], %[block], 0x00)
-        MMI_SDC1(%[ftmp2], %[block], 0x08)
-        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
-        MMI_SDC1(%[ftmp1], %[block], 0x00)
-        MMI_SDC1(%[ftmp3], %[block], 0x08)
+        MMI_ULDC1($f1, %[pixels], 0x00)
+        MMI_ULDC1($f3, %[pixels], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
-
-        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
-        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
+        MMI_ULDC1($f4, %[pixels], 0x00)
+        MMI_ULDC1($f6, %[pixels], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
-        MMI_ULDC1(%[ftmp1], %[pixels], 0x00)
-        MMI_ULDC1(%[ftmp3], %[pixels], 0x08)
-        MMI_SDC1(%[ftmp0], %[block], 0x00)
-        MMI_SDC1(%[ftmp2], %[block], 0x08)
-        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
-        MMI_SDC1(%[ftmp1], %[block], 0x00)
-        MMI_SDC1(%[ftmp3], %[block], 0x08)
+        MMI_ULDC1($f5, %[pixels], 0x00)
+        MMI_ULDC1($f7, %[pixels], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]           \n\t"
-        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
 
         PTR_ADDI   "%[h],       %[h],           -0x04                  \n\t"
+
+        MMI_SDC1($f0, %[block], 0x00)
+        MMI_SDC1($f2, %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+        MMI_SDC1($f1, %[block], 0x00)
+        MMI_SDC1($f3, %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+        MMI_SDC1($f4, %[block], 0x00)
+        MMI_SDC1($f6, %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+        MMI_SDC1($f5, %[block], 0x00)
+        MMI_SDC1($f7, %[block], 0x08)
+        PTR_ADDU   "%[block],   %[block],       %[line_size]           \n\t"
+
         "bnez       %[h],       1b                                     \n\t"
-        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
-          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
-          RESTRICT_ASM_ALL64
+        : RESTRICT_ASM_ALL64
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
         : [line_size]"r"((mips_reg)line_size)
diff --git a/libavcodec/mips/idctdsp_mmi.c b/libavcodec/mips/idctdsp_mmi.c
index b797965..93d5fc6 100644
--- a/libavcodec/mips/idctdsp_mmi.c
+++ b/libavcodec/mips/idctdsp_mmi.c
@@ -174,50 +174,35 @@  void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
 void ff_add_pixels_clamped_mmi(const int16_t *block,
         uint8_t *av_restrict pixels, ptrdiff_t line_size)
 {
-    double ftmp[8];
-    uint64_t tmp[1];
-    mips_reg addr[1];
-    DECLARE_VAR_ALL64;
-    DECLARE_VAR_ADDRT;
-
     __asm__ volatile (
-        "li         %[tmp0],    0x04                                    \n\t"
-        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "1:                                                             \n\t"
-        MMI_LDC1(%[ftmp1], %[block], 0x00)
-        MMI_LDC1(%[ftmp2], %[block], 0x08)
-        MMI_LDC1(%[ftmp3], %[block], 0x10)
-        MMI_LDC1(%[ftmp4], %[block], 0x18)
-        MMI_LDC1(%[ftmp5], %[pixels], 0x00)
-        MMI_LDXC1(%[ftmp6], %[pixels], %[line_size], 0x00)
-        "mov.d      %[ftmp7],   %[ftmp5]                                \n\t"
-        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-        "punpckhbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
-        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
-        "mov.d      %[ftmp7],   %[ftmp6]                                \n\t"
-        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
-        "punpckhbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
-        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
-        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
-        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
-        MMI_SDC1(%[ftmp1], %[pixels], 0x00)
-        MMI_SDXC1(%[ftmp3], %[pixels], %[line_size], 0x00)
-        "addi       %[tmp0],    %[tmp0],        -0x01                   \n\t"
-        PTR_ADDIU  "%[block],   %[block],       0x20                    \n\t"
-        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
-        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
-        "bnez       %[tmp0],    1b"
-        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
-          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
-          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
-          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
-          [tmp0]"=&r"(tmp[0]),
-          RESTRICT_ASM_ALL64
-          RESTRICT_ASM_ADDRT
-          [addr0]"=&r"(addr[0]),
-          [pixels]"+&r"(pixels),            [block]"+&r"(block)
+        "li         $10,        0x04                           \n\t"
+        "xor        $f0,        $f0,        $f0                \n\t"
+        "1:                                                    \n\t"
+
+        "gslqc1     $f2,        $f1,        0x00(%[block])     \n\t"
+        "gslqc1     $f4,        $f3,        0x10(%[block])     \n\t"
+        PTR_ADDIU  "%[block],   %[block],   0x20               \n\t"
+        MMI_LDC1($f5, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        MMI_LDC1($f6, %[pixels], 0x00)
+        PTR_SUBU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        "punpckhbh  $f7,        $f5,        $f0                \n\t"
+        "punpcklbh  $f5,        $f5,        $f0                \n\t"
+        "punpckhbh  $f8,        $f6,        $f0                \n\t"
+        "punpcklbh  $f6,        $f6,        $f0                \n\t"
+        "paddh      $f1,        $f1,        $f5                \n\t"
+        "paddh      $f2,        $f2,        $f7                \n\t"
+        "paddh      $f3,        $f3,        $f6                \n\t"
+        "paddh      $f4,        $f4,        $f8                \n\t"
+        "packushb   $f1,        $f1,        $f2                \n\t"
+        "packushb   $f3,        $f3,        $f4                \n\t"
+        MMI_SDC1($f1, %[pixels], 0x00)
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        MMI_SDC1($f3, %[pixels], 0x00)
+        "addi       $10,        $10,        -0x01              \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],  %[line_size]       \n\t"
+        "bnez       $10,        1b                             \n\t"
+        : [pixels]"+&r"(pixels),            [block]"+&r"(block)
         : [line_size]"r"((mips_reg)line_size)
         : "memory"
     );