diff mbox

[FFmpeg-devel,v2] avcodec/mips: [loongson] refine process of setting block as 0 in h264dsp_mmi.

Message ID 1564288929-1943-1-git-send-email-yinshiyou-hf@loongson.cn
State Accepted
Headers show

Commit Message

Shiyou Yin July 28, 2019, 4:42 a.m. UTC
In function ff_h264_add_pixels4_8_mmi, there is no need to reset '%[ftmp0]'
to 0, because it's value has never changed since the start of the asm block.
This patch remove the redundant 'xor' and set src to zero once it was loaded.

In function ff_h264_idct_add_8_mmi, 'block' is seted to zero twice.
This patch removed the first setting zero operation and move the second one
after the load operation of block.

In function ff_h264_idct8_add_8_mmi, 'block' is seted to zero twice too.
This patch just removed the second setting zero operation.

This patch mainly simplifies the implementation of functions above,
the effect on the performance of whole h264 decoding process is not obvious.
According to the perf data, proportion of ff_h264_idct_add_8_mmi decreased from
0.29% to 0.26% and ff_h264_idct8_add_8_mmi decreased from 0.62% to 0.59% when decoding
H264 format on loongson 3A3000(For reference only , not very stable.).
---
 libavcodec/mips/h264dsp_mmi.c | 44 +++++++++++++------------------------------
 1 file changed, 13 insertions(+), 31 deletions(-)

Comments

Reimar Döffinger July 28, 2019, 2:07 p.m. UTC | #1
I have no MIPS experience, but for what little it is worth thus: it looks fine to me.

On 28.07.2019, at 06:42, Shiyou Yin <yinshiyou-hf@loongson.cn> wrote:

> In function ff_h264_add_pixels4_8_mmi, there is no need to reset '%[ftmp0]'
> to 0, because it's value has never changed since the start of the asm block.
> This patch remove the redundant 'xor' and set src to zero once it was loaded.
> 
> In function ff_h264_idct_add_8_mmi, 'block' is seted to zero twice.
> This patch removed the first setting zero operation and move the second one
> after the load operation of block.
> 
> In function ff_h264_idct8_add_8_mmi, 'block' is seted to zero twice too.
> This patch just removed the second setting zero operation.
> 
> This patch mainly simplifies the implementation of functions above,
> the effect on the performance of whole h264 decoding process is not obvious.
> According to the perf data, proportion of ff_h264_idct_add_8_mmi decreased from
> 0.29% to 0.26% and ff_h264_idct8_add_8_mmi decreased from 0.62% to 0.59% when decoding
> H264 format on loongson 3A3000(For reference only , not very stable.).
> ---
> libavcodec/mips/h264dsp_mmi.c | 44 +++++++++++++------------------------------
> 1 file changed, 13 insertions(+), 31 deletions(-)
> 
> diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
> index ac65a20..0459711 100644
> --- a/libavcodec/mips/h264dsp_mmi.c
> +++ b/libavcodec/mips/h264dsp_mmi.c
> @@ -38,6 +38,9 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
>         MMI_LDC1(%[ftmp2], %[src], 0x08)
>         MMI_LDC1(%[ftmp3], %[src], 0x10)
>         MMI_LDC1(%[ftmp4], %[src], 0x18)
> +        /* memset(src, 0, 32); */
> +        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[src])            \n\t"
> +        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[src])            \n\t"
>         MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
>         MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
>         MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
> @@ -58,11 +61,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
>         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
>         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
>         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
> -
> -        /* memset(src, 0, 32); */
> -        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[src])            \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[src])            \n\t"
>         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
>           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
>           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
> @@ -85,15 +83,19 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
>     DECLARE_VAR_ADDRT;
> 
>     __asm__ volatile (
> -        "dli        %[tmp0],    0x01                                    \n\t"
>         MMI_LDC1(%[ftmp0], %[block], 0x00)
> -        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
>         MMI_LDC1(%[ftmp1], %[block], 0x08)
> -        "dli        %[tmp0],    0x06                                    \n\t"
>         MMI_LDC1(%[ftmp2], %[block], 0x10)
> +        MMI_LDC1(%[ftmp3], %[block], 0x18)
> +        /* memset(block, 0, 32) */
> +        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
> +        "gssqc1     %[ftmp4],   %[ftmp4],       0x00(%[block])          \n\t"
> +        "gssqc1     %[ftmp4],   %[ftmp4],       0x10(%[block])          \n\t"
> +        "dli        %[tmp0],    0x01                                    \n\t"
> +        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
> +        "dli        %[tmp0],    0x06                                    \n\t"
>         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
>         "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
> -        MMI_LDC1(%[ftmp3], %[block], 0x18)
>         "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
>         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
>         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
> @@ -121,15 +123,11 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
>         "paddh      %[ftmp10],  %[ftmp3],       %[ftmp1]                \n\t"
>         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
>         "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
> -        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
>         "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
> -        MMI_SDC1(%[ftmp7], %[block], 0x00)
> -        MMI_SDC1(%[ftmp7], %[block], 0x08)
> -        MMI_SDC1(%[ftmp7], %[block], 0x10)
> -        MMI_SDC1(%[ftmp7], %[block], 0x18)
>         MMI_ULWC1(%[ftmp2], %[dst], 0x00)
> -        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
>         MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
> +        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
> +        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
>         "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
>         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
>         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
> @@ -153,11 +151,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
>         MMI_SWC1(%[ftmp2], %[dst], 0x00)
>         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
>         MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
> -
> -        /* memset(block, 0, 32) */
> -        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
>         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
>           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
>           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
> @@ -620,17 +613,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
>         MMI_SWC1(%[ftmp6], %[addr0], 0x00)
>         MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
>         PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
> -
> -        /* memset(block, 0, 128) */
> -        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x20(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x30(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x40(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x50(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x60(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x70(%[block])          \n\t"
>         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
>           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
>           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
> -- 
> 2.1.0
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Michael Niedermayer July 28, 2019, 6:29 p.m. UTC | #2
On Sun, Jul 28, 2019 at 04:07:40PM +0200, Reimar Döffinger wrote:
> I have no MIPS experience, but for what little it is worth thus: it looks fine to me.

will apply

thanks

[...]
diff mbox

Patch

diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
index ac65a20..0459711 100644
--- a/libavcodec/mips/h264dsp_mmi.c
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -38,6 +38,9 @@  void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         MMI_LDC1(%[ftmp2], %[src], 0x08)
         MMI_LDC1(%[ftmp3], %[src], 0x10)
         MMI_LDC1(%[ftmp4], %[src], 0x18)
+        /* memset(src, 0, 32); */
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[src])            \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[src])            \n\t"
         MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
         MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
         MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
@@ -58,11 +61,6 @@  void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
-
-        /* memset(src, 0, 32); */
-        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[src])            \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[src])            \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -85,15 +83,19 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
     DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
-        "dli        %[tmp0],    0x01                                    \n\t"
         MMI_LDC1(%[ftmp0], %[block], 0x00)
-        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
         MMI_LDC1(%[ftmp1], %[block], 0x08)
-        "dli        %[tmp0],    0x06                                    \n\t"
         MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
+        /* memset(block, 0, 32) */
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "gssqc1     %[ftmp4],   %[ftmp4],       0x00(%[block])          \n\t"
+        "gssqc1     %[ftmp4],   %[ftmp4],       0x10(%[block])          \n\t"
+        "dli        %[tmp0],    0x01                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        "dli        %[tmp0],    0x06                                    \n\t"
         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
         "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
-        MMI_LDC1(%[ftmp3], %[block], 0x18)
         "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
@@ -121,15 +123,11 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "paddh      %[ftmp10],  %[ftmp3],       %[ftmp1]                \n\t"
         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
-        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-        MMI_SDC1(%[ftmp7], %[block], 0x00)
-        MMI_SDC1(%[ftmp7], %[block], 0x08)
-        MMI_SDC1(%[ftmp7], %[block], 0x10)
-        MMI_SDC1(%[ftmp7], %[block], 0x18)
         MMI_ULWC1(%[ftmp2], %[dst], 0x00)
-        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
         MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
         "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
@@ -153,11 +151,6 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         MMI_SWC1(%[ftmp2], %[dst], 0x00)
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
         MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
-
-        /* memset(block, 0, 32) */
-        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -620,17 +613,6 @@  void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         MMI_SWC1(%[ftmp6], %[addr0], 0x00)
         MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
         PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
-
-        /* memset(block, 0, 128) */
-        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x20(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x30(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x40(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x50(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x60(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x70(%[block])          \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),