Message ID | 1564288929-1943-1-git-send-email-yinshiyou-hf@loongson.cn |
---|---|
State | Accepted |
Headers | show |
I have no MIPS experience, but for what little it is worth thus: it looks fine to me. On 28.07.2019, at 06:42, Shiyou Yin <yinshiyou-hf@loongson.cn> wrote: > In function ff_h264_add_pixels4_8_mmi, there is no need to reset '%[ftmp0]' > to 0, because it's value has never changed since the start of the asm block. > This patch remove the redundant 'xor' and set src to zero once it was loaded. > > In function ff_h264_idct_add_8_mmi, 'block' is seted to zero twice. > This patch removed the first setting zero operation and move the second one > after the load operation of block. > > In function ff_h264_idct8_add_8_mmi, 'block' is seted to zero twice too. > This patch just removed the second setting zero operation. > > This patch mainly simplifies the implementation of functions above, > the effect on the performance of whole h264 decoding process is not obvious. > According to the perf data, proportion of ff_h264_idct_add_8_mmi decreased from > 0.29% to 0.26% and ff_h264_idct8_add_8_mmi decreased from 0.62% to 0.59% when decoding > H264 format on loongson 3A3000(For reference only , not very stable.). > --- > libavcodec/mips/h264dsp_mmi.c | 44 +++++++++++++------------------------------ > 1 file changed, 13 insertions(+), 31 deletions(-) > > diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c > index ac65a20..0459711 100644 > --- a/libavcodec/mips/h264dsp_mmi.c > +++ b/libavcodec/mips/h264dsp_mmi.c > @@ -38,6 +38,9 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride) > MMI_LDC1(%[ftmp2], %[src], 0x08) > MMI_LDC1(%[ftmp3], %[src], 0x10) > MMI_LDC1(%[ftmp4], %[src], 0x18) > + /* memset(src, 0, 32); */ > + "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[src]) \n\t" > + "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[src]) \n\t" > MMI_ULWC1(%[ftmp5], %[dst0], 0x00) > MMI_ULWC1(%[ftmp6], %[dst1], 0x00) > MMI_ULWC1(%[ftmp7], %[dst2], 0x00) > @@ -58,11 +61,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride) > MMI_SWC1(%[ftmp2], %[dst1], 0x00) > MMI_SWC1(%[ftmp3], %[dst2], 0x00) > MMI_SWC1(%[ftmp4], %[dst3], 0x00) > - > - /* memset(src, 0, 32); */ > - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[src]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[src]) \n\t" > : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), > [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), > [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), > @@ -85,15 +83,19 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) > DECLARE_VAR_ADDRT; > > __asm__ volatile ( > - "dli %[tmp0], 0x01 \n\t" > MMI_LDC1(%[ftmp0], %[block], 0x00) > - "mtc1 %[tmp0], %[ftmp8] \n\t" > MMI_LDC1(%[ftmp1], %[block], 0x08) > - "dli %[tmp0], 0x06 \n\t" > MMI_LDC1(%[ftmp2], %[block], 0x10) > + MMI_LDC1(%[ftmp3], %[block], 0x18) > + /* memset(block, 0, 32) */ > + "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t" > + "gssqc1 %[ftmp4], %[ftmp4], 0x00(%[block]) \n\t" > + "gssqc1 %[ftmp4], %[ftmp4], 0x10(%[block]) \n\t" > + "dli %[tmp0], 0x01 \n\t" > + "mtc1 %[tmp0], %[ftmp8] \n\t" > + "dli %[tmp0], 0x06 \n\t" > "mtc1 %[tmp0], %[ftmp9] \n\t" > "psrah %[ftmp4], %[ftmp1], %[ftmp8] \n\t" > - MMI_LDC1(%[ftmp3], %[block], 0x18) > "psrah %[ftmp5], %[ftmp3], %[ftmp8] \n\t" > "psubh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" > "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" > @@ -121,15 +123,11 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) > "paddh %[ftmp10], %[ftmp3], %[ftmp1] \n\t" > "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" > "paddh %[ftmp11], %[ftmp4], %[ftmp5] \n\t" > - "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" > "psubh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" > - MMI_SDC1(%[ftmp7], %[block], 0x00) > - MMI_SDC1(%[ftmp7], %[block], 0x08) > - MMI_SDC1(%[ftmp7], %[block], 0x10) > - MMI_SDC1(%[ftmp7], %[block], 0x18) > MMI_ULWC1(%[ftmp2], %[dst], 0x00) > - "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t" > MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00) > + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" > + "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t" > "psrah %[ftmp4], %[ftmp11], %[ftmp9] \n\t" > "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" > "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" > @@ -153,11 +151,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) > MMI_SWC1(%[ftmp2], %[dst], 0x00) > "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" > MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00) > - > - /* memset(block, 0, 32) */ > - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[block]) \n\t" > : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), > [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), > [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), > @@ -620,17 +613,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride) > MMI_SWC1(%[ftmp6], %[addr0], 0x00) > MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00) > PTR_ADDIU "$29, $29, 0x20 \n\t" > - > - /* memset(block, 0, 128) */ > - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x20(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x30(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x40(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x50(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x60(%[block]) \n\t" > - "gssqc1 %[ftmp0], %[ftmp0], 0x70(%[block]) \n\t" > : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), > [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), > [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), > -- > 2.1.0 > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
On Sun, Jul 28, 2019 at 04:07:40PM +0200, Reimar Döffinger wrote:
> I have no MIPS experience, but for what little it is worth thus: it looks fine to me.
will apply
thanks
[...]
diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c index ac65a20..0459711 100644 --- a/libavcodec/mips/h264dsp_mmi.c +++ b/libavcodec/mips/h264dsp_mmi.c @@ -38,6 +38,9 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride) MMI_LDC1(%[ftmp2], %[src], 0x08) MMI_LDC1(%[ftmp3], %[src], 0x10) MMI_LDC1(%[ftmp4], %[src], 0x18) + /* memset(src, 0, 32); */ + "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[src]) \n\t" + "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[src]) \n\t" MMI_ULWC1(%[ftmp5], %[dst0], 0x00) MMI_ULWC1(%[ftmp6], %[dst1], 0x00) MMI_ULWC1(%[ftmp7], %[dst2], 0x00) @@ -58,11 +61,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride) MMI_SWC1(%[ftmp2], %[dst1], 0x00) MMI_SWC1(%[ftmp3], %[dst2], 0x00) MMI_SWC1(%[ftmp4], %[dst3], 0x00) - - /* memset(src, 0, 32); */ - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[src]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[src]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), @@ -85,15 +83,19 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) DECLARE_VAR_ADDRT; __asm__ volatile ( - "dli %[tmp0], 0x01 \n\t" MMI_LDC1(%[ftmp0], %[block], 0x00) - "mtc1 %[tmp0], %[ftmp8] \n\t" MMI_LDC1(%[ftmp1], %[block], 0x08) - "dli %[tmp0], 0x06 \n\t" MMI_LDC1(%[ftmp2], %[block], 0x10) + MMI_LDC1(%[ftmp3], %[block], 0x18) + /* memset(block, 0, 32) */ + "xor %[ftmp4], %[ftmp4], %[ftmp4] \n\t" + "gssqc1 %[ftmp4], %[ftmp4], 0x00(%[block]) \n\t" + "gssqc1 %[ftmp4], %[ftmp4], 0x10(%[block]) \n\t" + "dli %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x06 \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp4], %[ftmp1], %[ftmp8] \n\t" - MMI_LDC1(%[ftmp3], %[block], 0x18) "psrah %[ftmp5], %[ftmp3], %[ftmp8] \n\t" "psubh %[ftmp4], %[ftmp4], %[ftmp3] \n\t" "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" @@ -121,15 +123,11 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) "paddh %[ftmp10], %[ftmp3], %[ftmp1] \n\t" "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" "paddh %[ftmp11], %[ftmp4], %[ftmp5] \n\t" - "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" "psubh %[ftmp5], %[ftmp5], %[ftmp4] \n\t" - MMI_SDC1(%[ftmp7], %[block], 0x00) - MMI_SDC1(%[ftmp7], %[block], 0x08) - MMI_SDC1(%[ftmp7], %[block], 0x10) - MMI_SDC1(%[ftmp7], %[block], 0x18) MMI_ULWC1(%[ftmp2], %[dst], 0x00) - "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t" MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00) + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "psrah %[ftmp3], %[ftmp10], %[ftmp9] \n\t" "psrah %[ftmp4], %[ftmp11], %[ftmp9] \n\t" "punpcklbh %[ftmp2], %[ftmp2], %[ftmp7] \n\t" "punpcklbh %[ftmp0], %[ftmp0], %[ftmp7] \n\t" @@ -153,11 +151,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride) MMI_SWC1(%[ftmp2], %[dst], 0x00) "packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00) - - /* memset(block, 0, 32) */ - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[block]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[block]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), @@ -620,17 +613,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride) MMI_SWC1(%[ftmp6], %[addr0], 0x00) MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00) PTR_ADDIU "$29, $29, 0x20 \n\t" - - /* memset(block, 0, 128) */ - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[block]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[block]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x20(%[block]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x30(%[block]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x40(%[block]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x50(%[block]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x60(%[block]) \n\t" - "gssqc1 %[ftmp0], %[ftmp0], 0x70(%[block]) \n\t" : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),