diff mbox

[FFmpeg-devel] avutil/mips: Avoid instruction exception caused by gssqc1/gslqc1.

Message ID 1563866874-10116-1-git-send-email-yinshiyou-hf@loongson.cn
State Superseded
Headers show

Commit Message

Shiyou Yin July 23, 2019, 7:27 a.m. UTC
Ensure the addresses accessed by gssqc1/gslqc1 are 16-byte aligned.
---
 libavcodec/mips/h264dsp_mmi.c     | 48 +++++++++++++-----------------------
 libavcodec/mips/simple_idct_mmi.c | 51 +++++++++++++++++++++++++--------------
 libavutil/mips/mmiutils.h         |  2 +-
 3 files changed, 51 insertions(+), 50 deletions(-)

Comments

Reimar Döffinger July 23, 2019, 11:34 p.m. UTC | #1
Why is "block" not aligned? Does the code for other architectures also use unaligned instructions for these?

On 23.07.2019, at 09:27, Shiyou Yin <yinshiyou-hf@loongson.cn> wrote:

> Ensure the addresses accessed by gssqc1/gslqc1 are 16-byte aligned.
> ---
> libavcodec/mips/h264dsp_mmi.c     | 48 +++++++++++++-----------------------
> libavcodec/mips/simple_idct_mmi.c | 51 +++++++++++++++++++++++++--------------
> libavutil/mips/mmiutils.h         |  2 +-
> 3 files changed, 51 insertions(+), 50 deletions(-)
> 
> diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
> index ac65a20..a85d782 100644
> --- a/libavcodec/mips/h264dsp_mmi.c
> +++ b/libavcodec/mips/h264dsp_mmi.c
> @@ -38,6 +38,11 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
>         MMI_LDC1(%[ftmp2], %[src], 0x08)
>         MMI_LDC1(%[ftmp3], %[src], 0x10)
>         MMI_LDC1(%[ftmp4], %[src], 0x18)
> +        /* memset(src, 0, 32); */
> +        MMI_USDC1(%[ftmp0], %[src], 0x00)
> +        MMI_USDC1(%[ftmp0], %[src], 0x08)
> +        MMI_USDC1(%[ftmp0], %[src], 0x10)
> +        MMI_USDC1(%[ftmp0], %[src], 0x18)
>         MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
>         MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
>         MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
> @@ -58,11 +63,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
>         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
>         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
>         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
> -
> -        /* memset(src, 0, 32); */
> -        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[src])            \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[src])            \n\t"
>         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
>           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
>           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
> @@ -85,15 +85,21 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
>     DECLARE_VAR_ADDRT;
> 
>     __asm__ volatile (
> -        "dli        %[tmp0],    0x01                                    \n\t"
>         MMI_LDC1(%[ftmp0], %[block], 0x00)
> -        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
>         MMI_LDC1(%[ftmp1], %[block], 0x08)
> -        "dli        %[tmp0],    0x06                                    \n\t"
>         MMI_LDC1(%[ftmp2], %[block], 0x10)
> +        MMI_LDC1(%[ftmp3], %[block], 0x18)
> +        /* memset(block, 0, 32) */
> +        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
> +        MMI_USDC1(%[ftmp4], %[block], 0x00)
> +        MMI_USDC1(%[ftmp4], %[block], 0x08)
> +        MMI_USDC1(%[ftmp4], %[block], 0x10)
> +        MMI_USDC1(%[ftmp4], %[block], 0x18)
> +        "dli        %[tmp0],    0x01                                    \n\t"
> +        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
> +        "dli        %[tmp0],    0x06                                    \n\t"
>         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
>         "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
> -        MMI_LDC1(%[ftmp3], %[block], 0x18)
>         "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
>         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
>         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
> @@ -121,15 +127,11 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
>         "paddh      %[ftmp10],  %[ftmp3],       %[ftmp1]                \n\t"
>         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
>         "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
> -        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
>         "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
> -        MMI_SDC1(%[ftmp7], %[block], 0x00)
> -        MMI_SDC1(%[ftmp7], %[block], 0x08)
> -        MMI_SDC1(%[ftmp7], %[block], 0x10)
> -        MMI_SDC1(%[ftmp7], %[block], 0x18)
>         MMI_ULWC1(%[ftmp2], %[dst], 0x00)
> -        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
>         MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
> +        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
> +        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
>         "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
>         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
>         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
> @@ -153,11 +155,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
>         MMI_SWC1(%[ftmp2], %[dst], 0x00)
>         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
>         MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
> -
> -        /* memset(block, 0, 32) */
> -        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
>         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
>           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
>           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
> @@ -620,17 +617,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
>         MMI_SWC1(%[ftmp6], %[addr0], 0x00)
>         MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
>         PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
> -
> -        /* memset(block, 0, 128) */
> -        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x20(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x30(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x40(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x50(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x60(%[block])          \n\t"
> -        "gssqc1     %[ftmp0],   %[ftmp0],       0x70(%[block])          \n\t"
>         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
>           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
>           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
> diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c
> index 7f4bb74..f54f9ea 100644
> --- a/libavcodec/mips/simple_idct_mmi.c
> +++ b/libavcodec/mips/simple_idct_mmi.c
> @@ -39,7 +39,7 @@
> #define COL_SHIFT 20
> #define DC_SHIFT 3
> 
> -DECLARE_ALIGNED(8, const int16_t, W_arr)[46] = {
> +DECLARE_ALIGNED(16, const int16_t, W_arr)[46] = {
>     W4,  W2,  W4,  W6,
>     W1,  W3,  W5,  W7,
>     W4,  W6, -W4, -W2,
> @@ -147,14 +147,22 @@ void ff_simple_idct_8_mmi(int16_t *block)
>         "gslqc1       $f25,      $f24,      0x30(%[w_arr])      \n\t"
>         "gslqc1       $f17,      $f16,      0x40(%[w_arr])      \n\t"
>         /* load source in block */
> -        "gslqc1       $f1,       $f0,       0x00(%[block])      \n\t"
> -        "gslqc1       $f3,       $f2,       0x10(%[block])      \n\t"
> -        "gslqc1       $f5,       $f4,       0x20(%[block])      \n\t"
> -        "gslqc1       $f7,       $f6,       0x30(%[block])      \n\t"
> -        "gslqc1       $f9,       $f8,       0x40(%[block])      \n\t"
> -        "gslqc1       $f11,      $f10,      0x50(%[block])      \n\t"
> -        "gslqc1       $f13,      $f12,      0x60(%[block])      \n\t"
> -        "gslqc1       $f15,      $f14,      0x70(%[block])      \n\t"
> +        MMI_ULDC1($f0, %[block], 0x00)
> +        MMI_ULDC1($f1, %[block], 0x08)
> +        MMI_ULDC1($f2, %[block], 0x10)
> +        MMI_ULDC1($f3, %[block], 0x18)
> +        MMI_ULDC1($f4, %[block], 0x20)
> +        MMI_ULDC1($f5, %[block], 0x28)
> +        MMI_ULDC1($f6, %[block], 0x30)
> +        MMI_ULDC1($f7, %[block], 0x38)
> +        MMI_ULDC1($f8, %[block], 0x40)
> +        MMI_ULDC1($f9, %[block], 0x48)
> +        MMI_ULDC1($f10, %[block], 0x50)
> +        MMI_ULDC1($f11, %[block], 0x58)
> +        MMI_ULDC1($f12, %[block], 0x60)
> +        MMI_ULDC1($f13, %[block], 0x68)
> +        MMI_ULDC1($f14, %[block], 0x70)
> +        MMI_ULDC1($f15, %[block], 0x78)
> 
>         /* $9: mask ; $f17: ROW_SHIFT */
>         "dmfc1        $9,        $f17                           \n\t"
> @@ -394,15 +402,22 @@ void ff_simple_idct_8_mmi(int16_t *block)
>         "punpcklwd    $f11,      $f27,      $f29                \n\t"
>         "punpckhwd    $f15,      $f27,      $f29                \n\t"
>         /* Store */
> -        "gssqc1       $f1,       $f0,       0x00(%[block])      \n\t"
> -        "gssqc1       $f5,       $f4,       0x10(%[block])      \n\t"
> -        "gssqc1       $f9,       $f8,       0x20(%[block])      \n\t"
> -        "gssqc1       $f13,      $f12,      0x30(%[block])      \n\t"
> -        "gssqc1       $f3,       $f2,       0x40(%[block])      \n\t"
> -        "gssqc1       $f7,       $f6,       0x50(%[block])      \n\t"
> -        "gssqc1       $f11,      $f10,      0x60(%[block])      \n\t"
> -        "gssqc1       $f15,      $f14,      0x70(%[block])      \n\t"
> -
> +        MMI_USDC1($f0, %[block], 0X00)
> +        MMI_USDC1($f1, %[block], 0X08)
> +        MMI_USDC1($f4, %[block], 0X10)
> +        MMI_USDC1($f5, %[block], 0X18)
> +        MMI_USDC1($f8, %[block], 0X20)
> +        MMI_USDC1($f9, %[block], 0X28)
> +        MMI_USDC1($f12, %[block], 0X30)
> +        MMI_USDC1($f13, %[block], 0X38)
> +        MMI_USDC1($f2, %[block], 0X40)
> +        MMI_USDC1($f3, %[block], 0X48)
> +        MMI_USDC1($f6, %[block], 0X50)
> +        MMI_USDC1($f7, %[block], 0X58)
> +        MMI_USDC1($f10, %[block], 0X60)
> +        MMI_USDC1($f11, %[block], 0X68)
> +        MMI_USDC1($f14, %[block], 0X70)
> +        MMI_USDC1($f15, %[block], 0X78)
>         : [block]"+&r"(block)
>         : [w_arr]"r"(W_arr)
>         : "memory"
> diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h
> index 05f6b31..bfa6d8b 100644
> --- a/libavutil/mips/mmiutils.h
> +++ b/libavutil/mips/mmiutils.h
> @@ -205,7 +205,7 @@
>  * backup register
>  */
> #define BACKUP_REG \
> -  double temp_backup_reg[8];                                    \
> +  double __attribute__ ((aligned (16))) temp_backup_reg[8];     \
>   if (_MIPS_SIM == _ABI64)                                      \
>     __asm__ volatile (                                          \
>       "gssqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \
> -- 
> 2.1.0
> 
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Shiyou Yin July 24, 2019, 6:47 a.m. UTC | #2
>Why is "block" not aligned? Does the code for other architectures also use unaligned instructions for
>these?

Thank you for the reminder. After checking struct H264SliceContext and the function call chain, I found that 'block' is already 16-byte aligned.
This patch also contains some refinements; I will submit those in a separate patch and keep only the following changes in this patch (V2).

>> diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c
>> index 7f4bb74..f54f9ea 100644
>> --- a/libavcodec/mips/simple_idct_mmi.c
>> +++ b/libavcodec/mips/simple_idct_mmi.c
>> @@ -39,7 +39,7 @@
>> #define COL_SHIFT 20
>> #define DC_SHIFT 3
>>
>> -DECLARE_ALIGNED(8, const int16_t, W_arr)[46] = {
>> +DECLARE_ALIGNED(16, const int16_t, W_arr)[46] = {
>>     W4,  W2,  W4,  W6,
>>     W1,  W3,  W5,  W7,
>>     W4,  W6, -W4, -W2,
>> diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h
>> index 05f6b31..bfa6d8b 100644
>> --- a/libavutil/mips/mmiutils.h
>> +++ b/libavutil/mips/mmiutils.h
>> @@ -205,7 +205,7 @@
>>  * backup register
>>  */
>> #define BACKUP_REG \
>> -  double temp_backup_reg[8];                                    \
>> +  double __attribute__ ((aligned (16))) temp_backup_reg[8];     \
>>   if (_MIPS_SIM == _ABI64)                                      \
>>     __asm__ volatile (                                          \
>>       "gssqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \
>> --
>> 2.1.0


>ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox

Patch

diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
index ac65a20..a85d782 100644
--- a/libavcodec/mips/h264dsp_mmi.c
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -38,6 +38,11 @@  void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         MMI_LDC1(%[ftmp2], %[src], 0x08)
         MMI_LDC1(%[ftmp3], %[src], 0x10)
         MMI_LDC1(%[ftmp4], %[src], 0x18)
+        /* memset(src, 0, 32); */
+        MMI_USDC1(%[ftmp0], %[src], 0x00)
+        MMI_USDC1(%[ftmp0], %[src], 0x08)
+        MMI_USDC1(%[ftmp0], %[src], 0x10)
+        MMI_USDC1(%[ftmp0], %[src], 0x18)
         MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
         MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
         MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
@@ -58,11 +63,6 @@  void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
-
-        /* memset(src, 0, 32); */
-        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[src])            \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[src])            \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -85,15 +85,21 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
     DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
-        "dli        %[tmp0],    0x01                                    \n\t"
         MMI_LDC1(%[ftmp0], %[block], 0x00)
-        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
         MMI_LDC1(%[ftmp1], %[block], 0x08)
-        "dli        %[tmp0],    0x06                                    \n\t"
         MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
+        /* memset(block, 0, 32) */
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        MMI_USDC1(%[ftmp4], %[block], 0x00)
+        MMI_USDC1(%[ftmp4], %[block], 0x08)
+        MMI_USDC1(%[ftmp4], %[block], 0x10)
+        MMI_USDC1(%[ftmp4], %[block], 0x18)
+        "dli        %[tmp0],    0x01                                    \n\t"
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        "dli        %[tmp0],    0x06                                    \n\t"
         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
         "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
-        MMI_LDC1(%[ftmp3], %[block], 0x18)
         "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
@@ -121,15 +127,11 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "paddh      %[ftmp10],  %[ftmp3],       %[ftmp1]                \n\t"
         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
-        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-        MMI_SDC1(%[ftmp7], %[block], 0x00)
-        MMI_SDC1(%[ftmp7], %[block], 0x08)
-        MMI_SDC1(%[ftmp7], %[block], 0x10)
-        MMI_SDC1(%[ftmp7], %[block], 0x18)
         MMI_ULWC1(%[ftmp2], %[dst], 0x00)
-        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
         MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
         "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
@@ -153,11 +155,6 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         MMI_SWC1(%[ftmp2], %[dst], 0x00)
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
         MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
-
-        /* memset(block, 0, 32) */
-        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -620,17 +617,6 @@  void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         MMI_SWC1(%[ftmp6], %[addr0], 0x00)
         MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
         PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
-
-        /* memset(block, 0, 128) */
-        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x20(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x30(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x40(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x50(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x60(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp0],       0x70(%[block])          \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
diff --git a/libavcodec/mips/simple_idct_mmi.c b/libavcodec/mips/simple_idct_mmi.c
index 7f4bb74..f54f9ea 100644
--- a/libavcodec/mips/simple_idct_mmi.c
+++ b/libavcodec/mips/simple_idct_mmi.c
@@ -39,7 +39,7 @@ 
 #define COL_SHIFT 20
 #define DC_SHIFT 3
 
-DECLARE_ALIGNED(8, const int16_t, W_arr)[46] = {
+DECLARE_ALIGNED(16, const int16_t, W_arr)[46] = {
     W4,  W2,  W4,  W6,
     W1,  W3,  W5,  W7,
     W4,  W6, -W4, -W2,
@@ -147,14 +147,22 @@  void ff_simple_idct_8_mmi(int16_t *block)
         "gslqc1       $f25,      $f24,      0x30(%[w_arr])      \n\t"
         "gslqc1       $f17,      $f16,      0x40(%[w_arr])      \n\t"
         /* load source in block */
-        "gslqc1       $f1,       $f0,       0x00(%[block])      \n\t"
-        "gslqc1       $f3,       $f2,       0x10(%[block])      \n\t"
-        "gslqc1       $f5,       $f4,       0x20(%[block])      \n\t"
-        "gslqc1       $f7,       $f6,       0x30(%[block])      \n\t"
-        "gslqc1       $f9,       $f8,       0x40(%[block])      \n\t"
-        "gslqc1       $f11,      $f10,      0x50(%[block])      \n\t"
-        "gslqc1       $f13,      $f12,      0x60(%[block])      \n\t"
-        "gslqc1       $f15,      $f14,      0x70(%[block])      \n\t"
+        MMI_ULDC1($f0, %[block], 0x00)
+        MMI_ULDC1($f1, %[block], 0x08)
+        MMI_ULDC1($f2, %[block], 0x10)
+        MMI_ULDC1($f3, %[block], 0x18)
+        MMI_ULDC1($f4, %[block], 0x20)
+        MMI_ULDC1($f5, %[block], 0x28)
+        MMI_ULDC1($f6, %[block], 0x30)
+        MMI_ULDC1($f7, %[block], 0x38)
+        MMI_ULDC1($f8, %[block], 0x40)
+        MMI_ULDC1($f9, %[block], 0x48)
+        MMI_ULDC1($f10, %[block], 0x50)
+        MMI_ULDC1($f11, %[block], 0x58)
+        MMI_ULDC1($f12, %[block], 0x60)
+        MMI_ULDC1($f13, %[block], 0x68)
+        MMI_ULDC1($f14, %[block], 0x70)
+        MMI_ULDC1($f15, %[block], 0x78)
 
         /* $9: mask ; $f17: ROW_SHIFT */
         "dmfc1        $9,        $f17                           \n\t"
@@ -394,15 +402,22 @@  void ff_simple_idct_8_mmi(int16_t *block)
         "punpcklwd    $f11,      $f27,      $f29                \n\t"
         "punpckhwd    $f15,      $f27,      $f29                \n\t"
         /* Store */
-        "gssqc1       $f1,       $f0,       0x00(%[block])      \n\t"
-        "gssqc1       $f5,       $f4,       0x10(%[block])      \n\t"
-        "gssqc1       $f9,       $f8,       0x20(%[block])      \n\t"
-        "gssqc1       $f13,      $f12,      0x30(%[block])      \n\t"
-        "gssqc1       $f3,       $f2,       0x40(%[block])      \n\t"
-        "gssqc1       $f7,       $f6,       0x50(%[block])      \n\t"
-        "gssqc1       $f11,      $f10,      0x60(%[block])      \n\t"
-        "gssqc1       $f15,      $f14,      0x70(%[block])      \n\t"
-
+        MMI_USDC1($f0, %[block], 0X00)
+        MMI_USDC1($f1, %[block], 0X08)
+        MMI_USDC1($f4, %[block], 0X10)
+        MMI_USDC1($f5, %[block], 0X18)
+        MMI_USDC1($f8, %[block], 0X20)
+        MMI_USDC1($f9, %[block], 0X28)
+        MMI_USDC1($f12, %[block], 0X30)
+        MMI_USDC1($f13, %[block], 0X38)
+        MMI_USDC1($f2, %[block], 0X40)
+        MMI_USDC1($f3, %[block], 0X48)
+        MMI_USDC1($f6, %[block], 0X50)
+        MMI_USDC1($f7, %[block], 0X58)
+        MMI_USDC1($f10, %[block], 0X60)
+        MMI_USDC1($f11, %[block], 0X68)
+        MMI_USDC1($f14, %[block], 0X70)
+        MMI_USDC1($f15, %[block], 0X78)
         : [block]"+&r"(block)
         : [w_arr]"r"(W_arr)
         : "memory"
diff --git a/libavutil/mips/mmiutils.h b/libavutil/mips/mmiutils.h
index 05f6b31..bfa6d8b 100644
--- a/libavutil/mips/mmiutils.h
+++ b/libavutil/mips/mmiutils.h
@@ -205,7 +205,7 @@ 
  * backup register
  */
 #define BACKUP_REG \
-  double temp_backup_reg[8];                                    \
+  double __attribute__ ((aligned (16))) temp_backup_reg[8];     \
   if (_MIPS_SIM == _ABI64)                                      \
     __asm__ volatile (                                          \
       "gssqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \