diff mbox

[FFmpeg-devel,3/7] avcodec/mips: loongson optimize mmi load and store operators

Message ID 47f24766.8e00.157ada373f6.Coremail.ipfootball@126.com
State Accepted
Commit 89ec4adad6cb8c8bb4ecd61b51d42ebde424bcfb
Headers show

Commit Message

周晓勇 Oct. 10, 2016, 8:09 a.m. UTC
From ec6fd41adff8541180f4e43e019e31ff06867789 Mon Sep 17 00:00:00 2001
From: Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
Date: Mon, 10 Oct 2016 14:31:39 +0800
Subject: [PATCH 3/7] avcodec/mips: loongson optimize mmi load and store
 operators


1. The MMI_ load/store macros are defined in libavutil/mips/mmiutils.h.
2. Replace some unnecessary unaligned accesses with aligned operations.
3. The MMI_ load/store macros are compatible with Loongson 2E/2F CPUs, which do not support instructions starting with "gs".
---
 libavcodec/mips/blockdsp_mmi.c    |  139 +--
 libavcodec/mips/h264chroma_mmi.c  |  193 ++--
 libavcodec/mips/h264dsp_mmi.c     | 2126 ++++++++++++++++++-------------------
 libavcodec/mips/h264pred_mmi.c    |  259 +++--
 libavcodec/mips/h264qpel_mmi.c    |  777 ++++++--------
 libavcodec/mips/hpeldsp_mmi.c     |  549 +++++-----
 libavcodec/mips/idctdsp_mmi.c     |  130 ++-
 libavcodec/mips/mpegvideo_mmi.c   |  112 +-
 libavcodec/mips/pixblockdsp_mmi.c |   87 +-
 9 files changed, 2066 insertions(+), 2306 deletions(-)

Comments

Michael Niedermayer Oct. 23, 2016, 1:21 a.m. UTC | #1
On Mon, Oct 10, 2016 at 04:09:12PM +0800, 周晓勇 wrote:
> From ec6fd41adff8541180f4e43e019e31ff06867789 Mon Sep 17 00:00:00 2001
> From: Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
> Date: Mon, 10 Oct 2016 14:31:39 +0800
> Subject: [PATCH 3/7] avcodec/mips: loongson optimize mmi load and store
>  operators
> 
> 
> 1. The MMI_ load/store macros are defined in libavutil/mips/mmiutils.h.
> 2. Replace some unnecessary unaligned accesses with aligned operations.
> 3. The MMI_ load/store macros are compatible with Loongson 2E/2F CPUs, which do not support instructions starting with "gs".
> ---
>  libavcodec/mips/blockdsp_mmi.c    |  139 +--
>  libavcodec/mips/h264chroma_mmi.c  |  193 ++--
>  libavcodec/mips/h264dsp_mmi.c     | 2126 ++++++++++++++++++-------------------
>  libavcodec/mips/h264pred_mmi.c    |  259 +++--
>  libavcodec/mips/h264qpel_mmi.c    |  777 ++++++--------
>  libavcodec/mips/hpeldsp_mmi.c     |  549 +++++-----
>  libavcodec/mips/idctdsp_mmi.c     |  130 ++-
>  libavcodec/mips/mpegvideo_mmi.c   |  112 +-
>  libavcodec/mips/pixblockdsp_mmi.c |   87 +-
>  9 files changed, 2066 insertions(+), 2306 deletions(-)

applied

thx

[...]
diff mbox

Patch

diff --git a/libavcodec/mips/blockdsp_mmi.c b/libavcodec/mips/blockdsp_mmi.c
index 6eb2bd7..1035dbb 100644
--- a/libavcodec/mips/blockdsp_mmi.c
+++ b/libavcodec/mips/blockdsp_mmi.c
@@ -22,11 +22,12 @@ 
  */
 
 #include "blockdsp_mips.h"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h)
 {
     double ftmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "mtc1       %[value],   %[ftmp0]                                \n\t"
@@ -34,15 +35,14 @@  void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h)
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
-        PTR_ADDI    "%[h],      %[h],           -0x01                   \n\t"
-        "gssdlc1    %[ftmp0],   0x0f(%[block])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x08(%[block])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        PTR_ADDI   "%[h],       %[h],           -0x01                   \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x08)
         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
         "bnez       %[h],       1b                                      \n\t"
-        : [block]"+&r"(block),              [h]"+&r"(h),
-          [ftmp0]"=&f"(ftmp[0])
+        : [ftmp0]"=&f"(ftmp[0]),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [h]"+&r"(h)
         : [value]"r"(value),                [line_size]"r"((mips_reg)line_size)
         : "memory"
     );
@@ -51,6 +51,7 @@  void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h)
 void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h)
 {
     double ftmp0;
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "mtc1       %[value],   %[ftmp0]                                \n\t"
@@ -58,13 +59,13 @@  void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h)
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
         PTR_ADDI   "%[h],       %[h],           -0x01                   \n\t"
         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
         "bnez       %[h],       1b                                      \n\t"
-        : [block]"+&r"(block),              [h]"+&r"(h),
-          [ftmp0]"=&f"(ftmp0)
+        : [ftmp0]"=&f"(ftmp0),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [h]"+&r"(h)
         : [value]"r"(value),                [line_size]"r"((mips_reg)line_size)
         : "memory"
     );
@@ -77,14 +78,14 @@  void ff_clear_block_mmi(int16_t *block)
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x00(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x10(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x20(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x30(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x40(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x50(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x60(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x70(%[block])          \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x00)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x10)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x20)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x30)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x40)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x50)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x60)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x70)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1])
         : [block]"r"(block)
         : "memory"
@@ -98,61 +99,61 @@  void ff_clear_blocks_mmi(int16_t *block)
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x00(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x10(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x20(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x30(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x40(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x50(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x60(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x70(%[block])          \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x00)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x10)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x20)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x30)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x40)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x50)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x60)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x70)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x80(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x90(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xa0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xb0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xc0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xd0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xe0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xf0(%[block])          \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x80)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x90)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xa0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xb0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xc0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xd0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xe0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xf0)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x100(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x110(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x120(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x130(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x140(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x150(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x160(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x170(%[block])         \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x100)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x110)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x120)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x130)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x140)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x150)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x160)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x170)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x180(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x190(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1a0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1b0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1c0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1d0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1e0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1f0(%[block])         \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x180)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x190)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1a0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1b0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1c0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1d0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1e0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1f0)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x200(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x210(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x220(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x230(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x240(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x250(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x260(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x270(%[block])         \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x200)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x210)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x220)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x230)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x240)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x250)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x260)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x270)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x280(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x290(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2a0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2b0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2c0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2d0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2e0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2f0(%[block])         \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x280)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x290)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2a0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2b0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2c0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2d0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2e0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2f0)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1])
-        : [block]"r"((mips_reg)block)
+        : [block]"r"((uint64_t *)block)
         : "memory"
     );
 }
diff --git a/libavcodec/mips/h264chroma_mmi.c b/libavcodec/mips/h264chroma_mmi.c
index 3dd123d..417b4a2 100644
--- a/libavcodec/mips/h264chroma_mmi.c
+++ b/libavcodec/mips/h264chroma_mmi.c
@@ -24,7 +24,7 @@ 
 
 #include "h264chroma_mips.h"
 #include "constants.h"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
         int h, int x, int y)
@@ -37,6 +37,7 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
     double ftmp[10];
     uint64_t tmp[1];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     if (D) {
         __asm__ volatile (
@@ -47,16 +48,13 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
             "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
             "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
-            "gsldlc1    %[ftmp2],   0x08(%[src])                        \n\t"
-            "gsldrc1    %[ftmp2],   0x01(%[src])                        \n\t"
-            "gsldlc1    %[ftmp3],   0x07(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp3],   0x00(%[addr0])                      \n\t"
-            "gsldlc1    %[ftmp4],   0x08(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp4],   0x01(%[addr0])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[src], 0x01)
+            MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+            MMI_ULDC1(%[ftmp4], %[addr0], 0x01)
 
             "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -88,7 +86,7 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -98,6 +96,7 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
@@ -115,12 +114,11 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
-            "gsldlc1    %[ftmp2],   0x07(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp2],   0x00(%[addr0])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[addr0], 0x00)
 
             "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -139,7 +137,7 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -148,6 +146,7 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
@@ -162,9 +161,9 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "dli        %[tmp0],    0x06                                \n\t"
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
             "1:                                                         \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
@@ -175,11 +174,10 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
@@ -190,7 +188,7 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x02               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
@@ -199,6 +197,7 @@  void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
@@ -219,6 +218,7 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
     double ftmp[10];
     uint64_t tmp[1];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     if (D) {
         __asm__ volatile (
@@ -229,16 +229,13 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
             "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
             "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
-            "gsldlc1    %[ftmp2],   0x08(%[src])                        \n\t"
-            "gsldrc1    %[ftmp2],   0x01(%[src])                        \n\t"
-            "gsldlc1    %[ftmp3],   0x07(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp3],   0x00(%[addr0])                      \n\t"
-            "gsldlc1    %[ftmp4],   0x08(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp4],   0x01(%[addr0])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[src], 0x01)
+            MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+            MMI_ULDC1(%[ftmp4], %[addr0], 0x01)
 
             "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -269,10 +266,10 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -282,6 +279,7 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
@@ -299,12 +297,11 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
-            "gsldlc1    %[ftmp2],   0x07(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp2],   0x00(%[addr0])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[addr0], 0x00)
 
             "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -322,10 +319,10 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -334,6 +331,7 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
@@ -348,9 +346,9 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "dli        %[tmp0],    0x06                                \n\t"
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
             "1:                                                         \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
@@ -360,14 +358,13 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
 
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
@@ -377,10 +374,10 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x02               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
@@ -389,6 +386,7 @@  void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
@@ -409,7 +407,7 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
     double ftmp[8];
     uint64_t tmp[1];
     mips_reg addr[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     if (D) {
         __asm__ volatile (
@@ -420,16 +418,13 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
             "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
             "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x01(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
-            "uld        %[low32],   0x00(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
-            "uld        %[low32],   0x01(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[src], 0x01)
+            MMI_ULWC1(%[ftmp3], %[addr0], 0x00)
+            MMI_ULWC1(%[ftmp4], %[addr0], 0x01)
 
             "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
@@ -448,7 +443,7 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -457,10 +452,10 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A),                    [B]"f"(B),
               [C]"f"(C),                    [D]"f"(D)
@@ -475,12 +470,11 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x00(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
 
             "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
@@ -492,7 +486,7 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -500,10 +494,10 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
               [ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A),                    [E]"f"(E)
@@ -515,27 +509,26 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "dli        %[tmp0],    0x06                                \n\t"
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp3]                            \n\t"
+
             "1:                                                         \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
 
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             "addi       %[h],       %[h],           -0x02               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
@@ -543,9 +536,9 @@  void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A)
             : "memory"
@@ -564,7 +557,7 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
     double ftmp[8];
     uint64_t tmp[1];
     mips_reg addr[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     if (D) {
         __asm__ volatile (
@@ -575,16 +568,13 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
             "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
             "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x01(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
-            "uld        %[low32],   0x00(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
-            "uld        %[low32],   0x01(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[src], 0x01)
+            MMI_ULWC1(%[ftmp3], %[addr0], 0x00)
+            MMI_ULWC1(%[ftmp4], %[addr0], 0x01)
 
             "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
@@ -602,10 +592,10 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -614,10 +604,10 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A),                    [B]"f"(B),
               [C]"f"(C),                    [D]"f"(D)
@@ -634,10 +624,8 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x00(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
 
             "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
@@ -648,10 +636,10 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -659,10 +647,10 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
               [ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A),                    [E]"f"(E)
@@ -674,31 +662,30 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "dli        %[tmp0],    0x06                                \n\t"
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp3]                            \n\t"
+
             "1:                                                         \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
 
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x02               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
@@ -706,9 +693,9 @@  void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A)
             : "memory"
diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
index a550eee..ac6fa99 100644
--- a/libavcodec/mips/h264dsp_mmi.c
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -25,27 +25,24 @@ 
 
 #include "libavcodec/bit_depth_template.c"
 #include "h264dsp_mips.h"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
 {
     double ftmp[9];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "ldc1       %[ftmp1],   0x00(%[src])                            \n\t"
-        "ldc1       %[ftmp2],   0x08(%[src])                            \n\t"
-        "ldc1       %[ftmp3],   0x10(%[src])                            \n\t"
-        "ldc1       %[ftmp4],   0x18(%[src])                            \n\t"
-        "uld        %[low32],   0x00(%[dst0])                           \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
-        "uld        %[low32],   0x00(%[dst1])                           \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
-        "uld        %[low32],   0x00(%[dst2])                           \n\t"
-        "mtc1       %[low32],   %[ftmp7]                                \n\t"
-        "uld        %[low32],   0x00(%[dst3])                           \n\t"
-        "mtc1       %[low32],   %[ftmp8]                                \n\t"
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        MMI_LDC1(%[ftmp2], %[src], 0x08)
+        MMI_LDC1(%[ftmp3], %[src], 0x10)
+        MMI_LDC1(%[ftmp4], %[src], 0x18)
+        MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
+        MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
+        MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
+        MMI_ULWC1(%[ftmp8], %[dst3], 0x00)
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
@@ -58,20 +55,17 @@  void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
-        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t"
-        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
-        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t"
-        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
+        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
-          [ftmp8]"=&f"(ftmp[8]),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          [ftmp8]"=&f"(ftmp[8])
         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
           [src]"r"(src)
@@ -85,18 +79,20 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
     double ftmp[12];
     uint64_t tmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "dli        %[tmp0],    0x01                                    \n\t"
-        "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-        "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
         "dli        %[tmp0],    0x06                                    \n\t"
-        "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
         "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
-        "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
         "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
@@ -126,14 +122,13 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-        "sdc1       %[ftmp7],   0x00(%[block])                          \n\t"
-        "sdc1       %[ftmp7],   0x08(%[block])                          \n\t"
-        "sdc1       %[ftmp7],   0x10(%[block])                          \n\t"
-        "sdc1       %[ftmp7],   0x18(%[block])                          \n\t"
-        "uld        %[low32],   0x00(%[dst])                            \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_SDC1(%[ftmp7], %[block], 0x00)
+        MMI_SDC1(%[ftmp7], %[block], 0x08)
+        MMI_SDC1(%[ftmp7], %[block], 0x10)
+        MMI_SDC1(%[ftmp7], %[block], 0x18)
+        MMI_ULWC1(%[ftmp2], %[dst], 0x00)
         "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
-        "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
         "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
@@ -141,33 +136,32 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
-        "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
-        "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
-        "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        MMI_SWC1(%[ftmp2], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
-        "uld        %[low32],   0x00(%[dst])                            \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[dst], 0x00)
         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
-        "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
-        "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
-        "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp2], %[dst], 0x00)
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
-        "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
-          [tmp0]"=&r"(tmp[0]),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [tmp0]"=&r"(tmp[0])
         : [dst]"r"(dst),                    [block]"r"(block),
           [stride]"r"((mips_reg)stride),    [ff_pw_32]"f"(ff_pw_32)
         : "memory"
@@ -179,464 +173,450 @@  void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
     double ftmp[16];
-    uint64_t tmp[8];
+    uint64_t tmp[7];
     mips_reg addr[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
-        "lhu       %[tmp0],     0x00(%[block])                          \n\t"
-        PTR_ADDI  "$29,         $29,            -0x20                   \n\t"
-        PTR_ADDIU "%[tmp0],     %[tmp0],        0x20                    \n\t"
-        "ldc1      %[ftmp1],    0x10(%[block])                          \n\t"
-        "sh        %[tmp0],     0x00(%[block])                          \n\t"
-        "ldc1      %[ftmp2],    0x20(%[block])                          \n\t"
-        "dli       %[tmp0],     0x01                                    \n\t"
-        "ldc1      %[ftmp3],    0x30(%[block])                          \n\t"
-        "mtc1      %[tmp0],     %[ftmp8]                                \n\t"
-        "ldc1      %[ftmp5],    0x50(%[block])                          \n\t"
-        "ldc1      %[ftmp6],    0x60(%[block])                          \n\t"
-        "ldc1      %[ftmp7],    0x70(%[block])                          \n\t"
-        "mov.d     %[ftmp0],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp4],    %[ftmp5],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp3]                \n\t"
-        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp8]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "dli       %[tmp0],     0x02                                    \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "mtc1      %[tmp0],     %[ftmp9]                                \n\t"
-        "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp3],    %[ftmp4],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
-        "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "mov.d     %[ftmp5],    %[ftmp6]                                \n\t"
-        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp4],    %[ftmp2],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
-        "ldc1      %[ftmp2],    0x00(%[block])                          \n\t"
-        "ldc1      %[ftmp5],    0x40(%[block])                          \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
-        "sdc1      %[ftmp6],    0x00(%[block])                          \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
-        "punpckhhw %[ftmp6],    %[ftmp7],       %[ftmp0]                \n\t"
-        "punpcklhw %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "punpckhhw %[ftmp0],    %[ftmp3],       %[ftmp1]                \n\t"
-        "punpcklhw %[ftmp3],    %[ftmp3],       %[ftmp1]                \n\t"
-        "punpckhwd %[ftmp1],    %[ftmp7],       %[ftmp3]                \n\t"
-        "punpcklwd %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
-        "punpckhwd %[ftmp3],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklwd %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "ldc1      %[ftmp0],    0x00(%[block])                          \n\t"
-        "sdc1      %[ftmp7],    0x00($29)                               \n\t"
-        "sdc1      %[ftmp1],    0x10($29)                               \n\t"
-        "dmfc1     %[tmp1],     %[ftmp6]                                \n\t"
-        "dmfc1     %[tmp3],     %[ftmp3]                                \n\t"
-        "punpckhhw %[ftmp3],    %[ftmp5],       %[ftmp2]                \n\t"
-        "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
-        "punpckhhw %[ftmp2],    %[ftmp4],       %[ftmp0]                \n\t"
-        "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "punpckhwd %[ftmp0],    %[ftmp5],       %[ftmp4]                \n\t"
-        "punpcklwd %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
-        "punpckhwd %[ftmp4],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "sdc1      %[ftmp5],    0x08($29)                               \n\t"
-        "sdc1      %[ftmp0],    0x18($29)                               \n\t"
-        "dmfc1     %[tmp2],     %[ftmp3]                                \n\t"
-        "dmfc1     %[tmp4],     %[ftmp4]                                \n\t"
-        "ldc1      %[ftmp1],    0x18(%[block])                          \n\t"
-        "ldc1      %[ftmp6],    0x28(%[block])                          \n\t"
-        "ldc1      %[ftmp2],    0x38(%[block])                          \n\t"
-        "ldc1      %[ftmp0],    0x58(%[block])                          \n\t"
-        "ldc1      %[ftmp3],    0x68(%[block])                          \n\t"
-        "ldc1      %[ftmp4],    0x78(%[block])                          \n\t"
-        "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp5],    %[ftmp0],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp2],    %[ftmp5],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
-        "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "mov.d     %[ftmp0],    %[ftmp3]                                \n\t"
-        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp5],    %[ftmp6],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
-        "ldc1      %[ftmp6],    0x08(%[block])                          \n\t"
-        "ldc1      %[ftmp0],    0x48(%[block])                          \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "sdc1      %[ftmp3],    0x08(%[block])                          \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
-        "punpckhhw %[ftmp3],    %[ftmp4],       %[ftmp7]                \n\t"
-        "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
-        "punpckhhw %[ftmp7],    %[ftmp2],       %[ftmp1]                \n\t"
-        "punpcklhw %[ftmp2],    %[ftmp2],       %[ftmp1]                \n\t"
-        "punpckhwd %[ftmp1],    %[ftmp4],       %[ftmp2]                \n\t"
-        "punpcklwd %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
-        "punpckhwd %[ftmp2],    %[ftmp3],       %[ftmp7]                \n\t"
-        "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
-        "ldc1      %[ftmp7],    0x08(%[block])                          \n\t"
-        "dmfc1     %[tmp5],     %[ftmp4]                                \n\t"
-        "dmfc1     %[tmp7],     %[ftmp1]                                \n\t"
-        "mov.d     %[ftmp12],   %[ftmp3]                                \n\t"
-        "mov.d     %[ftmp14],   %[ftmp2]                                \n\t"
-        "punpckhhw %[ftmp2],    %[ftmp0],       %[ftmp6]                \n\t"
-        "punpcklhw %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
-        "punpckhhw %[ftmp6],    %[ftmp5],       %[ftmp7]                \n\t"
-        "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "punpckhwd %[ftmp7],    %[ftmp0],       %[ftmp5]                \n\t"
-        "punpcklwd %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
-        "punpckhwd %[ftmp5],    %[ftmp2],       %[ftmp6]                \n\t"
-        "punpcklwd %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
-        "dmfc1     %[tmp6],     %[ftmp0]                                \n\t"
-        "mov.d     %[ftmp11],   %[ftmp7]                                \n\t"
-        "mov.d     %[ftmp13],   %[ftmp2]                                \n\t"
-        "mov.d     %[ftmp15],   %[ftmp5]                                \n\t"
-        PTR_ADDIU "%[addr0],    %[dst],         0x04                    \n\t"
-        "dmtc1     %[tmp7],     %[ftmp7]                                \n\t"
-        "dmtc1     %[tmp3],     %[ftmp6]                                \n\t"
-        "ldc1      %[ftmp1],    0x10($29)                               \n\t"
-        "dmtc1     %[tmp1],     %[ftmp3]                                \n\t"
-        "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp0],    %[ftmp7],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp14]               \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
-        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp14]               \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp14]               \n\t"
-        "psrah     %[ftmp5],    %[ftmp14],      %[ftmp8]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "mov.d     %[ftmp5],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp6],    %[ftmp0],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp4]                \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "mov.d     %[ftmp7],    %[ftmp12]                               \n\t"
-        "psrah     %[ftmp2],    %[ftmp12],      %[ftmp8]                \n\t"
-        "psrah     %[ftmp0],    %[ftmp3],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
-        "ldc1      %[ftmp3],    0x00($29)                               \n\t"
-        "dmtc1     %[tmp5],     %[ftmp7]                                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
-        "sdc1      %[ftmp3],    0x00($29)                               \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
-        "sdc1      %[ftmp0],    0x10($29)                               \n\t"
-        "dmfc1     %[tmp1],     %[ftmp2]                                \n\t"
-        "xor       %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "sdc1      %[ftmp2],    0x00(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x08(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x10(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x18(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x20(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x28(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x30(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x38(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x40(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x48(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x50(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x58(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x60(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x68(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x70(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x78(%[block])                          \n\t"
-        "dli       %[tmp3],     0x06                                    \n\t"
-        "uld       %[low32],    0x00(%[dst])                            \n\t"
-        "mtc1      %[low32],    %[ftmp3]                                \n\t"
-        "mtc1      %[tmp3],     %[ftmp10]                               \n\t"
-        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
-        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
-        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[dst])                            \n\t"
-        "mtc1      %[low32],    %[ftmp3]                                \n\t"
-        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
-        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
-        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
-        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "ldc1      %[ftmp5],    0x00($29)                               \n\t"
-        "ldc1      %[ftmp4],    0x10($29)                               \n\t"
-        "dmtc1     %[tmp1],     %[ftmp6]                                \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[dst])                            \n\t"
-        "mtc1      %[low32],    %[ftmp3]                                \n\t"
-        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
-        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
-        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
-        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[dst])                            \n\t"
-        "mtc1      %[low32],    %[ftmp3]                                \n\t"
-        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
-        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
-        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
-        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "dmtc1     %[tmp4],     %[ftmp1]                                \n\t"
-        "dmtc1     %[tmp2],     %[ftmp6]                                \n\t"
-        "ldc1      %[ftmp4],    0x18($29)                               \n\t"
-        "mov.d     %[ftmp5],    %[ftmp4]                                \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp11],      %[ftmp8]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp11]               \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp15]               \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp11]               \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp1]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp11],      %[ftmp1]                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp15]               \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp15]               \n\t"
-        "psrah     %[ftmp2],    %[ftmp15],      %[ftmp8]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "mov.d     %[ftmp2],    %[ftmp4]                                \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp1],    %[ftmp7],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp9]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
-        "mov.d     %[ftmp3],    %[ftmp13]                               \n\t"
-        "psrah     %[ftmp0],    %[ftmp13],      %[ftmp8]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp6],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
-        "ldc1      %[ftmp6],    0x08($29)                               \n\t"
-        "dmtc1     %[tmp6],     %[ftmp3]                                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "sdc1      %[ftmp6],    0x08($29)                               \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
-        "sdc1      %[ftmp7],    0x18($29)                               \n\t"
-        "dmfc1     %[tmp2],     %[ftmp0]                                \n\t"
-        "xor       %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "uld       %[low32],    0x00(%[addr0])                          \n\t"
-        "mtc1      %[low32],    %[ftmp6]                                \n\t"
-        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
-        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
-        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[addr0])                          \n\t"
-        "mtc1      %[low32],    %[ftmp6]                                \n\t"
-        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
-        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
-        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
-        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "ldc1      %[ftmp2],    0x08($29)                               \n\t"
-        "ldc1      %[ftmp5],    0x18($29)                               \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        "dmtc1     %[tmp2],     %[ftmp1]                                \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[addr0])                          \n\t"
-        "mtc1      %[low32],    %[ftmp6]                                \n\t"
-        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
-        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
-        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[addr0])                          \n\t"
-        "mtc1      %[low32],    %[ftmp6]                                \n\t"
-        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
-        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
-        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
-        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        PTR_ADDIU "$29,         $29,            0x20                    \n\t"
+        "lhu        %[tmp0],    0x00(%[block])                          \n\t"
+        PTR_ADDI   "$29,        $29,            -0x20                   \n\t"
+        PTR_ADDIU  "%[tmp0],    %[tmp0],        0x20                    \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x10)
+        "sh         %[tmp0],    0x00(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x20)
+        "dli        %[tmp0],    0x01                                    \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x30)
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        MMI_LDC1(%[ftmp5], %[block], 0x50)
+        MMI_LDC1(%[ftmp6], %[block], 0x60)
+        MMI_LDC1(%[ftmp7], %[block], 0x70)
+        "mov.d      %[ftmp0],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp5],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp4],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp6]                                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x00)
+        MMI_LDC1(%[ftmp5], %[block], 0x40)
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        MMI_SDC1(%[ftmp6], %[block], 0x00)
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp7], $29, 0x00)
+        MMI_SDC1(%[ftmp1], $29, 0x10)
+        "dmfc1      %[tmp1],    %[ftmp6]                                \n\t"
+        "dmfc1      %[tmp3],    %[ftmp3]                                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp5],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp5],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp5], $29, 0x08)
+        MMI_SDC1(%[ftmp0], $29, 0x18)
+        "dmfc1      %[tmp2],    %[ftmp3]                                \n\t"
+        "dmfc1      %[tmp4],    %[ftmp4]                                \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x18)
+        MMI_LDC1(%[ftmp6], %[block], 0x28)
+        MMI_LDC1(%[ftmp2], %[block], 0x38)
+        MMI_LDC1(%[ftmp0], %[block], 0x58)
+        MMI_LDC1(%[ftmp3], %[block], 0x68)
+        MMI_LDC1(%[ftmp4], %[block], 0x78)
+        "mov.d      %[ftmp7],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp5],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp2],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "mov.d      %[ftmp0],   %[ftmp3]                                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp6], %[block], 0x08)
+        MMI_LDC1(%[ftmp0], %[block], 0x48)
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x08)
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhhw  %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp4],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp3],       %[ftmp7]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp7], %[block], 0x08)
+        "dmfc1      %[tmp5],    %[ftmp4]                                \n\t"
+        "mov.d      %[ftmp10],  %[ftmp1]                                \n\t"
+        "mov.d      %[ftmp12],  %[ftmp3]                                \n\t"
+        "mov.d      %[ftmp14],  %[ftmp2]                                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp5],       %[ftmp7]                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "punpckhwd  %[ftmp7],   %[ftmp0],       %[ftmp5]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "dmfc1      %[tmp6],    %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp11],  %[ftmp7]                                \n\t"
+        "mov.d      %[ftmp13],  %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp15],  %[ftmp5]                                \n\t"
+        PTR_ADDIU  "%[addr0],   %[dst],         0x04                    \n\t"
+        "mov.d      %[ftmp7],   %[ftmp10]                               \n\t"
+        "dmtc1      %[tmp3],    %[ftmp6]                                \n\t"
+        MMI_LDC1(%[ftmp1], $29, 0x10)
+        "dmtc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp14]               \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp14]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp14]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp14],      %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp12]                               \n\t"
+        "psrah      %[ftmp2],   %[ftmp12],      %[ftmp8]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp3], $29, 0x00)
+        "dmtc1      %[tmp5],    %[ftmp7]                                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp3], $29, 0x00)
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_SDC1(%[ftmp0], $29, 0x10)
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp2], %[block], 0x00)
+        MMI_SDC1(%[ftmp2], %[block], 0x08)
+        MMI_SDC1(%[ftmp2], %[block], 0x10)
+        MMI_SDC1(%[ftmp2], %[block], 0x18)
+        MMI_SDC1(%[ftmp2], %[block], 0x20)
+        MMI_SDC1(%[ftmp2], %[block], 0x28)
+        MMI_SDC1(%[ftmp2], %[block], 0x30)
+        MMI_SDC1(%[ftmp2], %[block], 0x38)
+        MMI_SDC1(%[ftmp2], %[block], 0x40)
+        MMI_SDC1(%[ftmp2], %[block], 0x48)
+        MMI_SDC1(%[ftmp2], %[block], 0x50)
+        MMI_SDC1(%[ftmp2], %[block], 0x58)
+        MMI_SDC1(%[ftmp2], %[block], 0x60)
+        MMI_SDC1(%[ftmp2], %[block], 0x68)
+        MMI_SDC1(%[ftmp2], %[block], 0x70)
+        MMI_SDC1(%[ftmp2], %[block], 0x78)
+        "dli        %[tmp3],    0x06                                    \n\t"
+        "mtc1       %[tmp3],    %[ftmp10]                               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        MMI_LDC1(%[ftmp5], $29, 0x00)
+        MMI_LDC1(%[ftmp4], $29, 0x10)
+        "dmtc1      %[tmp1],    %[ftmp6]                                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "dmtc1      %[tmp4],    %[ftmp1]                                \n\t"
+        "dmtc1      %[tmp2],    %[ftmp6]                                \n\t"
+        MMI_LDC1(%[ftmp4], $29, 0x18)
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp11],      %[ftmp8]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp11]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp11],      %[ftmp1]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp15]               \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp15]               \n\t"
+        "psrah      %[ftmp2],   %[ftmp15],      %[ftmp8]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "mov.d      %[ftmp2],   %[ftmp4]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "mov.d      %[ftmp3],   %[ftmp13]                               \n\t"
+        "psrah      %[ftmp0],   %[ftmp13],      %[ftmp8]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        MMI_LDC1(%[ftmp6], $29, 0x08)
+        "dmtc1      %[tmp6],    %[ftmp3]                                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp6], $29, 0x08)
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp7], $29, 0x18)
+        "dmfc1      %[tmp2],    %[ftmp0]                                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp2], $29, 0x08)
+        MMI_LDC1(%[ftmp5], $29, 0x18)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "dmtc1      %[tmp2],    %[ftmp1]                                \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -648,9 +628,11 @@  void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
           [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
           [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
-          [tmp6]"=&r"(tmp[6]),              [tmp7]"=&r"(tmp[7]),
-          [addr0]"=&r"(addr[0]),
-          [low32]"=&r"(low32)
+          [tmp6]"=&r"(tmp[6]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
         : [dst]"r"(dst),                    [block]"r"(block),
           [stride]"r"((mips_reg)stride)
         : "$29","memory"
@@ -663,7 +645,7 @@  void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
     int dc = (block[0] + 32) >> 6;
     double ftmp[6];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     block[0] = 0;
 
@@ -671,14 +653,10 @@  void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "mtc1       %[dc],      %[ftmp5]                                \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-        "uld        %[low32],   0x00(%[dst0])                           \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[dst1])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x00(%[dst2])                           \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
-        "uld        %[low32],   0x00(%[dst3])                           \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_ULWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_ULWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_ULWC1(%[ftmp4], %[dst3], 0x00)
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
@@ -691,18 +669,15 @@  void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
-        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t"
-        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
-        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t"
-        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
+        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
-          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
-          [low32]"=&r"(low32)
+          [ftmp4]"=&f"(ftmp[4]),
+          RESTRICT_ASM_LOW32
+          [ftmp5]"=&f"(ftmp[5])
         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
           [dc]"r"(dc)
@@ -714,6 +689,7 @@  void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
     int dc = (block[0] + 32) >> 6;
     double ftmp[10];
+    DECLARE_VAR_ALL64;
 
     block[0] = 0;
 
@@ -721,10 +697,10 @@  void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "mtc1       %[dc],      %[ftmp5]                                \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-        "ldc1       %[ftmp1],   0x00(%[dst0])                           \n\t"
-        "ldc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
-        "ldc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
-        "ldc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
+        MMI_LDC1(%[ftmp1], %[dst0], 0x00)
+        MMI_LDC1(%[ftmp2], %[dst1], 0x00)
+        MMI_LDC1(%[ftmp3], %[dst2], 0x00)
+        MMI_LDC1(%[ftmp4], %[dst3], 0x00)
         "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
@@ -745,15 +721,15 @@  void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
-        "sdc1       %[ftmp1],   0x00(%[dst0])                           \n\t"
-        "sdc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
-        "sdc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
-        "sdc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
-
-        "ldc1       %[ftmp1],   0x00(%[dst4])                           \n\t"
-        "ldc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
-        "ldc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
-        "ldc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
+        MMI_SDC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SDC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SDC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst3], 0x00)
+
+        MMI_LDC1(%[ftmp1], %[dst4], 0x00)
+        MMI_LDC1(%[ftmp2], %[dst5], 0x00)
+        MMI_LDC1(%[ftmp3], %[dst6], 0x00)
+        MMI_LDC1(%[ftmp4], %[dst7], 0x00)
         "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
@@ -774,15 +750,17 @@  void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
-        "sdc1       %[ftmp1],   0x00(%[dst4])                           \n\t"
-        "sdc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
-        "sdc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
-        "sdc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
+        MMI_SDC1(%[ftmp1], %[dst4], 0x00)
+        MMI_SDC1(%[ftmp2], %[dst5], 0x00)
+        MMI_SDC1(%[ftmp3], %[dst6], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst7], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
-          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9])
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          [ftmp9]"=&f"(ftmp[9])
         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
           [dst4]"r"(dst+4*stride),          [dst5]"r"(dst+5*stride),
@@ -888,17 +866,18 @@  void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
 {
     double ftmp[10];
     uint64_t tmp[2];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         ".set       noreorder                                           \n\t"
         "dli        %[tmp0],    0x08                                    \n\t"
-        "ldc1       %[ftmp3],   0x18(%[input])                          \n\t"
+        MMI_LDC1(%[ftmp3], %[input], 0x18)
         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-        "ldc1       %[ftmp2],   0x10(%[input])                          \n\t"
+        MMI_LDC1(%[ftmp2], %[input], 0x10)
         "dli        %[tmp0],    0x20                                    \n\t"
-        "ldc1       %[ftmp1],   0x08(%[input])                          \n\t"
+        MMI_LDC1(%[ftmp1], %[input], 0x08)
         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-        "ldc1       %[ftmp0],   0x00(%[input])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[input], 0x00)
         "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
         "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
@@ -1009,7 +988,10 @@  void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
         "sh         %[input],   0x1e0(%[output])                        \n\t"
         "1:                                                             \n\t"
         "ori        %[tmp0],    $0,             0x1f                    \n\t"
+#if HAVE_LOONGSON3
         "clz        %[tmp1],    %[qmul]                                 \n\t"
+#elif HAVE_LOONGSON2
+#endif
         "ori        %[input],   $0,             0x07                    \n\t"
         "dsubu      %[tmp1],    %[tmp0],        %[tmp1]                 \n\t"
         "ori        %[tmp0],    $0,             0x80                    \n\t"
@@ -1098,6 +1080,7 @@  void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ALL64
           [output]"+&r"(output),            [input]"+&r"(input),
           [qmul]"+&r"(qmul)
         : [ff_pw_1]"f"(ff_pw_1)
@@ -1157,6 +1140,7 @@  void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
 {
     int y;
     double ftmp[8];
+    DECLARE_VAR_ALL64;
 
     offset <<= log2_denom;
 
@@ -1166,8 +1150,8 @@  void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "ldc1       %[ftmp1],   0x00(%[block0])                     \n\t"
-            "ldc1       %[ftmp2],   0x00(%[block1])                     \n\t"
+            MMI_LDC1(%[ftmp1], %[block0], 0x00)
+            MMI_LDC1(%[ftmp2], %[block1], 0x00)
             "mtc1       %[weight],  %[ftmp3]                            \n\t"
             "mtc1       %[offset],  %[ftmp4]                            \n\t"
             "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
@@ -1191,12 +1175,14 @@  void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
             "psrah      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[block0])                     \n\t"
-            "sdc1       %[ftmp2],   0x00(%[block1])                     \n\t"
+            MMI_SDC1(%[ftmp1], %[block0], 0x00)
+            MMI_SDC1(%[ftmp2], %[block1], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
-              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7])
+              [ftmp6]"=&f"(ftmp[6]),
+              RESTRICT_ASM_ALL64
+              [ftmp7]"=&f"(ftmp[7])
             : [block0]"r"(block),           [block1]"r"(block+8),
               [weight]"r"(weight),          [offset]"r"(offset),
               [log2_denom]"r"(log2_denom)
@@ -1205,19 +1191,21 @@  void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     }
 }
 
-void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-        int height, int log2_denom, int weightd, int weights, int offset)
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
 {
     int y;
     double ftmp[9];
+    DECLARE_VAR_ALL64;
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "ldc1       %[ftmp1],   0x00(%[src0])                       \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst0])                       \n\t"
+            MMI_LDC1(%[ftmp1], %[src0], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst0], 0x00)
             "mtc1       %[weights], %[ftmp3]                            \n\t"
             "mtc1       %[weightd], %[ftmp4]                            \n\t"
             "mtc1       %[offset],  %[ftmp5]                            \n\t"
@@ -1240,9 +1228,9 @@  void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t strid
             "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst0])                       \n\t"
-            "ldc1       %[ftmp1],   0x00(%[src1])                       \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst1])                       \n\t"
+            MMI_SDC1(%[ftmp1], %[dst0], 0x00)
+            MMI_LDC1(%[ftmp1], %[src1], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst1], 0x00)
             "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -1258,11 +1246,12 @@  void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t strid
             "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst1])                       \n\t"
+            MMI_SDC1(%[ftmp1], %[dst1], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              RESTRICT_ASM_ALL64
               [ftmp8]"=&f"(ftmp[8])
             : [dst0]"r"(dst),               [dst1]"r"(dst+8),
               [src0]"r"(src),               [src1]"r"(src+8),
@@ -1278,6 +1267,7 @@  void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
 {
     int y;
     double ftmp[6];
+    DECLARE_VAR_ALL64;
 
     offset <<= log2_denom;
 
@@ -1287,7 +1277,7 @@  void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "ldc1       %[ftmp1],   0x00(%[block])                      \n\t"
+            MMI_LDC1(%[ftmp1], %[block], 0x00)
             "mtc1       %[weight],  %[ftmp2]                            \n\t"
             "mtc1       %[offset],  %[ftmp3]                            \n\t"
             "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
@@ -1302,10 +1292,12 @@  void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
             "psrah      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[block])                      \n\t"
+            MMI_SDC1(%[ftmp1], %[block], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
-              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5])
+              [ftmp4]"=&f"(ftmp[4]),
+              RESTRICT_ASM_ALL64
+              [ftmp5]"=&f"(ftmp[5])
             : [block]"r"(block),            [weight]"r"(weight),
               [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
             : "memory"
@@ -1313,19 +1305,21 @@  void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     }
 }
 
-void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-        int height, int log2_denom, int weightd, int weights, int offset)
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
 {
     int y;
     double ftmp[9];
+    DECLARE_VAR_ALL64;
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "ldc1       %[ftmp1],   0x00(%[src])                        \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp1], %[src], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "mtc1       %[weights], %[ftmp3]                            \n\t"
             "mtc1       %[weightd], %[ftmp4]                            \n\t"
             "mtc1       %[offset],  %[ftmp5]                            \n\t"
@@ -1348,11 +1342,12 @@  void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride
             "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              RESTRICT_ASM_ALL64
               [ftmp8]"=&f"(ftmp[8])
             : [dst]"r"(dst),                [src]"r"(src),
               [weights]"r"(weights),        [weightd]"r"(weightd),
@@ -1367,7 +1362,7 @@  void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
 {
     int y;
     double ftmp[5];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     offset <<= log2_denom;
 
@@ -1377,8 +1372,7 @@  void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "uld        %[low32],   0x00(%[block])                      \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[block], 0x00)
             "mtc1       %[weight],  %[ftmp2]                            \n\t"
             "mtc1       %[offset],  %[ftmp3]                            \n\t"
             "mtc1       %[log2_denom],              %[ftmp4]            \n\t"
@@ -1389,12 +1383,11 @@  void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
             "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "gsswlc1    %[ftmp1],   0x03(%[block])                      \n\t"
-            "gsswrc1    %[ftmp1],   0x00(%[block])                      \n\t"
+            MMI_SWC1(%[ftmp1], %[block], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
-              [ftmp4]"=&f"(ftmp[4]),
-              [low32]"=&r"(low32)
+              RESTRICT_ASM_LOW32
+              [ftmp4]"=&f"(ftmp[4])
             : [block]"r"(block),            [weight]"r"(weight),
               [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
             : "memory"
@@ -1402,22 +1395,21 @@  void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     }
 }
 
-void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-        int height, int log2_denom, int weightd, int weights, int offset)
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
 {
     int y;
     double ftmp[7];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x00(%[dst])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[dst], 0x00)
             "mtc1       %[weight],  %[ftmp3]                            \n\t"
             "mtc1       %[weightd], %[ftmp4]                            \n\t"
             "mtc1       %[offset],  %[ftmp5]                            \n\t"
@@ -1433,13 +1425,12 @@  void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride
             "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "gsswlc1    %[ftmp1],   0x03(%[dst])                        \n\t"
-            "gsswrc1    %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
-              [ftmp6]"=&f"(ftmp[6]),
-              [low32]"=&r"(low32)
+              RESTRICT_ASM_LOW32
+              [ftmp6]"=&f"(ftmp[6])
             : [dst]"r"(dst),                [src]"r"(src),
               [weight]"r"(weights),         [weightd]"r"(weightd),
               [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
@@ -1453,7 +1444,9 @@  void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 {
     double ftmp[12];
     mips_reg addr[2];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
@@ -1463,10 +1456,10 @@  void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         PTR_SUBU   "%[addr1],   $0,             %[addr1]                \n\t"
         "addi       %[beta],    %[beta],        -0x01                   \n\t"
         PTR_ADDU   "%[addr1],   %[addr1],       %[pix]                  \n\t"
-        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
-        "gsldxc1    %[ftmp1],   0x00(%[addr1],  %[stride])              \n\t"
-        "gsldxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t"
-        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp1], %[addr1], %[stride], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
         "mtc1       %[beta],    %[ftmp6]                                \n\t"
         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
@@ -1489,12 +1482,11 @@  void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
         "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
         "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
-        "uld        %[low32],   0x00(%[tc0])                            \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_ULWC1(%[ftmp5], %[tc0], 0x00)
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp5]                \n\t"
         "pcmpgtb    %[ftmp5],   %[ftmp9],       %[ftmp4]                \n\t"
-        "ldc1       %[ftmp4],   0x00(%[addr1])                          \n\t"
+        MMI_LDC1(%[ftmp4], %[addr1], 0x00)
         "and        %[ftmp10],  %[ftmp5],       %[ftmp8]                \n\t"
         "psubusb    %[ftmp8],   %[ftmp4],       %[ftmp2]                \n\t"
         "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
@@ -1506,7 +1498,7 @@  void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "psubb      %[ftmp8],   %[ftmp5],       %[ftmp7]                \n\t"
         "and        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
         "pavgb      %[ftmp5],   %[ftmp2],       %[ftmp3]                \n\t"
-        "ldc1       %[ftmp11],  0x00(%[addr1])                          \n\t"
+        MMI_LDC1(%[ftmp11], %[addr1], 0x00)
         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "xor        %[ftmp5],   %[ftmp5],       %[ftmp11]               \n\t"
         "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
@@ -1515,8 +1507,8 @@  void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "paddusb    %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
         "pmaxub     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "pminub     %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
-        "gssdxc1    %[ftmp4],   0x00(%[addr1],  %[stride])              \n\t"
-        "gsldxc1    %[ftmp5],   0x00(%[pix],    %[addr0])               \n\t"
+        MMI_SDXC1(%[ftmp4], %[addr1], %[stride], 0x00)
+        MMI_LDXC1(%[ftmp5], %[pix], %[addr0], 0x00)
         "psubusb    %[ftmp4],   %[ftmp5],       %[ftmp3]                \n\t"
         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp5]                \n\t"
         "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
@@ -1525,9 +1517,9 @@  void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
         "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
         "and        %[ftmp6],   %[ftmp9],       %[ftmp7]                \n\t"
-        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
         "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
-        "gsldxc1    %[ftmp11],  0x00(%[pix],    %[addr0])               \n\t"
+        MMI_LDXC1(%[ftmp11], %[pix], %[addr0], 0x00)
         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
         "and        %[ftmp7],   %[ftmp7],       %[ff_pb_1]              \n\t"
@@ -1536,7 +1528,7 @@  void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "paddusb    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
         "pmaxub     %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
         "pminub     %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-        "gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
         "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
         "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
@@ -1555,16 +1547,18 @@  void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
         "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
         "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
-        "gssdxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t"
-        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        MMI_SDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
-          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
           [alpha]"r"((mips_reg)alpha),      [beta]"r"((mips_reg)beta),
           [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
@@ -1580,203 +1574,205 @@  static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
     double ftmp[16];
     uint64_t tmp[1];
     mips_reg addr[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
-__asm__ volatile (
-"ori        %[tmp0],    $0,             0x01                    \n\t"
-"xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-"mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-PTR_SLL    "%[addr0],   %[stride],      0x02                    \n\t"
-PTR_ADDU   "%[addr2],   %[stride],      %[stride]               \n\t"
-PTR_ADDIU  "%[alpha],   %[alpha],       -0x01                   \n\t"
-PTR_SLL    "%[ftmp11],  %[ftmp9],       %[ftmp9]                \n\t"
-"bltz       %[alpha],   1f                                      \n\t"
-PTR_ADDU   "%[addr1],   %[addr2],       %[stride]               \n\t"
-PTR_ADDIU  "%[beta],    %[beta],        -0x01                   \n\t"
-"bltz       %[beta],    1f                                      \n\t"
-PTR_SUBU   "%[addr0],   $0,             %[addr0]                \n\t"
-PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
-"ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
-"gsldxc1    %[ftmp1],   0x00(%[addr0],  %[addr2])               \n\t"
-"gsldxc1    %[ftmp2],   0x00(%[addr0],  %[addr1])               \n\t"
-"gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
-"mtc1       %[alpha],   %[ftmp5]                                \n\t"
-"mtc1       %[beta],    %[ftmp6]                                \n\t"
-"pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-"pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
-"packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
-"psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
-"psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
-"packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
-"or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-"sdc1       %[ftmp5],   0x10+%[stack]                           \n\t"
-"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
-"psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
-"or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
-"psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
-"or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-"ldc1       %[ftmp5],   0x10+%[stack]                           \n\t"
-"pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-"ldc1       %[ftmp10],  %[ff_pb_1]                              \n\t"
-"sdc1       %[ftmp8],   0x20+%[stack]                           \n\t"
-"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-"psubusb    %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
-"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
-"psubusb    %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
-"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
-"psubusb    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
-"ldc1       %[ftmp15],  0x20+%[stack]                           \n\t"
-"pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
-"gsldxc1    %[ftmp15],  0x00(%[addr0],  %[stride])              \n\t"
-"psubusb    %[ftmp8],   %[ftmp15],      %[ftmp2]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp2],       %[ftmp15]               \n\t"
-"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
-"and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"gsldxc1    %[ftmp14],  0x00(%[pix],    %[addr2])               \n\t"
-"sdc1       %[ftmp5],   0x30+%[stack]                           \n\t"
-"psubusb    %[ftmp8],   %[ftmp14],      %[ftmp3]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp3],       %[ftmp14]               \n\t"
-"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
-"and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"sdc1       %[ftmp5],   0x40+%[stack]                           \n\t"
-"pavgb      %[ftmp5],   %[ftmp15],      %[ftmp1]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
-"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
-"paddb      %[ftmp7],   %[ftmp15],      %[ftmp1]                \n\t"
-"paddb      %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
-"sdc1       %[ftmp7],   0x00+%[stack]                           \n\t"
-"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp15],      %[ftmp4]                \n\t"
-"psubb      %[ftmp7],   %[ftmp15],      %[ftmp4]                \n\t"
-"paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
-"psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"ldc1       %[ftmp13],  0x10+%[stack]                           \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-"psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
-"pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
-"xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
-"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-"xor        %[ftmp8],   %[ftmp2],       %[ftmp4]                \n\t"
-"pavgb      %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
-"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
-"psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"ldc1       %[ftmp13],  0x30+%[stack]                           \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-"ldc1       %[ftmp12],  0x20+%[stack]                           \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
-"and        %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
-"gssdxc1    %[ftmp6],   0x00(%[addr0],  %[addr1])               \n\t"
-"ldc1       %[ftmp6],   0x00(%[addr0])                          \n\t"
-"paddb      %[ftmp7],   %[ftmp15],      %[ftmp6]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
-"ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
-"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"ldc1       %[ftmp12],  0x30+%[stack]                           \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
-"and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
-"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
-"xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
-"gssdxc1    %[ftmp5],   0x00(%[addr0],  %[addr2])               \n\t"
-"gssdxc1    %[ftmp6],   0x00(%[addr0],  %[stride])              \n\t"
-"pavgb      %[ftmp5],   %[ftmp14],      %[ftmp4]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp3],       %[ftmp2]                \n\t"
-"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
-"paddb      %[ftmp7],   %[ftmp14],      %[ftmp4]                \n\t"
-"paddb      %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
-"sdc1       %[ftmp7],   0x00+%[stack]                           \n\t"
-"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp14],      %[ftmp1]                \n\t"
-"paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
-"psubb      %[ftmp7],   %[ftmp14],      %[ftmp1]                \n\t"
-"psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"ldc1       %[ftmp12],  0x10+%[stack]                           \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
-"psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
-"pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
-"xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
-"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-"xor        %[ftmp8],   %[ftmp3],       %[ftmp1]                \n\t"
-"pavgb      %[ftmp7],   %[ftmp3],       %[ftmp1]                \n\t"
-"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
-"ldc1       %[ftmp12],  0x40+%[stack]                           \n\t"
-"psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"ldc1       %[ftmp13],  0x20+%[stack]                           \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
-"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp13]               \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
-"sdc1       %[ftmp6],   0x00(%[pix])                            \n\t"
-"gsldxc1    %[ftmp6],   0x00(%[pix],    %[addr1])               \n\t"
-"paddb      %[ftmp7],   %[ftmp14],      %[ftmp6]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
-"ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
-"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"ldc1       %[ftmp12],  0x40+%[stack]                           \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
-"and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
-"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
-"xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
-"gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t"
-"gssdxc1    %[ftmp6],   0x00(%[pix],    %[addr2])               \n\t"
-"1:                                                             \n\t"
+    __asm__ volatile (
+        "ori        %[tmp0],    $0,             0x01                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        PTR_SLL    "%[addr0],   %[stride],      0x02                    \n\t"
+        PTR_ADDU   "%[addr2],   %[stride],      %[stride]               \n\t"
+        PTR_ADDIU  "%[alpha],   %[alpha],       -0x01                   \n\t"
+        PTR_SLL    "%[ftmp11],  %[ftmp9],       %[ftmp9]                \n\t"
+        "bltz       %[alpha],   1f                                      \n\t"
+        PTR_ADDU   "%[addr1],   %[addr2],       %[stride]               \n\t"
+        PTR_ADDIU  "%[beta],    %[beta],        -0x01                   \n\t"
+        "bltz       %[beta],    1f                                      \n\t"
+        PTR_SUBU   "%[addr0],   $0,             %[addr0]                \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp1], %[addr0], %[addr2], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[addr1], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp5], %[stack], 0x10)
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp5], %[stack], 0x10)
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "ldc1       %[ftmp10],  %[ff_pb_1]                              \n\t"
+        MMI_SDC1(%[ftmp8], %[stack], 0x20)
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        MMI_LDC1(%[ftmp15], %[stack], 0x20)
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
+        MMI_LDXC1(%[ftmp15], %[addr0], %[stride], 0x00)
+        "psubusb    %[ftmp8],   %[ftmp15],      %[ftmp2]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp2],       %[ftmp15]               \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_LDXC1(%[ftmp14], %[pix], %[addr2], 0x00)
+        MMI_SDC1(%[ftmp5], %[stack], 0x30)
+        "psubusb    %[ftmp8],   %[ftmp14],      %[ftmp3]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp3],       %[ftmp14]               \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp5], %[stack], 0x40)
+        "pavgb      %[ftmp5],   %[ftmp15],      %[ftmp1]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        MMI_SDC1(%[ftmp6], %[stack], 0x10)
+        "paddb      %[ftmp7],   %[ftmp15],      %[ftmp1]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+        MMI_SDC1(%[ftmp7], %[stack], 0x00)
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp15],      %[ftmp4]                \n\t"
+        "psubb      %[ftmp7],   %[ftmp15],      %[ftmp4]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x10)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+        "pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "xor        %[ftmp8],   %[ftmp2],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x30)
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x20)
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        MMI_SDXC1(%[ftmp6], %[addr0], %[addr1], 0x00)
+        MMI_LDC1(%[ftmp6], %[addr0], 0x00)
+        "paddb      %[ftmp7],   %[ftmp15],      %[ftmp6]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x30)
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        MMI_SDXC1(%[ftmp5], %[addr0], %[addr2], 0x00)
+        MMI_SDXC1(%[ftmp6], %[addr0], %[stride], 0x00)
+        "pavgb      %[ftmp5],   %[ftmp14],      %[ftmp4]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp3],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        MMI_SDC1(%[ftmp6], %[stack], 0x10)
+        "paddb      %[ftmp7],   %[ftmp14],      %[ftmp4]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+        MMI_SDC1(%[ftmp7], %[stack], 0x00)
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp14],      %[ftmp1]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "psubb      %[ftmp7],   %[ftmp14],      %[ftmp1]                \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x10)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "xor        %[ftmp8],   %[ftmp3],       %[ftmp1]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp3],       %[ftmp1]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x40)
+        "psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x20)
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp13]               \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp6], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp6], %[pix], %[addr1], 0x00)
+        "paddb      %[ftmp7],   %[ftmp14],      %[ftmp6]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x40)
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
+        MMI_SDXC1(%[ftmp6], %[pix], %[addr2], 0x00)
+        "1:                                                             \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -1785,22 +1781,26 @@  PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
           [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
-  [tmp0]"=&r"(tmp[0]),
-  [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-  [addr2]"=&r"(addr[2]),
-  [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
-  [stack]"m"(stack[0]),             [ff_pb_1]"m"(ff_pb_1)
-: "memory"
-);
+          [stack]"r"(stack),                [ff_pb_1]"m"(ff_pb_1)
+        : "memory"
+    );
 }
 
-void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
-        int8_t *tc0)
+void ff_deblock_v_chroma_8_mmi(uint8_t *pix, ptrdiff_t stride, int alpha,
+        int beta, int8_t *tc0)
 {
     double ftmp[9];
     mips_reg addr[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
@@ -1808,10 +1808,10 @@  void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "or         %[addr0],   $0,             %[pix]                  \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
-        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
-        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
 
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
@@ -1836,8 +1836,7 @@  void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-        "uld        %[low32],   0x00(%[tc0])                            \n\t"
-        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        MMI_ULWC1(%[ftmp7], %[tc0], 0x00)
         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "and        %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
         "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
@@ -1859,15 +1858,17 @@  void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
         "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 
-        "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
-        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
-          [addr0]"=&r"(addr[0]),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
           [alpha]"r"(alpha),                [beta]"r"(beta),
           [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
@@ -1881,6 +1882,8 @@  void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
 {
     double ftmp[9];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
@@ -1888,10 +1891,10 @@  void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "or         %[addr0],   $0,             %[pix]                  \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
-        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
-        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
 
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
@@ -1935,13 +1938,15 @@  void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
         "paddb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 
-        "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
-        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0])
         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
           [alpha]"r"(alpha),                [beta]"r"(beta),
@@ -1955,7 +1960,7 @@  void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 {
     double ftmp[11];
     mips_reg addr[6];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     __asm__ volatile (
         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
@@ -1966,32 +1971,24 @@  void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
         "or         %[addr5],   $0,             %[pix]                  \n\t"
         PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
-        "uld        %[low32],   0x00(%[addr5])                          \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[pix])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
+        MMI_ULWC1(%[ftmp3], %[pix], 0x00)
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
         "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
@@ -2027,8 +2024,7 @@  void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
         "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
         "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
-        "uld        %[low32],   0x00(%[tc0])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp6], %[tc0], 0x00)
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
         "and        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
         "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
@@ -2057,48 +2053,40 @@  void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp10],      %[ftmp10]               \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
+        MMI_USWC1(%[ftmp0], %[pix], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
         PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
         "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
-        "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp9], %[addr3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
+          RESTRICT_ASM_LOW32
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
-          [pix]"+&r"(pix),
-          [low32]"=&r"(low32)
+          [pix]"+&r"(pix)
         : [alpha]"r"(alpha),                [beta]"r"(beta),
           [stride]"r"((mips_reg)stride),    [tc0]"r"(tc0),
           [ff_pb_1]"f"(ff_pb_1),            [ff_pb_3]"f"(ff_pb_3),
@@ -2112,7 +2100,7 @@  void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
 {
     double ftmp[11];
     mips_reg addr[6];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     __asm__ volatile (
         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
@@ -2123,32 +2111,24 @@  void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
         "or         %[addr5],   $0,             %[pix]                  \n\t"
         PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
-        "uld        %[low32],   0x00(%[addr5])                          \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[pix])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
+        MMI_ULWC1(%[ftmp3], %[pix], 0x00)
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
         "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
@@ -2208,48 +2188,40 @@  void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
+        MMI_USWC1(%[ftmp0], %[pix], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
         PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
+        MMI_USWC1(%[ftmp4], %[addr4], 0x00)
         "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
-        "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp9], %[addr3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
+          RESTRICT_ASM_LOW32
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
-          [pix]"+&r"(pix),
-          [low32]"=&r"(low32)
+          [pix]"+&r"(pix)
         : [alpha]"r"(alpha),                [beta]"r"(beta),
           [stride]"r"((mips_reg)stride),    [ff_pb_1]"f"(ff_pb_1)
         : "memory"
@@ -2275,34 +2247,29 @@  void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
 void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         int8_t *tc0)
 {
-    uint64_t stack[0xd];
+    DECLARE_ALIGNED(8, const uint64_t, stack[0x0d]);
     double ftmp[9];
     mips_reg addr[8];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
         PTR_ADDI   "%[addr1],   %[pix],         -0x4                    \n\t"
         PTR_ADDU   "%[addr2],   %[stride],      %[addr0]                \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[addr3])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
@@ -2311,9 +2278,8 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
-        "sdc1       %[ftmp1],   0x10(%[stack])                          \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr3])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr3])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x10)
+        MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
         PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
@@ -2321,9 +2287,9 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "ldc1       %[ftmp8],   0x10(%[stack])                          \n\t"
+        MMI_LDC1(%[ftmp8], %[stack], 0x10)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[stack])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[stack], 0x00)
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
@@ -2333,32 +2299,25 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
-        "sdc1       %[ftmp1],   0x10(%[stack])                          \n\t"
-        "sdc1       %[ftmp3],   0x20(%[stack])                          \n\t"
-        "sdc1       %[ftmp7],   0x30(%[stack])                          \n\t"
-        "sdc1       %[ftmp5],   0x40(%[stack])                          \n\t"
-        "sdc1       %[ftmp6],   0x50(%[stack])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x10)
+        MMI_SDC1(%[ftmp3], %[stack], 0x20)
+        MMI_SDC1(%[ftmp7], %[stack], 0x30)
+        MMI_SDC1(%[ftmp5], %[stack], 0x40)
+        MMI_SDC1(%[ftmp6], %[stack], 0x50)
         PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
         PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[addr3])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
@@ -2366,9 +2325,8 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
-        "sdc1       %[ftmp1],   0x18(%[stack])                          \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr3])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr3])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x18)
+        MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
         "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
@@ -2376,8 +2334,8 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
-        "ldc1       %[ftmp8],   0x18(%[stack])                          \n\t"
-        "sdc1       %[ftmp0],   0x08(%[stack])                          \n\t"
+        MMI_LDC1(%[ftmp8], %[stack], 0x18)
+        MMI_SDC1(%[ftmp0], %[stack], 0x08)
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
@@ -2387,16 +2345,17 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
-        "sdc1       %[ftmp1],   0x18(%[stack])                          \n\t"
-        "sdc1       %[ftmp3],   0x28(%[stack])                          \n\t"
-        "sdc1       %[ftmp7],   0x38(%[stack])                          \n\t"
-        "sdc1       %[ftmp5],   0x48(%[stack])                          \n\t"
-        "sdc1       %[ftmp6],   0x58(%[stack])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x18)
+        MMI_SDC1(%[ftmp3], %[stack], 0x28)
+        MMI_SDC1(%[ftmp7], %[stack], 0x38)
+        MMI_SDC1(%[ftmp5], %[stack], 0x48)
+        MMI_SDC1(%[ftmp6], %[stack], 0x58)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
@@ -2410,15 +2369,15 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 
     __asm__ volatile (
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
-        PTR_ADDI   "%[addr1],   %[pix],          -0x02                  \n\t"
+        PTR_ADDI   "%[addr1],   %[pix],         -0x02                   \n\t"
         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
         PTR_ADDU   "%[addr2],   %[addr0],       %[stride]               \n\t"
         PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
-        "ldc1       %[ftmp0],   0x10(%[stack])                          \n\t"
-        "ldc1       %[ftmp1],   0x20(%[stack])                          \n\t"
-        "ldc1       %[ftmp2],   0x30(%[stack])                          \n\t"
-        "ldc1       %[ftmp3],   0x40(%[stack])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[stack], 0x10)
+        MMI_LDC1(%[ftmp1], %[stack], 0x20)
+        MMI_LDC1(%[ftmp2], %[stack], 0x30)
+        MMI_LDC1(%[ftmp3], %[stack], 0x40)
         "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
         "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
@@ -2426,43 +2385,35 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr1], 0x00)
         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr5], 0x00)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
         PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
-        "ldc1       %[ftmp0],   0x18(%[stack])                          \n\t"
-        "ldc1       %[ftmp1],   0x28(%[stack])                          \n\t"
-        "ldc1       %[ftmp2],   0x38(%[stack])                          \n\t"
-        "ldc1       %[ftmp3],   0x48(%[stack])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[stack], 0x18)
+        MMI_LDC1(%[ftmp1], %[stack], 0x28)
+        MMI_LDC1(%[ftmp2], %[stack], 0x38)
+        MMI_LDC1(%[ftmp3], %[stack], 0x48)
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
@@ -2473,41 +2424,35 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr1], 0x00)
         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr5], 0x00)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp4], %[addr3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
@@ -2521,10 +2466,11 @@  void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         int beta)
 {
-    uint64_t ptmp[0x11];
-    uint64_t pdat[4];
+    DECLARE_ALIGNED(8, const uint64_t, ptmp[0x11]);
+    DECLARE_ALIGNED(8, const uint64_t, pdat[0x04]);
     double ftmp[9];
     mips_reg addr[7];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
@@ -2533,24 +2479,17 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         PTR_ADDU   "%[addr3],   %[addr0],       %[addr0]                \n\t"
         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr6])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
+        MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
@@ -2558,60 +2497,52 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-        "sdc1       %[ftmp3],   0x00(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x00)
         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-        "sdc1       %[ftmp2],   0x20(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp2],   0x00(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp2], %[ptmp], 0x20)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x00)
         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp5],   0x10(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp7],   0x40(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp4],   0x50(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp8],   0x20(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x00)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x10)
+        MMI_SDC1(%[ftmp7], %[ptmp], 0x40)
+        MMI_SDC1(%[ftmp4], %[ptmp], 0x50)
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x20)
         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t"
-        "sdc1       %[ftmp3],   0x20(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp0],   0x30(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp6],   0x60(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp5],   0x70(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x20)
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x30)
+        MMI_SDC1(%[ftmp6], %[ptmp], 0x60)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x70)
         PTR_ADDU   "%[addr1],   %[addr1],       %[addr5]                \n\t"
         PTR_ADDU   "%[addr4],   %[addr4],       %[addr5]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr6])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
+        MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
@@ -2619,38 +2550,37 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-        "sdc1       %[ftmp3],   0x08(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x08)
         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-        "sdc1       %[ftmp2],   0x28(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp2],   0x08(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp2], %[ptmp], 0x28)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x08)
         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
-        "sdc1       %[ftmp0],   0x08(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp5],   0x18(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp7],   0x48(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp4],   0x58(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp8],   0x28(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x08)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x18)
+        MMI_SDC1(%[ftmp7], %[ptmp], 0x48)
+        MMI_SDC1(%[ftmp4], %[ptmp], 0x58)
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x28)
         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-        "sdc1       %[ftmp3],   0x28(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp0],   0x38(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp6],   0x68(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp5],   0x78(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x28)
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x38)
+        MMI_SDC1(%[ftmp6], %[ptmp], 0x68)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x78)
         PTR_S      "%[addr1],   0x00(%[pdat])                           \n\t"
         PTR_S      "%[addr2],   0x08(%[pdat])                           \n\t"
         PTR_S      "%[addr0],   0x10(%[pdat])                           \n\t"
@@ -2660,6 +2590,7 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
@@ -2677,24 +2608,23 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         PTR_L      "%[addr0],   0x10(%[pdat])                           \n\t"
         PTR_L      "%[addr3],   0x18(%[pdat])                           \n\t"
         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
-        "ldc1       %[ftmp0],   0x08(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp1],   0x18(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp2],   0x28(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp3],   0x38(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp4],   0x48(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp5],   0x58(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp6],   0x68(%[ptmp])                           \n\t"
+        MMI_LDC1(%[ftmp0], %[ptmp], 0x08)
+        MMI_LDC1(%[ftmp1], %[ptmp], 0x18)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x28)
+        MMI_LDC1(%[ftmp3], %[ptmp], 0x38)
+        MMI_LDC1(%[ftmp4], %[ptmp], 0x48)
+        MMI_LDC1(%[ftmp5], %[ptmp], 0x58)
+        MMI_LDC1(%[ftmp6], %[ptmp], 0x68)
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "ldc1       %[ftmp8],   0x78(%[ptmp])                           \n\t"
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x78)
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-        "gssdlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-        "gssdrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
@@ -2702,10 +2632,8 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-        "gssdlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
@@ -2713,55 +2641,45 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
-        "gssdlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp5], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
-        "gssdlc1    %[ftmp7],   0x07(%[addr6])                          \n\t"
-        "gssdrc1    %[ftmp7],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp7], %[addr6], 0x00)
         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
-        "gssdlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp4], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-        "gssdlc1    %[ftmp3],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp3],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp3], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr4])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USDC1(%[ftmp0], %[addr4], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
-        "gssdlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp6], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t"
-        "gssdlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
-        "gssdrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp5], %[addr6], 0x00)
         PTR_SUBU   "%[addr1],   %[addr1],       %[addr5]                \n\t"
         PTR_SUBU   "%[addr4],   %[addr4],       %[addr5]                \n\t"
-        "ldc1       %[ftmp0],   0x00(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp1],   0x10(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp2],   0x20(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp3],   0x30(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp4],   0x40(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp5],   0x50(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp6],   0x60(%[ptmp])                           \n\t"
+        MMI_LDC1(%[ftmp0], %[ptmp], 0x00)
+        MMI_LDC1(%[ftmp1], %[ptmp], 0x10)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x20)
+        MMI_LDC1(%[ftmp3], %[ptmp], 0x30)
+        MMI_LDC1(%[ftmp4], %[ptmp], 0x40)
+        MMI_LDC1(%[ftmp5], %[ptmp], 0x50)
+        MMI_LDC1(%[ftmp6], %[ptmp], 0x60)
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "ldc1       %[ftmp8],   0x70(%[ptmp])                           \n\t"
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x70)
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-        "gssdlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-        "gssdrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
@@ -2769,10 +2687,8 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-        "gssdlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
@@ -2780,39 +2696,31 @@  void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
-        "gssdlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp5], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
-        "gssdlc1    %[ftmp7],   0x07(%[addr6])                          \n\t"
-        "gssdrc1    %[ftmp7],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp7], %[addr6], 0x00)
         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
-        "gssdlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp4], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-        "gssdlc1    %[ftmp3],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp3],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp3], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr4])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USDC1(%[ftmp0], %[addr4], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
-        "gssdlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
-        "gssdlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
-        "gssdrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp6], %[addr5], 0x00)
+        MMI_USDC1(%[ftmp5], %[addr6], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
diff --git a/libavcodec/mips/h264pred_mmi.c b/libavcodec/mips/h264pred_mmi.c
index bb795a1..f4fe091 100644
--- a/libavcodec/mips/h264pred_mmi.c
+++ b/libavcodec/mips/h264pred_mmi.c
@@ -24,35 +24,33 @@ 
 
 #include "h264pred_mips.h"
 #include "libavcodec/bit_depth_template.c"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 #include "constants.h"
 
 void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
 {
     double ftmp[2];
     uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "dli        %[tmp0],    0x08                                    \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[srcA])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x0f(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp1],   0x08(%[srcA])                           \n\t"
+        MMI_LDC1(%[ftmp0], %[srcA], 0x00)
+        MMI_LDC1(%[ftmp1], %[srcA], 0x08)
+
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdlc1    %[ftmp1],   0x0f(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x08(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDC1(%[ftmp1], %[src], 0x08)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdlc1    %[ftmp1],   0x0f(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x08(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDC1(%[ftmp1], %[src], 0x08)
+
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         "bnez       %[tmp0],    1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [src]"+&r"(src)
         : [stride]"r"((mips_reg)stride),    [srcA]"r"((mips_reg)(src-stride))
         : "memory"
@@ -160,15 +158,14 @@  void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
     uint32_t dc;
     double ftmp[11];
     mips_reg tmp[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gsldlc1    %[ftmp10],  0x07(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp10],  0x00(%[srcA])                           \n\t"
-        "gsldlc1    %[ftmp9],   0x07(%[src0])                           \n\t"
-        "gsldrc1    %[ftmp9],   0x00(%[src0])                           \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[src1])                           \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp10], %[srcA], 0x00)
+        MMI_ULDC1(%[ftmp9], %[src0], 0x00)
+        MMI_ULDC1(%[ftmp8], %[src1], 0x00)
 
         "punpcklbh  %[ftmp7],   %[ftmp10],      %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp6],   %[ftmp10],      %[ftmp0]                \n\t"
@@ -209,6 +206,7 @@  void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ALL64
           [dc]"=r"(dc)
         : [srcA]"r"((mips_reg)(src-stride-1)),
           [src0]"r"((mips_reg)(src-stride)),
@@ -221,20 +219,22 @@  void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
     __asm__ volatile (
         "dli        %[tmp0],    0x02                                    \n\t"
         "punpcklwd  %[ftmp0],   %[dc],          %[dc]                   \n\t"
+
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
+
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         "bnez       %[tmp0],    1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [src]"+&r"(src)
         : [dc]"f"(dc),                      [stride]"r"((mips_reg)stride)
         : "memory"
@@ -257,13 +257,13 @@  void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
     const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2;
     const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2;
 
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
     __asm__ volatile (
-        "gsldlc1    %[ftmp4],   0x07(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[srcA])                           \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[src0])                           \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[src0])                           \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[src1])                           \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp4], %[srcA], 0x00)
+        MMI_ULDC1(%[ftmp5], %[src0], 0x00)
+        MMI_ULDC1(%[ftmp6], %[src1], 0x00)
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x03                                    \n\t"
         "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]                \n\t"
@@ -309,7 +309,9 @@  void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
-          [tmp0]"=&r"(tmp[0]),              [dc2]"=r"(dc2)
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dc2]"=r"(dc2)
         : [srcA]"r"((mips_reg)(src-stride-1)),
           [src0]"r"((mips_reg)(src-stride)),
           [src1]"r"((mips_reg)(src-stride+1)),
@@ -323,20 +325,22 @@  void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
     __asm__ volatile (
         "dli        %[tmp0],    0x02                                    \n\t"
         "punpcklwd  %[ftmp0],   %[dc],          %[dc]                   \n\t"
+
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
+
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         "bnez       %[tmp0],    1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [src]"+&r"(src)
         : [dc]"f"(dc),                      [stride]"r"((mips_reg)stride)
         : "memory"
@@ -348,15 +352,13 @@  void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
 {
     double ftmp[12];
     mips_reg tmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[srcA])                           \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[src0])                           \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[src0])                           \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[src1])                           \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[src1])                           \n\t"
+        MMI_LDC1(%[ftmp3], %[srcA], 0x00)
+        MMI_LDC1(%[ftmp4], %[src0], 0x00)
+        MMI_LDC1(%[ftmp5], %[src1], 0x00)
         "punpcklbh  %[ftmp6],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp8],   %[ftmp4],       %[ftmp0]                \n\t"
@@ -385,7 +387,7 @@  void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
         "packushb   %[ftmp4],   %[ftmp6],       %[ftmp7]                \n\t"
-        "sdc1       %[ftmp4],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp4], %[src], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -393,6 +395,7 @@  void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [src]"=r"(src)
         : [srcA]"r"((mips_reg)(src-stride-1)),
           [src0]"r"((mips_reg)(src-stride)),
@@ -403,22 +406,21 @@  void ff_pred8x8l_vertical_8_mmi(uint8_t *src, int has_topleft,
 
     __asm__ volatile (
         "dli        %[tmp0],    0x02                                    \n\t"
+
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         "bnez       %[tmp0],    1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [src]"+&r"(src)
         : [stride]"r"((mips_reg)stride)
         : "memory"
@@ -433,19 +435,21 @@  void ff_pred4x4_dc_8_mmi(uint8_t *src, const uint8_t *topright,
                  + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
     uint64_t tmp[2];
     mips_reg addr[1];
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[tmp0],    %[dc],          $0                      \n\t"
         "dmul       %[tmp1],    %[tmp0],        %[ff_pb_1]              \n\t"
         "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
-        "gsswx      %[tmp1],    0x00(%[src],    %[addr0])               \n\t"
+        MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "gsswx      %[tmp1],    0x00(%[src],    %[addr0])               \n\t"
+        MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "gsswx      %[tmp1],    0x00(%[src],    %[addr0])               \n\t"
+        MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "gsswx      %[tmp1],    0x00(%[src],    %[addr0])               \n\t"
+        MMI_SWX(%[tmp1], %[src], %[addr0], 0x00)
         : [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0])
         : [src]"r"((mips_reg)src),          [stride]"r"((mips_reg)stride),
           [dc]"r"(dc),                      [ff_pb_1]"r"(ff_pb_1)
@@ -518,13 +522,13 @@  void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
     double ftmp[4];
     uint64_t tmp[1];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "dli        %[tmp0],    0x02                                    \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
         "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
         "biadd      %[ftmp2],   %[ftmp2]                                \n\t"
@@ -539,32 +543,25 @@  void ff_pred8x8_top_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
         "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
         "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
         "packushb   %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[src], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),
           [src]"+&r"(src)
         : [stride]"r"((mips_reg)stride)
@@ -651,21 +648,21 @@  void ff_pred8x8_dc_8_mmi(uint8_t *src, ptrdiff_t stride)
         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
         "packushb   %[ftmp2],   %[ftmp3],       %[ftmp4]                \n\t"
         PTR_ADDU   "%[addr0],   $0,             %[src]                  \n\t"
-        "sdc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "sdc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "sdc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "sdc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),
@@ -682,28 +679,27 @@  void ff_pred8x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
 {
     double ftmp[1];
     uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
-        "gsldlc1    %[ftmp0],   0x07(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[srcA])                           \n\t"
+        MMI_LDC1(%[ftmp0], %[srcA], 0x00)
         "dli        %[tmp0],    0x04                                    \n\t"
+
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         "bnez       %[tmp0],    1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [src]"+&r"(src)
         : [stride]"r"((mips_reg)stride),    [srcA]"r"((mips_reg)(src-stride))
         : "memory"
@@ -747,17 +743,16 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
         const int svq3, const int rv40)
 {
     double ftmp[11];
-    uint64_t tmp[7];
+    uint64_t tmp[6];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile(
         PTR_SUBU   "%[addr0],   %[src],         %[stride]               \n\t"
-        "dli        %[tmp2],    0x20                                    \n\t"
-        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
-        "gsldlc1    %[ftmp0],   0x06(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp0],   -0x01(%[addr0])                         \n\t"
-        "gsldrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        "dli        %[tmp0],    0x20                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr0], -0x01)
+        MMI_ULDC1(%[ftmp2], %[addr0],  0x08)
         "dsrl       %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
         "dsrl       %[ftmp3],   %[ftmp2],       %[ftmp4]                \n\t"
         "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
@@ -772,29 +767,29 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
         "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
-        "dli        %[tmp2],    0x0e                                    \n\t"
-        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
+        "dli        %[tmp0],    0x0e                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
         "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
         "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
-        "dli        %[tmp2],    0x01                                    \n\t"
-        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
+        "dli        %[tmp0],    0x01                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
         "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
         "paddsh     %[ftmp5],   %[ftmp0],       %[ftmp1]                \n\t"
 
         PTR_ADDIU  "%[addr0],   %[src],         -0x01                   \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         "lbu        %[tmp2],    0x00(%[addr0])                          \n\t"
-        "lbu        %[tmp6],    0x10(%[addr0])                          \n\t"
+        "lbu        %[tmp5],    0x10(%[addr0])                          \n\t"
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         "lbu        %[tmp3],    0x00(%[addr0])                          \n\t"
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "lbu        %[tmp5],    0x00(%[addr0])                          \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
         "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
         "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
-        "dsll       %[tmp5],    %[tmp5],        0x30                    \n\t"
-        "or         %[tmp4],    %[tmp4],        %[tmp5]                 \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp0]                 \n\t"
         "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
         "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
         "dmtc1      %[tmp2],    %[ftmp0]                                \n\t"
@@ -806,11 +801,11 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "lbu        %[tmp5],    0x00(%[addr0])                          \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
         "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
         "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
-        "dsll       %[tmp5],    %[tmp5],        0x30                    \n\t"
-        "or         %[tmp4],    %[tmp4],        %[tmp5]                 \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp0]                 \n\t"
         "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
         "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
         "dmtc1      %[tmp2],    %[ftmp1]                                \n\t"
@@ -823,11 +818,11 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "lbu        %[tmp5],    0x00(%[addr0])                          \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
         "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
         "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
-        "dsll       %[tmp5],    %[tmp5],        0x30                    \n\t"
-        "or         %[tmp4],    %[tmp4],        %[tmp5]                 \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp0]                 \n\t"
         "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
         "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
         "dmtc1      %[tmp2],    %[ftmp2]                                \n\t"
@@ -839,15 +834,15 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         "lbu        %[tmp4],    0x00(%[addr0])                          \n\t"
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "lbu        %[tmp5],    0x00(%[addr0])                          \n\t"
-        "daddu      %[tmp6],    %[tmp6],        %[tmp5]                 \n\t"
-        "daddiu     %[tmp6],    %[tmp6],        0x01                    \n\t"
-        "dsll       %[tmp6],    %[tmp6],        0x04                    \n\t"
+        "lbu        %[tmp0],    0x00(%[addr0])                          \n\t"
+        "daddu      %[tmp5],    %[tmp5],        %[tmp0]                 \n\t"
+        "daddiu     %[tmp5],    %[tmp5],        0x01                    \n\t"
+        "dsll       %[tmp5],    %[tmp5],        0x04                    \n\t"
 
         "dsll       %[tmp3],    %[tmp3],        0x10                    \n\t"
         "dsll       %[tmp4],    %[tmp4],        0x20                    \n\t"
-        "dsll       %[tmp5],    %[tmp5],        0x30                    \n\t"
-        "or         %[tmp4],    %[tmp4],        %[tmp5]                 \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x30                    \n\t"
+        "or         %[tmp4],    %[tmp4],        %[tmp0]                 \n\t"
         "or         %[tmp2],    %[tmp2],        %[tmp3]                 \n\t"
         "or         %[tmp2],    %[tmp2],        %[tmp4]                 \n\t"
         "dmtc1      %[tmp2],    %[ftmp3]                                \n\t"
@@ -859,13 +854,13 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
         "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
-        "dli        %[tmp2],    0x0e                                    \n\t"
-        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
+        "dli        %[tmp0],    0x0e                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
         "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
         "paddsh     %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
 
-        "dli        %[tmp2],    0x01                                    \n\t"
-        "dmtc1      %[tmp2],    %[ftmp4]                                \n\t"
+        "dli        %[tmp0],    0x01                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp4]                                \n\t"
         "pshufh     %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
         "paddsh     %[ftmp6],   %[ftmp0],       %[ftmp1]                \n\t"
 
@@ -914,17 +909,17 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
         "daddu      %[tmp3],    %[tmp0],        %[tmp1]                 \n\t"
         "dli        %[tmp2],    0x07                                    \n\t"
         "dmul       %[tmp3],    %[tmp3],        %[tmp2]                 \n\t"
-        "dsubu      %[tmp6],    %[tmp6],        %[tmp3]                 \n\t"
+        "dsubu      %[tmp5],    %[tmp5],        %[tmp3]                 \n\t"
 
         "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
         "dmtc1      %[tmp0],    %[ftmp0]                                \n\t"
         "pshufh     %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "dmtc1      %[tmp1],    %[ftmp5]                                \n\t"
         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-        "dmtc1      %[tmp6],    %[ftmp6]                                \n\t"
+        "dmtc1      %[tmp5],    %[ftmp6]                                \n\t"
         "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
-        "dli        %[tmp2],    0x05                                    \n\t"
-        "dmtc1      %[tmp2],    %[ftmp7]                                \n\t"
+        "dli        %[tmp0],    0x05                                    \n\t"
+        "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
         "pmullh     %[ftmp1],   %[ff_pw_0to3],  %[ftmp0]                \n\t"
         "dmtc1      %[ff_pw_4to7],              %[ftmp2]                \n\t"
         "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
@@ -941,16 +936,14 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
         "paddsh     %[ftmp9],   %[ftmp2],       %[ftmp6]                \n\t"
         "psrah      %[ftmp9],   %[ftmp9],       %[ftmp7]                \n\t"
         "packushb   %[ftmp0],   %[ftmp8],       %[ftmp9]                \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr0])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[addr0], 0x00)
 
         "paddsh     %[ftmp8],   %[ftmp3],       %[ftmp6]                \n\t"
         "psrah      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
         "paddsh     %[ftmp9],   %[ftmp4],       %[ftmp6]                \n\t"
         "psrah      %[ftmp9],   %[ftmp9],       %[ftmp7]                \n\t"
         "packushb   %[ftmp0],   %[ftmp8],       %[ftmp9]                \n\t"
-        "gssdlc1    %[ftmp0],   0x0f(%[addr0])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x08(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[addr0], 0x08)
 
         "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
@@ -964,7 +957,7 @@  static inline void pred16x16_plane_compat_mmi(uint8_t *src, int stride,
           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
           [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
           [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
-          [tmp6]"=&r"(tmp[6]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0])
         : [src]"r"(src),                    [stride]"r"((mips_reg)stride),
           [svq3]"r"(svq3),                  [rv40]"r"(rv40),
diff --git a/libavcodec/mips/h264qpel_mmi.c b/libavcodec/mips/h264qpel_mmi.c
index b4e83e4..13fbebf 100644
--- a/libavcodec/mips/h264qpel_mmi.c
+++ b/libavcodec/mips/h264qpel_mmi.c
@@ -24,28 +24,26 @@ 
 #include "h264dsp_mips.h"
 #include "hpeldsp_mips.h"
 #include "libavcodec/bit_depth_template.c"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src,
         int dstStride, int srcStride, int h)
 {
     double ftmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     __asm__ volatile (
         "1:                                                             \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[dst])                            \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
+        MMI_ULWC1(%[ftmp0], %[src], 0x00)
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
         "addi       %[h],       %[h],           -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),
           [dst]"+&r"(dst),                  [src]"+&r"(src),
-          [h]"+&r"(h),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          [h]"+&r"(h)
         : [dstStride]"r"((mips_reg)dstStride),
           [srcStride]"r"((mips_reg)srcStride)
         : "memory"
@@ -56,18 +54,18 @@  static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src,
         int dstStride, int srcStride, int h)
 {
     double ftmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[dst])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
+        MMI_ULDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
         "addi       %[h],       %[h],           -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),
+          RESTRICT_ASM_ALL64
           [dst]"+&r"(dst),                  [src]"+&r"(src),
           [h]"+&r"(h)
         : [dstStride]"r"((mips_reg)dstStride),
@@ -81,15 +79,14 @@  static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src,
 {
     double ftmp[1];
     uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_ULDC1(%[ftmp0], %[src], 0x00)
         "ldl        %[tmp0],    0x0f(%[src])                            \n\t"
         "ldr        %[tmp0],    0x08(%[src])                            \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[dst])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[dst])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
         "sdl        %[tmp0],    0x0f(%[dst])                            \n\t"
         "sdr        %[tmp0],    0x08(%[dst])                            \n\t"
         "addi       %[h],       %[h],           -0x01                   \n\t"
@@ -98,6 +95,7 @@  static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src,
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [dst]"+&r"(dst),                  [src]"+&r"(src),
           [h]"+&r"(h)
         : [dstStride]"r"((mips_reg)dstStride),
@@ -113,24 +111,19 @@  static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
 {
     double ftmp[10];
     uint64_t tmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x04                                    \n\t"
         "1:                                                             \n\t"
-        "uld        %[low32],   -0x02(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   -0x01(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
-        "uld        %[low32],   0x01(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
-        "uld        %[low32],   0x02(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
-        "uld        %[low32],   0x03(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], -0x02)
+        MMI_ULWC1(%[ftmp2], %[src], -0x01)
+        MMI_ULWC1(%[ftmp3], %[src],  0x00)
+        MMI_ULWC1(%[ftmp4], %[src],  0x01)
+        MMI_ULWC1(%[ftmp5], %[src],  0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x03)
+
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
@@ -147,8 +140,7 @@  static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t"
         "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t"
         "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t"
-        "gsswlc1    %[ftmp9],   0x03(%[dst])                            \n\t"
-        "gsswrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp9], %[dst],  0x00)
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
@@ -159,8 +151,8 @@  static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp[0]),
-          [dst]"+&r"(dst),                  [src]"+&r"(src),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
         : [dstStride]"r"((mips_reg)dstStride),
           [srcStride]"r"((mips_reg)srcStride),
           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
@@ -174,23 +166,18 @@  static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
 {
     double ftmp[11];
     uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x08                                    \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp1],   0x05(%[src])                            \n\t"
-        "gsldrc1    %[ftmp1],   -0x02(%[src])                           \n\t"
-        "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
-        "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[src])                            \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[src])                            \n\t"
-        "gsldlc1    %[ftmp4],   0x08(%[src])                            \n\t"
-        "gsldrc1    %[ftmp4],   0x01(%[src])                            \n\t"
-        "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
-        "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
-        "gsldlc1    %[ftmp6],   0x0a(%[src])                            \n\t"
-        "gsldrc1    %[ftmp6],   0x03(%[src])                            \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], -0x02)
+        MMI_ULDC1(%[ftmp2], %[src], -0x01)
+        MMI_ULDC1(%[ftmp3], %[src],  0x00)
+        MMI_ULDC1(%[ftmp4], %[src],  0x01)
+        MMI_ULDC1(%[ftmp5], %[src],  0x02)
+        MMI_ULDC1(%[ftmp6], %[src],  0x03)
         "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
@@ -222,8 +209,7 @@  static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
         "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
         "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t"
-        "gssdlc1    %[ftmp9],   0x07(%[dst])                            \n\t"
-        "gssdrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
+        MMI_SDC1(%[ftmp9], %[dst],  0x00)
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
@@ -235,6 +221,7 @@  static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [dst]"+&r"(dst),                  [src]"+&r"(src)
         : [dstStride]"r"((mips_reg)dstStride),
           [srcStride]"r"((mips_reg)srcStride),
@@ -260,24 +247,18 @@  static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
 {
     double ftmp[11];
     uint64_t tmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x04                                    \n\t"
         "1:                                                             \n\t"
-        "uld        %[low32],   -0x02(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   -0x01(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
-        "uld        %[low32],   0x01(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
-        "uld        %[low32],   0x02(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
-        "uld        %[low32],   0x03(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], -0x02)
+        MMI_ULWC1(%[ftmp2], %[src], -0x01)
+        MMI_ULWC1(%[ftmp3], %[src],  0x00)
+        MMI_ULWC1(%[ftmp4], %[src],  0x01)
+        MMI_ULWC1(%[ftmp5], %[src],  0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x03)
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
@@ -294,10 +275,9 @@  static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddsh     %[ftmp9],   %[ftmp9],       %[ff_pw_16]             \n\t"
         "psrah      %[ftmp9],   %[ftmp9],       %[ff_pw_5]              \n\t"
         "packushb   %[ftmp9],   %[ftmp9],       %[ftmp0]                \n\t"
-        "lwc1       %[ftmp10],  0x00(%[dst])                            \n\t"
+        MMI_LWC1(%[ftmp10], %[dst],  0x00)
         "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t"
-        "gsswlc1    %[ftmp9],   0x03(%[dst])                            \n\t"
-        "gsswrc1    %[ftmp9],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp9], %[dst],  0x00)
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
@@ -309,8 +289,8 @@  static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
           [tmp0]"=&r"(tmp[0]),
-          [dst]"+&r"(dst),                  [src]"+&r"(src),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
         : [dstStride]"r"((mips_reg)dstStride),
           [srcStride]"r"((mips_reg)srcStride),
           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5),
@@ -324,23 +304,18 @@  static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
 {
     double ftmp[11];
     uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x08                                    \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp1],   0x05(%[src])                            \n\t"
-        "gsldrc1    %[ftmp1],   -0x02(%[src])                           \n\t"
-        "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
-        "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[src])                            \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[src])                            \n\t"
-        "gsldlc1    %[ftmp4],   0x08(%[src])                            \n\t"
-        "gsldrc1    %[ftmp4],   0x01(%[src])                            \n\t"
-        "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
-        "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
-        "gsldlc1    %[ftmp6],   0x0a(%[src])                            \n\t"
-        "gsldrc1    %[ftmp6],   0x03(%[src])                            \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], -0x02)
+        MMI_ULDC1(%[ftmp2], %[src], -0x01)
+        MMI_ULDC1(%[ftmp3], %[src],  0x00)
+        MMI_ULDC1(%[ftmp4], %[src],  0x01)
+        MMI_ULDC1(%[ftmp5], %[src],  0x02)
+        MMI_ULDC1(%[ftmp6], %[src],  0x03)
         "punpcklbh  %[ftmp7],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
@@ -372,9 +347,9 @@  static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "psrah      %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
         "psrah      %[ftmp4],   %[ftmp4],       %[ff_pw_5]              \n\t"
         "packushb   %[ftmp9],   %[ftmp3],       %[ftmp4]                \n\t"
-        "ldc1       %[ftmp10],  0x00(%[dst])                            \n\t"
+        MMI_LDC1(%[ftmp10], %[dst], 0x00)
         "pavgb      %[ftmp9],   %[ftmp9],       %[ftmp10]               \n\t"
-        "sdc1       %[ftmp9],   0x00(%[dst])                            \n\t"
+        MMI_SDC1(%[ftmp9], %[dst], 0x00)
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
@@ -386,6 +361,7 @@  static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [dst]"+&r"(dst),                  [src]"+&r"(src)
         : [dstStride]"r"((mips_reg)dstStride),
           [srcStride]"r"((mips_reg)srcStride),
@@ -411,7 +387,7 @@  static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
 {
     double ftmp[12];
     uint64_t tmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     src -= 2 * srcStride;
 
@@ -420,31 +396,25 @@  static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         ".set       noreorder                                           \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x02                                    \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        MMI_LWC1(%[ftmp1], %[src], 0x00)
         "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         "dli        %[tmp0],    0x05                                    \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_LWC1(%[ftmp2], %[src], 0x00)
         "mtc1       %[tmp0],    %[ftmp11]                               \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_LWC1(%[ftmp3], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        MMI_LWC1(%[ftmp4], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_LWC1(%[ftmp5], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_LWC1(%[ftmp6], %[src], 0x00)
         "paddh      %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
         "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
@@ -457,10 +427,9 @@  static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-        "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp7], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        MMI_LWC1(%[ftmp1], %[src], 0x00)
         "paddh      %[ftmp7],   %[ftmp4],       %[ftmp5]                \n\t"
         "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
@@ -473,10 +442,9 @@  static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-        "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp7], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_LWC1(%[ftmp2], %[src], 0x00)
         "paddh      %[ftmp7],   %[ftmp5],       %[ftmp6]                \n\t"
         "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
@@ -489,10 +457,9 @@  static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-        "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp7], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_LWC1(%[ftmp3], %[src], 0x00)
         "paddh      %[ftmp7],   %[ftmp6],       %[ftmp1]                \n\t"
         "psllh      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
         "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
@@ -505,7 +472,7 @@  static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
         "psrah      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
         "packushb   %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-        "swc1       %[ftmp7],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp7], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
         ".set       pop                                                 \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -515,8 +482,8 @@  static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
           [tmp0]"=&r"(tmp[0]),
-          [dst]"+&r"(dst),                  [src]"+&r"(src),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          [dst]"+&r"(dst),                  [src]"+&r"(src)
         : [dstStride]"r"((mips_reg)dstStride),
           [srcStride]"r"((mips_reg)srcStride),
           [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
@@ -531,7 +498,7 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
     int h = 8;
     double ftmp[10];
     uint64_t tmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     src -= 2 * srcStride;
 
@@ -540,29 +507,23 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             ".set       push                                            \n\t"
             ".set       noreorder                                       \n\t"
             "dli        %[tmp0],    0x02                                \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
             "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "dli        %[tmp0],    0x05                                \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
@@ -578,10 +539,9 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
@@ -594,11 +554,10 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
@@ -610,12 +569,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
@@ -626,12 +584,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "punpcklbh  %[ftmp3] ,  %[ftmp3],       %[ftmp7]            \n\t"
@@ -642,12 +599,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
@@ -658,12 +614,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
@@ -674,12 +629,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
@@ -690,12 +644,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "bne        %[h],       0x10,           2f                  \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
@@ -707,12 +660,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
@@ -723,12 +675,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
@@ -739,12 +690,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
@@ -755,12 +705,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
@@ -771,12 +720,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
@@ -787,12 +735,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
@@ -803,12 +750,11 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
@@ -819,7 +765,7 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "2:                                                         \n\t"
             ".set       pop                                             \n\t"
@@ -829,9 +775,9 @@  static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [src]"+&r"(src),              [dst]"+&r"(dst),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [dstStride]"r"((mips_reg)dstStride),
               [srcStride]"r"((mips_reg)srcStride),
               [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
@@ -869,23 +815,23 @@  static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
         "dli        %[tmp0],    0x05                                    \n\t"
-        "lwc1       %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp0], %[src], 0x00)
         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
-        "lwc1       %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp1], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
-        "lwc1       %[ftmp2],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp2], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
-        "lwc1       %[ftmp3],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp3], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
-        "lwc1       %[ftmp4],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp4], %[src], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
-        "lwc1       %[ftmp5],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp5], %[src], 0x00)
         "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
@@ -898,11 +844,11 @@  static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
-        "lwc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        MMI_LWC1(%[ftmp0], %[dst], 0x00)
         "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
-        "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp6], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
-        "lwc1       %[ftmp0],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp0], %[src], 0x00)
         "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]                \n\t"
         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
@@ -915,11 +861,11 @@  static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
-        "lwc1       %[ftmp1],   0x00(%[dst])                            \n\t"
+        MMI_LWC1(%[ftmp1], %[dst], 0x00)
         "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-        "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp6], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
-        "lwc1       %[ftmp1],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp1], %[src], 0x00)
         "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]                \n\t"
         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
@@ -932,11 +878,11 @@  static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
-        "lwc1       %[ftmp2],   0x00(%[dst])                            \n\t"
+        MMI_LWC1(%[ftmp2], %[dst], 0x00)
         "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
-        "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp6], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
-        "lwc1       %[ftmp2],   0x00(%[src])                            \n\t"
+        MMI_LWC1(%[ftmp2], %[src], 0x00)
         "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
         "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
@@ -949,9 +895,9 @@  static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
-        "lwc1       %[ftmp3],   0x00(%[dst])                            \n\t"
+        MMI_LWC1(%[ftmp3], %[dst], 0x00)
         "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
-        "swc1       %[ftmp6],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp6], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
         ".set       pop                                                 \n\t"
         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
@@ -975,7 +921,7 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
     int h = 8;
     double ftmp[10];
     uint64_t tmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     src -= 2 * srcStride;
 
@@ -987,29 +933,23 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
             "dli        %[tmp0],    0x05                                \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
             "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
@@ -1022,12 +962,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp0], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
@@ -1040,12 +979,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp1], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
@@ -1058,12 +996,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
@@ -1076,12 +1013,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp3], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
@@ -1094,12 +1030,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp4],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp4], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
@@ -1112,12 +1047,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp5],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp5], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
@@ -1130,12 +1064,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp0], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
@@ -1148,13 +1081,12 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp1], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             "bne        %[h],       0x10,           2f                  \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
@@ -1167,12 +1099,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
@@ -1185,12 +1116,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp3], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_LWC1(%[ftmp3], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
@@ -1203,12 +1133,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp4],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp4], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_LWC1(%[ftmp4], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
@@ -1221,12 +1150,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp5],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp5], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_LWC1(%[ftmp5], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
@@ -1239,12 +1167,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp0], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_LWC1(%[ftmp0], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
@@ -1257,12 +1184,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp1], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_LWC1(%[ftmp1], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
@@ -1275,12 +1201,11 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_LWC1(%[ftmp2], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
             "psubh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
@@ -1293,9 +1218,9 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
             "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"
             "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-            "lwc1       %[ftmp3],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp3], %[dst], 0x00)
             "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
-            "swc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp6], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "2:                                                         \n\t"
             ".set       pop                                             \n\t"
@@ -1305,9 +1230,9 @@  static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [src]"+&r"(src),              [dst]"+&r"(dst),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [dstStride]"r"((mips_reg)dstStride),
               [srcStride]"r"((mips_reg)srcStride),
               [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
@@ -1339,7 +1264,7 @@  static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
     int16_t *tmp = _tmp;
     double ftmp[10];
     uint64_t tmp0;
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     src -= 2*srcStride;
 
@@ -1347,18 +1272,12 @@  static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x09                                    \n\t"
         "1:                                                             \n\t"
-        "uld        %[low32],   -0x02(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   -0x01(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
-        "uld        %[low32],   0x01(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
-        "uld        %[low32],   0x02(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
-        "uld        %[low32],   0x03(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], -0x02)
+        MMI_ULWC1(%[ftmp2], %[src], -0x01)
+        MMI_ULWC1(%[ftmp3], %[src],  0x00)
+        MMI_ULWC1(%[ftmp4], %[src],  0x01)
+        MMI_ULWC1(%[ftmp5], %[src],  0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x03)
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
@@ -1372,7 +1291,7 @@  static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
         "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
         "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
-        "sdc1       %[ftmp9],   0x00(%[tmp])                            \n\t"
+        MMI_SDC1(%[ftmp9], %[tmp], 0x00)
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t"
@@ -1383,8 +1302,8 @@  static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp0),
-          [tmp]"+&r"(tmp),                  [src]"+&r"(src),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          [tmp]"+&r"(tmp),                  [src]"+&r"(src)
         : [tmpStride]"r"(8),
           [srcStride]"r"((mips_reg)srcStride),
           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5)
@@ -1418,37 +1337,31 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
     int w = (size + 8) >> 2;
     double ftmp[11];
     uint64_t tmp0;
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     src -= 2 * srcStride + 2;
 
     while (w--) {
         __asm__ volatile (
             "dli        %[tmp0],    0x02                                \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
             "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_ULWC1(%[ftmp2], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_ULWC1(%[ftmp3], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_ULWC1(%[ftmp4], %[src], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
             "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_ULWC1(%[ftmp5], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
@@ -1459,9 +1372,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
-            "sdc1       %[ftmp6],   0x00(%[tmp])                        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x00)
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
@@ -1472,9 +1384,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
-            "sdc1       %[ftmp6],   0x30(%[tmp])                        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x30)
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
@@ -1485,9 +1396,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
-            "sdc1       %[ftmp6],   0x60(%[tmp])                        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x60)
+            MMI_ULWC1(%[ftmp2], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
@@ -1498,9 +1408,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
-            "sdc1       %[ftmp6],   0x90(%[tmp])                        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x90)
+            MMI_ULWC1(%[ftmp3], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
@@ -1511,9 +1420,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
-            "sdc1       %[ftmp6],   0xc0(%[tmp])                        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0xc0)
+            MMI_ULWC1(%[ftmp4], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
@@ -1524,9 +1432,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
-            "sdc1       %[ftmp6],   0xf0(%[tmp])                        \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0xf0)
+            MMI_ULWC1(%[ftmp5], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
@@ -1537,9 +1444,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
-            "sdc1       %[ftmp6],   0x120(%[tmp])                       \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x120)
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
@@ -1550,11 +1456,10 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
-            "sdc1       %[ftmp6],   0x150(%[tmp])                       \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x150)
             "bne        %[size],    0x10,           2f                  \n\t"
 
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
@@ -1565,9 +1470,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
-            "sdc1       %[ftmp6],   0x180(%[tmp])                       \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x180)
+            MMI_ULWC1(%[ftmp2], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
@@ -1578,9 +1482,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
-            "sdc1       %[ftmp6],   0x1b0(%[tmp])                       \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x1b0)
+            MMI_ULWC1(%[ftmp3], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp0],       %[ftmp1]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_16]         \n\t"
@@ -1591,9 +1494,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"
-            "sdc1       %[ftmp6],   0x1e0(%[tmp])                       \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x1e0)
+            MMI_ULWC1(%[ftmp4], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_16]         \n\t"
@@ -1604,9 +1506,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t"
-            "sdc1       %[ftmp6],   0x210(%[tmp])                       \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp5]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x210)
+            MMI_ULWC1(%[ftmp5], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp0],   %[ftmp0],       %[ff_pw_16]         \n\t"
@@ -1617,9 +1518,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
-            "sdc1       %[ftmp6],   0x240(%[tmp])                       \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp0]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x240)
+            MMI_ULWC1(%[ftmp0], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp3],       %[ftmp4]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_16]         \n\t"
@@ -1630,9 +1530,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]            \n\t"
-            "sdc1       %[ftmp6],   0x270(%[tmp])                       \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x270)
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp4],       %[ftmp5]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_16]         \n\t"
@@ -1643,9 +1542,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]            \n\t"
-            "sdc1       %[ftmp6],   0x2a0(%[tmp])                       \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x2a0)
+            MMI_ULWC1(%[ftmp2], %[src], 0x00)
             "paddh      %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
             "psllh      %[ftmp6],   %[ftmp6],       %[ftmp10]           \n\t"
             "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_16]         \n\t"
@@ -1656,7 +1554,7 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[srcStride]        \n\t"
             "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"
-            "sdc1       %[ftmp6],   0x2d0(%[tmp])                       \n\t"
+            MMI_SDC1(%[ftmp6], %[tmp], 0x2d0)
             "2:                                                         \n\t"
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
@@ -1665,8 +1563,8 @@  static void put_h264_qpel8or16_hv1_lowpass_mmi(int16_t *tmp,
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [ftmp10]"=&f"(ftmp[10]),
               [tmp0]"=&r"(tmp0),
-              [src]"+&r"(src),
-              [low32]"=&r"(low32)
+              RESTRICT_ASM_LOW32
+              [src]"+&r"(src)
             : [tmp]"r"(tmp),                [size]"r"(size),
               [srcStride]"r"((mips_reg)srcStride),
               [ff_pw_5]"f"(ff_pw_5),        [ff_pw_16]"f"(ff_pw_16)
@@ -1684,6 +1582,7 @@  static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
     int w = size >> 4;
     double ftmp[10];
     uint64_t tmp0;
+    DECLARE_VAR_ALL64;
 
     do {
         int h = size;
@@ -1694,27 +1593,20 @@  static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
             "dli        %[tmp0],    0x06                                \n\t"
             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
             "1:                                                         \n\t"
-            "ldc1       %[ftmp0],   0x00(%[tmp])                        \n\t"
-            "ldc1       %[ftmp3],   0x08(%[tmp])                        \n\t"
-            "ldc1       %[ftmp6],   0x10(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp1],   0x09(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x02(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp4],   0x11(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp4],   0x0a(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp5],   0x19(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp5],   0x12(%[tmp])                        \n\t"
+            MMI_LDC1(%[ftmp0], %[tmp], 0x00)
+            MMI_LDC1(%[ftmp3], %[tmp], 0x08)
+            MMI_LDC1(%[ftmp6], %[tmp], 0x10)
+            MMI_ULDC1(%[ftmp1], %[tmp], 0x02)
+            MMI_ULDC1(%[ftmp4], %[tmp], 0x0a)
+            MMI_ULDC1(%[ftmp5], %[tmp], 0x12)
             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t"
-            "gsldlc1    %[ftmp2],   0x0b(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp2],   0x04(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp6],   0x0d(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp6],   0x06(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp5],   0x13(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp5],   0x0c(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp7],   0x15(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp7],   0x0e(%[tmp])                        \n\t"
+            MMI_ULDC1(%[ftmp2], %[tmp], 0x04)
+            MMI_ULDC1(%[ftmp6], %[tmp], 0x06)
+            MMI_ULDC1(%[ftmp5], %[tmp], 0x0c)
+            MMI_ULDC1(%[ftmp7], %[tmp], 0x0e)
             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"
             "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
@@ -1733,8 +1625,7 @@  static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
             "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "gssdlc1    %[ftmp0],   0x07(%[dst])                        \n\t"
-            "gssdrc1    %[ftmp0],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
             PTR_ADDIU  "%[tmp],     %[tmp],         0x30                \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -1744,6 +1635,7 @@  static void put_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [tmp0]"=&r"(tmp0),
+              RESTRICT_ASM_ALL64
               [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
               [h]"+&r"(h)
             : [dstStride]"r"((mips_reg)dstStride)
@@ -1785,7 +1677,8 @@  static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
     int h = 8;
     double ftmp[9];
     uint64_t tmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "dli        %[tmp0],    0x02                                    \n\t"
@@ -1794,10 +1687,8 @@  static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[src])                            \n\t"
-        "gsldlc1    %[ftmp3],   0x08(%[src])                            \n\t"
-        "gsldrc1    %[ftmp3],   0x01(%[src])                            \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], 0x00)
+        MMI_ULDC1(%[ftmp3], %[src], 0x01)
         "punpckhbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
@@ -1806,10 +1697,8 @@  static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "psllh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
-        "gsldlc1    %[ftmp3],   0x06(%[src])                            \n\t"
-        "gsldrc1    %[ftmp3],   -0x01(%[src])                           \n\t"
-        "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
-        "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
+        MMI_ULDC1(%[ftmp3], %[src], -0x01)
+        MMI_ULDC1(%[ftmp5], %[src],  0x02)
         "punpckhbh  %[ftmp4],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
@@ -1820,10 +1709,8 @@  static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "pmullh     %[ftmp2],   %[ftmp2],       %[ff_pw_5]              \n\t"
         "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
-        "uld        %[low32],   -0x02(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
-        "uld        %[low32],   0x07(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp3], %[src], -0x02)
+        MMI_ULWC1(%[ftmp6], %[src], 0x07)
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
@@ -1834,13 +1721,12 @@  static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
         "psrah      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[src2])                           \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[src2])                           \n\t"
+        MMI_LDC1(%[ftmp5], %[src2],  0x00)
         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
         PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[h],       %[h],           -0x01                   \n\t"
-        "sdc1       %[ftmp1],   0x00(%[dst])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[dst], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
         PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
         "bgtz       %[h],       1b                                      \n\t"
@@ -1850,9 +1736,10 @@  static void put_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
           [src]"+&r"(src),                  [dst]"+&r"(dst),
-          [src2]"+&r"(src2),                [h]"+&r"(h),
-          [low32]"=&r"(low32)
+          [src2]"+&r"(src2),                [h]"+&r"(h)
         : [src2Stride]"r"((mips_reg)src2Stride),
           [dstStride]"r"((mips_reg)dstStride),
           [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
@@ -1865,35 +1752,35 @@  static void put_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
 {
     double ftmp[7];
     uint64_t tmp0;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     do {
         __asm__ volatile (
             "dli        %[tmp0],    0x05                                \n\t"
-            "gsldlc1    %[ftmp0],   0x07(%[src16])                      \n\t"
-            "gsldrc1    %[ftmp0],   0x00(%[src16])                      \n\t"
+            MMI_ULDC1(%[ftmp0], %[src16], 0x00)
             "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
-            "gsldlc1    %[ftmp1],   0x0f(%[src16])                      \n\t"
-            "gsldrc1    %[ftmp1],   0x08(%[src16])                      \n\t"
-            "gsldlc1    %[ftmp2],   0x37(%[src16])                      \n\t"
-            "gsldrc1    %[ftmp2],   0x30(%[src16])                      \n\t"
-            "gsldlc1    %[ftmp3],   0x3f(%[src16])                      \n\t"
-            "gsldrc1    %[ftmp3],   0x38(%[src16])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src16], 0x08)
+            MMI_ULDC1(%[ftmp2], %[src16], 0x30)
+            MMI_ULDC1(%[ftmp3], %[src16], 0x38)
             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
             "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
             "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
-            "ldc1       %[ftmp5],   0x00(%[src8])                       \n\t"
-            "gsldxc1    %[ftmp4],   0x00(%[src8],   %[src8Stride])      \n\t"
+            MMI_LDC1(%[ftmp5], %[src8], 0x00)
+            MMI_LDXC1(%[ftmp4], %[src8], %[src8Stride], 0x00)
             "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"
             "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
-            "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
-            "gssdxc1    %[ftmp2],   0x00(%[dst],    %[dstStride])       \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
+            MMI_SDXC1(%[ftmp2], %[dst], %[dstStride], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),
+              RESTRICT_ASM_ALL64
+              RESTRICT_ASM_ADDRT
               [tmp0]"=&r"(tmp0)
             : [src8]"r"(src8),              [src16]"r"(src16),
               [dst]"r"(dst),
@@ -1941,7 +1828,7 @@  static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
     int16_t *tmp = _tmp;
     double ftmp[10];
     uint64_t tmp0;
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     src -= 2*srcStride;
 
@@ -1949,18 +1836,12 @@  static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x09                                    \n\t"
         "1:                                                             \n\t"
-        "uld        %[low32],   -0x02(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   -0x01(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x00(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
-        "uld        %[low32],   0x01(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
-        "uld        %[low32],   0x02(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
-        "uld        %[low32],   0x03(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[src], -0x02)
+        MMI_ULWC1(%[ftmp2], %[src], -0x01)
+        MMI_ULWC1(%[ftmp3], %[src],  0x00)
+        MMI_ULWC1(%[ftmp4], %[src],  0x01)
+        MMI_ULWC1(%[ftmp5], %[src],  0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x03)
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
@@ -1974,7 +1855,7 @@  static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
         "pmullh     %[ftmp8],   %[ftmp8],       %[ff_pw_5]              \n\t"
         "psubsh     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
         "paddsh     %[ftmp9],   %[ftmp7],       %[ftmp9]                \n\t"
-        "sdc1       %[ftmp9],   0x00(%[tmp])                            \n\t"
+        MMI_SDC1(%[ftmp9], %[tmp], 0x00)
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[srcStride]            \n\t"
         PTR_ADDU   "%[tmp],     %[tmp],         %[tmpStride]            \n\t"
@@ -1985,8 +1866,8 @@  static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp0),
-          [tmp]"+&r"(tmp),                  [src]"+&r"(src),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          [tmp]"+&r"(tmp),                  [src]"+&r"(src)
         : [tmpStride]"r"(8),
           [srcStride]"r"((mips_reg)srcStride),
           [ff_pw_20]"f"(ff_pw_20),          [ff_pw_5]"f"(ff_pw_5)
@@ -2020,6 +1901,7 @@  static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
     int w = size >> 4;
     double ftmp[11];
     uint64_t tmp0;
+    DECLARE_VAR_ALL64;
 
     do {
         int h = size;
@@ -2029,27 +1911,20 @@  static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
             "dli        %[tmp0],    0x06                                \n\t"
             "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
             "1:                                                         \n\t"
-            "ldc1       %[ftmp0],   0x00(%[tmp])                        \n\t"
-            "ldc1       %[ftmp3],   0x08(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp1],   0x09(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x02(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp4],   0x11(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp4],   0x0a(%[tmp])                        \n\t"
-            "ldc1       %[ftmp7],   0x10(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp8],   0x19(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp8],   0x12(%[tmp])                        \n\t"
+            MMI_LDC1(%[ftmp0], %[tmp], 0x00)
+            MMI_LDC1(%[ftmp3], %[tmp], 0x08)
+            MMI_ULDC1(%[ftmp1], %[tmp], 0x02)
+            MMI_ULDC1(%[ftmp4], %[tmp], 0x0a)
+            MMI_LDC1(%[ftmp7], %[tmp], 0x10)
+            MMI_ULDC1(%[ftmp8], %[tmp], 0x12)
             "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "paddh      %[ftmp3],   %[ftmp3],       %[ftmp8]            \n\t"
             "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]            \n\t"
-            "gsldlc1    %[ftmp2],   0x0b(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp2],   0x04(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp5],   0x13(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp5],   0x0c(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp7],   0x0d(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp7],   0x06(%[tmp])                        \n\t"
-            "gsldlc1    %[ftmp8],   0x15(%[tmp])                        \n\t"
-            "gsldrc1    %[ftmp8],   0x0e(%[tmp])                        \n\t"
+            MMI_ULDC1(%[ftmp2], %[tmp], 0x04)
+            MMI_ULDC1(%[ftmp5], %[tmp], 0x0c)
+            MMI_ULDC1(%[ftmp7], %[tmp], 0x06)
+            MMI_ULDC1(%[ftmp8], %[tmp], 0x0e)
             "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
             "paddh      %[ftmp5],   %[ftmp5],       %[ftmp8]            \n\t"
             "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
@@ -2067,9 +1942,9 @@  static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp10]           \n\t"
             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
             "packushb   %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
-            "ldc1       %[ftmp6],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp6], %[dst], 0x00)
             "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
-            "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
             "addi       %[h],       %[h],           -0x01               \n\t"
             PTR_ADDI   "%[tmp],     %[tmp],         0x30                \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[dstStride]        \n\t"
@@ -2081,6 +1956,7 @@  static void avg_h264_qpel8or16_hv2_lowpass_mmi(uint8_t *dst,
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [ftmp10]"=&f"(ftmp[10]),
               [tmp0]"=&r"(tmp0),
+              RESTRICT_ASM_ALL64
               [tmp]"+&r"(tmp),              [dst]"+&r"(dst),
               [h]"+&r"(h)
             : [dstStride]"r"((mips_reg)dstStride)
@@ -2121,7 +1997,8 @@  static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
 {
     double ftmp[10];
     uint64_t tmp[2];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "dli        %[tmp1],    0x02                                    \n\t"
@@ -2131,10 +2008,8 @@  static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "mtc1       %[tmp1],    %[ftmp8]                                \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[src])                            \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[src])                            \n\t"
-        "gsldlc1    %[ftmp2],   0x08(%[src])                            \n\t"
-        "gsldrc1    %[ftmp2],   0x01(%[src])                            \n\t"
+        MMI_ULDC1(%[ftmp1], %[src], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src], 0x01)
         "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
@@ -2143,10 +2018,8 @@  static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
         "psllh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
         "psllh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
-        "gsldlc1    %[ftmp2],   0x06(%[src])                            \n\t"
-        "gsldrc1    %[ftmp2],   -0x01(%[src])                           \n\t"
-        "gsldlc1    %[ftmp5],   0x09(%[src])                            \n\t"
-        "gsldrc1    %[ftmp5],   0x02(%[src])                            \n\t"
+        MMI_ULDC1(%[ftmp2], %[src], -0x01)
+        MMI_ULDC1(%[ftmp5], %[src],  0x02)
         "punpckhbh  %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp6],   %[ftmp5],       %[ftmp0]                \n\t"
@@ -2157,10 +2030,8 @@  static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
         "pmullh     %[ftmp1],   %[ftmp1],       %[ff_pw_5]              \n\t"
         "pmullh     %[ftmp3],   %[ftmp3],       %[ff_pw_5]              \n\t"
-        "uld        %[low32],   -0x02(%[src])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x07(%[src])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[src], -0x02)
+        MMI_ULWC1(%[ftmp6], %[src],  0x07)
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
@@ -2171,14 +2042,13 @@  static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
         "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[src2])                           \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[src2])                           \n\t"
+        MMI_LDC1(%[ftmp5], %[src2], 0x00)
         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "ldc1       %[ftmp9],   0x00(%[dst])                            \n\t"
+        MMI_LDC1(%[ftmp9], %[dst], 0x00)
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
         PTR_ADDU   "%[src],     %[src],         %[dstStride]            \n\t"
-        "sdc1       %[ftmp1],   0x00(%[dst])                            \n\t"
+        MMI_SDC1(%[ftmp1], %[dst], 0x00)
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[dstStride]            \n\t"
         PTR_ADDU   "%[src2],    %[src2],        %[src2Stride]           \n\t"
@@ -2189,9 +2059,10 @@  static void avg_h264_qpel8_h_lowpass_l2_mmi(uint8_t *dst, const uint8_t *src,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
           [dst]"+&r"(dst),                  [src]"+&r"(src),
-          [src2]"+&r"(src2),
-          [low32]"=&r"(low32)
+          [src2]"+&r"(src2)
         : [dstStride]"r"((mips_reg)dstStride),
           [src2Stride]"r"((mips_reg)src2Stride),
           [ff_pw_5]"f"(ff_pw_5),            [ff_pw_16]"f"(ff_pw_16)
@@ -2220,39 +2091,39 @@  static void avg_pixels8_l2_shift5_mmi(uint8_t *dst, int16_t *src16,
 {
     double ftmp[8];
     uint64_t tmp0;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     do {
         __asm__ volatile (
             "dli        %[tmp0],    0x05                                \n\t"
-            "gsldlc1    %[ftmp0],   0x07(%[src16])                      \n\t"
-            "gsldrc1    %[ftmp0],   0x00(%[src16])                      \n\t"
+            MMI_ULDC1(%[ftmp0], %[src16], 0x00)
             "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
-            "gsldlc1    %[ftmp1],   0x0f(%[src16])                      \n\t"
-            "gsldrc1    %[ftmp1],   0x08(%[src16])                      \n\t"
-            "gsldlc1    %[ftmp2],   0x37(%[src16])                      \n\t"
-            "gsldrc1    %[ftmp2],   0x30(%[src16])                      \n\t"
-            "gsldlc1    %[ftmp3],   0x3f(%[src16])                      \n\t"
-            "gsldrc1    %[ftmp3],   0x38(%[src16])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src16], 0x08)
+            MMI_ULDC1(%[ftmp2], %[src16], 0x30)
+            MMI_ULDC1(%[ftmp3], %[src16], 0x38)
             "psrah      %[ftmp0],   %[ftmp0],       %[ftmp6]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "psrah      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
             "psrah      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
             "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"
-            "ldc1       %[ftmp4],   0x00(%[src8])                       \n\t"
-            "gsldxc1    %[ftmp5],   0x00(%[src8],   %[src8Stride])      \n\t"
+            MMI_LDC1(%[ftmp4], %[src8], 0x00)
+            MMI_LDXC1(%[ftmp5], %[src8], %[src8Stride], 0x00)
             "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
             "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]            \n\t"
             "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
-            "ldc1       %[ftmp7],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp7], %[dst], 0x00)
             "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp0],   0x00(%[dst])                        \n\t"
-            "gsldxc1    %[ftmp7],   0x00(%[dst],    %[dstStride])       \n\t"
+            MMI_SDC1(%[ftmp0], %[dst], 0x00)
+            MMI_LDXC1(%[ftmp7], %[dst], %[dstStride], 0x00)
             "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
-            "gssdxc1    %[ftmp2],   0x00(%[dst],    %[dstStride])       \n\t"
+            MMI_SDXC1(%[ftmp2], %[dst], %[dstStride], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              RESTRICT_ASM_ALL64
+              RESTRICT_ASM_ADDRT
               [tmp0]"=&r"(tmp0)
             : [src8]"r"(src8),              [src16]"r"(src16),
               [dst]"r"(dst),
diff --git a/libavcodec/mips/hpeldsp_mmi.c b/libavcodec/mips/hpeldsp_mmi.c
index 4c46f00..2dbef22 100644
--- a/libavcodec/mips/hpeldsp_mmi.c
+++ b/libavcodec/mips/hpeldsp_mmi.c
@@ -23,7 +23,7 @@ 
 
 #include "hpeldsp_mips.h"
 #include "libavcodec/bit_depth_template.c"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 #include "constants.h"
 
 void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
@@ -31,36 +31,34 @@  void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
 {
     double ftmp[2];
     mips_reg addr[2];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "uld        %[low32],   0x00(%[pixels])                         \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "uld        %[low32],   0x00(%[addr0])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SWC1(%[ftmp0], %[block], 0x00)
+        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
 
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "uld        %[low32],   0x00(%[pixels])                         \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "uld        %[low32],   0x00(%[addr0])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SWC1(%[ftmp0], %[block], 0x00)
+        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
 
         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-          [low32]"=&r"(low32),
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
         : [line_size]"r"((mips_reg)line_size)
@@ -72,35 +70,36 @@  void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
     ptrdiff_t line_size, int h)
 {
     double ftmp[2];
-    mips_reg addr[2];
+    mips_reg addr[3];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        PTR_ADDU   "%[addr2],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp1], %[addr2], 0x00)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
 
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        PTR_ADDU   "%[addr2],   %[block],       %[line_size]            \n\t"
+        MMI_SDC1(%[ftmp1], %[addr2], 0x00)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
 
         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
         : [line_size]"r"((mips_reg)line_size)
@@ -113,39 +112,33 @@  void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
 {
     double ftmp[4];
     mips_reg addr[2];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp2],   0x0f(%[pixels])                         \n\t"
-        "gsldrc1    %[ftmp2],   0x08(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x08(%[addr0])                          \n\t"
-        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
-        "sdc1       %[ftmp2],   0x08(%[block])                          \n\t"
-        "gssdxc1    %[ftmp3],   0x08(%[block],  %[line_size])           \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr0], 0x08)
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
+        MMI_SDC1(%[ftmp2], %[block], 0x08)
+        MMI_SDXC1(%[ftmp3], %[block], %[line_size], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
 
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp2],   0x0f(%[pixels])                         \n\t"
-        "gsldrc1    %[ftmp2],   0x08(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x08(%[addr0])                          \n\t"
-        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
-        "sdc1       %[ftmp2],   0x08(%[block])                          \n\t"
-        "gssdxc1    %[ftmp3],   0x08(%[block],  %[line_size])           \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp2], %[pixels], 0x08)
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr0], 0x08)
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
+        MMI_SDC1(%[ftmp2], %[block], 0x08)
+        MMI_SDXC1(%[ftmp3], %[block], %[line_size], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
 
@@ -153,6 +146,8 @@  void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
@@ -166,42 +161,35 @@  void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
 {
     double ftmp[4];
     mips_reg addr[3];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "uld        %[low32],   0x00(%[pixels])                         \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "uld        %[low32],   0x00(%[addr0])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
-        "uld        %[low32],   0x00(%[block])                          \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x00(%[addr1])                          \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[block], 0x00)
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        MMI_SWC1(%[ftmp0], %[block], 0x00)
+        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
 
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "uld        %[low32],   0x00(%[pixels])                         \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "uld        %[low32],   0x00(%[addr0])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
-        "uld        %[low32],   0x00(%[block])                          \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x00(%[addr1])                          \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[block], 0x00)
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        MMI_SWC1(%[ftmp0], %[block], 0x00)
+        MMI_SWXC1(%[ftmp1], %[block], %[line_size], 0x00)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
 
@@ -209,9 +197,10 @@  void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),
-          [low32]"=&r"(low32),
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
           [h]"+&r"(h)
         : [line_size]"r"((mips_reg)line_size)
@@ -224,41 +213,35 @@  void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
 {
     double ftmp[4];
     mips_reg addr[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[block], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
 
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[block], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
 
@@ -266,6 +249,8 @@  void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
@@ -280,65 +265,51 @@  void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
 {
     double ftmp[8];
     mips_reg addr[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp4],   0x0f(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
-        "gsldrc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x0f(%[block])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x08(%[block])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
-        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
+        MMI_ULDC1(%[ftmp2], %[block], 0x00)
+        MMI_ULDC1(%[ftmp6], %[block], 0x08)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
-        "sdc1       %[ftmp4],   0x08(%[block])                          \n\t"
-        "gssdxc1    %[ftmp5],   0x08(%[block],  %[line_size])           \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
+        MMI_SDC1(%[ftmp4], %[block], 0x08)
+        MMI_SDXC1(%[ftmp5], %[block], %[line_size], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
 
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp4],   0x0f(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp4], %[pixels], 0x08)
         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
-        "gsldrc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x0f(%[block])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x08(%[block])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
-        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
+        MMI_ULDC1(%[ftmp2], %[block], 0x00)
+        MMI_ULDC1(%[ftmp6], %[block], 0x08)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
+        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
-        "sdc1       %[ftmp4],   0x08(%[block])                          \n\t"
-        "gssdxc1    %[ftmp5],   0x08(%[block],  %[line_size])           \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp1], %[block], %[line_size], 0x00)
+        MMI_SDC1(%[ftmp4], %[block], 0x08)
+        MMI_SDXC1(%[ftmp5], %[block], %[line_size], 0x08)
         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
 
@@ -348,6 +319,8 @@  void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),
           [block]"+&r"(block),              [pixels]"+&r"(pixels),
@@ -363,7 +336,8 @@  inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
 {
     double ftmp[4];
     mips_reg addr[5];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
@@ -371,38 +345,30 @@  inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
         "1:                                                             \n\t"
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "uld        %[low32],   0x00(%[src1])                           \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "uld        %[low32],   0x00(%[addr0])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[src2])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "uld        %[low32],   0x00(%[addr1])                          \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "uld        %[low32],   0x00(%[src1])                           \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "uld        %[low32],   0x00(%[addr0])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[src2])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "uld        %[low32],   0x00(%[addr1])                          \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
@@ -410,10 +376,11 @@  inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),
-          [low32]"=&r"(low32),
           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
           [src2]"+&r"(src2),                [h]"+&r"(h)
         : [dst_stride]"r"((mips_reg)dst_stride),
@@ -429,45 +396,40 @@  inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
 {
     double ftmp[4];
     mips_reg addr[5];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
@@ -475,6 +437,8 @@  inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
         "bnez       %[h],       1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),
@@ -493,69 +457,56 @@  inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
 {
     double ftmp[8];
     mips_reg addr[5];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp4],   0x0f(%[src1])                           \n\t"
-        "gsldrc1    %[ftmp4],   0x08(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp6],   0x0f(%[src2])                           \n\t"
-        "gsldrc1    %[ftmp6],   0x08(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
-        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
-        "sdc1       %[ftmp4],   0x08(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp5],   0x08(%[dst],    %[dst_stride])          \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst], 0x08)
+        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp4],   0x0f(%[src1])                           \n\t"
-        "gsldrc1    %[ftmp4],   0x08(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        MMI_ULDC1(%[ftmp4], %[src1], 0x08)
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr0], 0x08)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp6],   0x0f(%[src2])                           \n\t"
-        "gsldrc1    %[ftmp6],   0x08(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp6], %[src2], 0x08)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
-        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp7], %[addr1], 0x08)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
-        "sdc1       %[ftmp4],   0x08(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp5],   0x08(%[dst],    %[dst_stride])          \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst], 0x08)
+        MMI_SDXC1(%[ftmp5], %[dst], %[dst_stride], 0x08)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
@@ -565,6 +516,8 @@  inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),
@@ -583,60 +536,50 @@  inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
 {
     double ftmp[6];
     mips_reg addr[6];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
         "1:                                                             \n\t"
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "uld        %[low32],   0x00(%[src1])                           \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "uld        %[low32],   0x00(%[addr0])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[src2])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "uld        %[low32],   0x00(%[addr1])                          \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
-        "uld        %[low32],   0x00(%[dst])                            \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
-        "uld        %[low32],   0x00(%[addr5])                          \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
+        MMI_ULWC1(%[ftmp5], %[addr5], 0x00)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
-        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "uld        %[low32],   0x00(%[src1])                           \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
-        "uld        %[low32],   0x00(%[addr0])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[src2])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[src1], 0x00)
+        MMI_ULWC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULWC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "uld        %[low32],   0x00(%[addr1])                          \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
-        "uld        %[low32],   0x00(%[dst])                            \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
-        "uld        %[low32],   0x00(%[addr5])                          \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_ULWC1(%[ftmp4], %[dst], 0x00)
+        MMI_ULWC1(%[ftmp5], %[addr5], 0x00)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
-        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SWC1(%[ftmp0], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
@@ -645,10 +588,11 @@  inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
-          [low32]"=&r"(low32),
           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
           [src2]"+&r"(src2),                [h]"+&r"(h)
         : [dst_stride]"r"((mips_reg)dst_stride),
@@ -664,59 +608,50 @@  inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
 {
     double ftmp[6];
     mips_reg addr[6];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[dst])                            \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[dst])                            \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[dst])                            \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[dst])                            \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[dst], 0x00)
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
@@ -725,6 +660,8 @@  inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
@@ -795,24 +732,23 @@  inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
 {
     double ftmp[5];
     mips_reg addr[5];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
         "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
@@ -821,22 +757,18 @@  inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp0], %[src1], 0x00)
         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_ULDC1(%[ftmp2], %[src2], 0x00)
         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
         "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
@@ -845,8 +777,8 @@  inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
-        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        MMI_SDC1(%[ftmp0], %[dst], 0x00)
+        MMI_SDXC1(%[ftmp1], %[dst], %[dst_stride], 0x00)
         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
 
@@ -855,6 +787,8 @@  inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),
@@ -981,6 +915,8 @@  void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
 #if 1
     double ftmp[10];
     mips_reg addr[2];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
@@ -993,11 +929,9 @@  void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
 
         "dli        %[addr0],   0x02                                    \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
         "dmtc1      %[addr0],   %[ftmp9]                                \n\t"
-        "gsldlc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
-        "gsldrc1    %[ftmp4],   0x01(%[pixels])                         \n\t"
+        MMI_ULDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_ULDC1(%[ftmp4], %[pixels], 0x01)
         "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
         "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
@@ -1009,12 +943,11 @@  void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
         "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
         ".p2align   3                                                   \n\t"
+
         "1:                                                             \n\t"
         PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x08(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x01(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x01)
         "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
         "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
@@ -1030,13 +963,11 @@  void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
         "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
         "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "gssdxc1    %[ftmp4],   0x00(%[block],  %[addr0])               \n\t"
+        MMI_SDXC1(%[ftmp4], %[block], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
         PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
-        "gsldlc1    %[ftmp4],   0x08(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp4],   0x01(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
+        MMI_ULDC1(%[ftmp4], %[addr1], 0x01)
         "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
         "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
@@ -1052,7 +983,7 @@  void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
         "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
         "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
-        "gssdxc1    %[ftmp0],   0x00(%[block],  %[addr0])               \n\t"
+        MMI_SDXC1(%[ftmp0], %[block], %[addr0], 0x00)
         PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
         PTR_ADDU   "%[h],       %[h],           -0x02                   \n\t"
         "bnez       %[h],       1b                                      \n\t"
@@ -1061,6 +992,8 @@  void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [h]"+&r"(h),                      [pixels]"+&r"(pixels)
         : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
diff --git a/libavcodec/mips/idctdsp_mmi.c b/libavcodec/mips/idctdsp_mmi.c
index 24beb62..b797965 100644
--- a/libavcodec/mips/idctdsp_mmi.c
+++ b/libavcodec/mips/idctdsp_mmi.c
@@ -23,36 +23,40 @@ 
 
 #include "idctdsp_mips.h"
 #include "constants.h"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_put_pixels_clamped_mmi(const int16_t *block,
         uint8_t *av_restrict pixels, ptrdiff_t line_size)
 {
     double ftmp[8];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
-        "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
-        "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
-        "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
-        "ldc1       %[ftmp4],   0x20(%[block])                          \n\t"
-        "ldc1       %[ftmp5],   0x28(%[block])                          \n\t"
-        "ldc1       %[ftmp6],   0x30(%[block])                          \n\t"
-        "ldc1       %[ftmp7],   0x38(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
+        MMI_LDC1(%[ftmp4], %[block], 0x20)
+        MMI_LDC1(%[ftmp5], %[block], 0x28)
+        MMI_LDC1(%[ftmp6], %[block], 0x30)
+        MMI_LDC1(%[ftmp7], %[block], 0x38)
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
-        "gssdxc1    %[ftmp4],   0x00(%[addr0],  %[line_size])           \n\t"
-        "gssdxc1    %[ftmp6],   0x00(%[pixels], %[line_sizex3])         \n\t"
+        MMI_SDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
+        MMI_SDXC1(%[ftmp4], %[addr0], %[line_size], 0x00)
+        MMI_SDXC1(%[ftmp6], %[pixels], %[line_sizex3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),
           [pixels]"+&r"(pixels)
         : [line_size]"r"((mips_reg)line_size),
@@ -65,27 +69,29 @@  void ff_put_pixels_clamped_mmi(const int16_t *block,
     block += 32;
 
     __asm__ volatile (
-        "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
-        "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
-        "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
-        "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
-        "ldc1       %[ftmp4],   0x20(%[block])                          \n\t"
-        "ldc1       %[ftmp5],   0x28(%[block])                          \n\t"
-        "ldc1       %[ftmp6],   0x30(%[block])                          \n\t"
-        "ldc1       %[ftmp7],   0x38(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
+        MMI_LDC1(%[ftmp4], %[block], 0x20)
+        MMI_LDC1(%[ftmp5], %[block], 0x28)
+        MMI_LDC1(%[ftmp6], %[block], 0x30)
+        MMI_LDC1(%[ftmp7], %[block], 0x38)
         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "packushb   %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[pixels])                         \n\t"
-        "sdc1       %[ftmp2],   0x00(%[addr0])                          \n\t"
-        "gssdxc1    %[ftmp4],   0x00(%[addr0],  %[line_size])           \n\t"
-        "gssdxc1    %[ftmp6],   0x00(%[pixels], %[line_sizex3])         \n\t"
+        MMI_SDC1(%[ftmp0], %[pixels], 0x00)
+        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
+        MMI_SDXC1(%[ftmp4], %[addr0], %[line_size], 0x00)
+        MMI_SDXC1(%[ftmp6], %[pixels], %[line_sizex3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),
           [pixels]"+&r"(pixels)
         : [line_size]"r"((mips_reg)line_size),
@@ -102,56 +108,60 @@  void ff_put_signed_pixels_clamped_mmi(const int16_t *block,
     int64_t line_skip3 = 0;
     double ftmp[5];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[line_skip3],  %[line_skip],   %[line_skip]        \n\t"
-        "ldc1       %[ftmp1],       0x00(%[block])                      \n\t"
-        "ldc1       %[ftmp0],       0x08(%[block])                      \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        MMI_LDC1(%[ftmp0], %[block], 0x08)
         "packsshb   %[ftmp1],       %[ftmp1],       %[ftmp0]            \n\t"
-        "ldc1       %[ftmp2],       0x10(%[block])                      \n\t"
-        "ldc1       %[ftmp0],       0x18(%[block])                      \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
+        MMI_LDC1(%[ftmp0], %[block], 0x18)
         "packsshb   %[ftmp2],       %[ftmp2],       %[ftmp0]            \n\t"
-        "ldc1       %[ftmp3],       0x20(%[block])                      \n\t"
-        "ldc1       %[ftmp0],       0x28(%[block])                      \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x20)
+        MMI_LDC1(%[ftmp0], %[block], 0x28)
         "packsshb   %[ftmp3],       %[ftmp3],       %[ftmp0]            \n\t"
-        "ldc1       %[ftmp4],       48(%[block])                        \n\t"
-        "ldc1       %[ftmp0],       56(%[block])                        \n\t"
+        MMI_LDC1(%[ftmp4], %[block], 0x30)
+        MMI_LDC1(%[ftmp0], %[block], 0x38)
         "packsshb   %[ftmp4],       %[ftmp4],       %[ftmp0]            \n\t"
         "paddb      %[ftmp1],       %[ftmp1],       %[ff_pb_80]         \n\t"
         "paddb      %[ftmp2],       %[ftmp2],       %[ff_pb_80]         \n\t"
         "paddb      %[ftmp3],       %[ftmp3],       %[ff_pb_80]         \n\t"
         "paddb      %[ftmp4],       %[ftmp4],       %[ff_pb_80]         \n\t"
-        "sdc1       %[ftmp1],       0x00(%[pixels])                     \n\t"
-        "gssdxc1    %[ftmp2],       0x00(%[pixels], %[line_skip])       \n\t"
-        "gssdxc1    %[ftmp3],       0x00(%[pixels], %[line_skip3])      \n\t"
+        MMI_SDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_SDXC1(%[ftmp2], %[pixels], %[line_skip], 0x00)
+        MMI_SDXC1(%[ftmp3], %[pixels], %[line_skip3], 0x00)
         PTR_ADDU   "%[line_skip3],  %[line_skip3],  %[line_skip]        \n\t"
-        "gssdxc1    %[ftmp4],       0x00(%[pixels], %[line_skip3])      \n\t"
+        MMI_SDXC1(%[ftmp4], %[pixels], %[line_skip3], 0x00)
         PTR_ADDU   "%[addr0],       %[line_skip3],  %[line_skip]        \n\t"
         PTR_ADDU   "%[pixels],      %[pixels],      %[addr0]            \n\t"
-        "ldc1       %[ftmp1],       0x40(%[block])                      \n\t"
-        "ldc1       %[ftmp0],       0x48(%[block])                      \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x40)
+        MMI_LDC1(%[ftmp0], %[block], 0x48)
         "packsshb   %[ftmp1],       %[ftmp1],       %[ftmp0]            \n\t"
-        "ldc1       %[ftmp2],       0x50(%[block])                      \n\t"
-        "ldc1       %[ftmp0],       0x58(%[block])                      \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x50)
+        MMI_LDC1(%[ftmp0], %[block], 0x58)
         "packsshb   %[ftmp2],       %[ftmp2],       %[ftmp0]            \n\t"
-        "ldc1       %[ftmp3],       0x60(%[block])                      \n\t"
-        "ldc1       %[ftmp0],       0x68(%[block])                      \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x60)
+        MMI_LDC1(%[ftmp0], %[block], 0x68)
         "packsshb   %[ftmp3],       %[ftmp3],       %[ftmp0]            \n\t"
-        "ldc1       %[ftmp4],       0x70(%[block])                      \n\t"
-        "ldc1       %[ftmp0],       0x78(%[block])                      \n\t"
+        MMI_LDC1(%[ftmp4], %[block], 0x70)
+        MMI_LDC1(%[ftmp0], %[block], 0x78)
         "packsshb   %[ftmp4],       %[ftmp4],       %[ftmp0]            \n\t"
         "paddb      %[ftmp1],       %[ftmp1],       %[ff_pb_80]         \n\t"
         "paddb      %[ftmp2],       %[ftmp2],       %[ff_pb_80]         \n\t"
         "paddb      %[ftmp3],       %[ftmp3],       %[ff_pb_80]         \n\t"
         "paddb      %[ftmp4],       %[ftmp4],       %[ff_pb_80]         \n\t"
-        "sdc1       %[ftmp1],       0x00(%[pixels])                     \n\t"
-        "gssdxc1    %[ftmp2],       0x00(%[pixels], %[line_skip])       \n\t"
+        MMI_SDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_SDXC1(%[ftmp2], %[pixels], %[line_skip], 0x00)
         PTR_ADDU   "%[addr0],       %[line_skip],   %[line_skip]        \n\t"
-        "gssdxc1    %[ftmp3],       0x00(%[pixels], %[addr0])           \n\t"
-        "gssdxc1    %[ftmp4],       0x00(%[pixels], %[line_skip3])      \n\t"
+        MMI_SDXC1(%[ftmp3], %[pixels], %[addr0], 0x00)
+        MMI_SDXC1(%[ftmp4], %[pixels], %[line_skip3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0]),
           [pixels]"+&r"(pixels),            [line_skip3]"+&r"(line_skip3)
         : [block]"r"(block),
@@ -166,17 +176,20 @@  void ff_add_pixels_clamped_mmi(const int16_t *block,
 {
     double ftmp[8];
     uint64_t tmp[1];
+    mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "li         %[tmp0],    0x04                                    \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "1:                                                             \n\t"
-        "ldc1       %[ftmp1],   0x00(%[block])                          \n\t"
-        "ldc1       %[ftmp2],   0x08(%[block])                          \n\t"
-        "ldc1       %[ftmp3],   0x10(%[block])                          \n\t"
-        "ldc1       %[ftmp4],   0x18(%[block])                          \n\t"
-        "ldc1       %[ftmp5],   0x00(%[pixels])                         \n\t"
-        "gsldxc1    %[ftmp6],   0x00(%[pixels], %[line_size])           \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
+        MMI_LDC1(%[ftmp2], %[block], 0x08)
+        MMI_LDC1(%[ftmp3], %[block], 0x10)
+        MMI_LDC1(%[ftmp4], %[block], 0x18)
+        MMI_LDC1(%[ftmp5], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp6], %[pixels], %[line_size], 0x00)
         "mov.d      %[ftmp7],   %[ftmp5]                                \n\t"
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
@@ -189,8 +202,8 @@  void ff_add_pixels_clamped_mmi(const int16_t *block,
         "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
         "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
-        "sdc1       %[ftmp1],   0x00(%[pixels])                         \n\t"
-        "gssdxc1    %[ftmp3],   0x00(%[pixels], %[line_size])           \n\t"
+        MMI_SDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_SDXC1(%[ftmp3], %[pixels], %[line_size], 0x00)
         "addi       %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDIU  "%[block],   %[block],       0x20                    \n\t"
         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
@@ -201,6 +214,9 @@  void ff_add_pixels_clamped_mmi(const int16_t *block,
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),
           [pixels]"+&r"(pixels),            [block]"+&r"(block)
         : [line_size]"r"((mips_reg)line_size)
         : "memory"
diff --git a/libavcodec/mips/mpegvideo_mmi.c b/libavcodec/mips/mpegvideo_mmi.c
index 450a18c..18058e4 100644
--- a/libavcodec/mips/mpegvideo_mmi.c
+++ b/libavcodec/mips/mpegvideo_mmi.c
@@ -23,7 +23,7 @@ 
  */
 
 #include "mpegvideo_mips.h"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
         int n, int qscale)
@@ -31,6 +31,7 @@  void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
     int64_t level, qmul, qadd, nCoeffs;
     double ftmp[6];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     qmul = qscale << 1;
     av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
@@ -60,12 +61,11 @@  void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
         "psubh      %[ftmp0],   %[ftmp0],       %[qadd]                 \n\t"
         "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         ".p2align   4                                                   \n\t"
+
         "1:                                                             \n\t"
         PTR_ADDU   "%[addr0],   %[block],       %[nCoeffs]              \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDC1(%[ftmp2], %[addr0], 0x08)
         "mov.d      %[ftmp3],   %[ftmp1]                                \n\t"
         "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
         "pmullh     %[ftmp1],   %[ftmp1],       %[qmul]                 \n\t"
@@ -83,14 +83,13 @@  void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
         "pandn      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "pandn      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
         PTR_ADDIU  "%[nCoeffs], %[nCoeffs],     0x10                    \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gssdlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
-        "gssdrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp2], %[addr0], 0x08)
         "blez       %[nCoeffs], 1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0])
         : [block]"r"((mips_reg)(block+nCoeffs)),
           [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
@@ -107,6 +106,7 @@  void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
     int64_t qmul, qadd, nCoeffs;
     double ftmp[6];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     qmul = qscale << 1;
     qadd = (qscale - 1) | 1;
@@ -124,10 +124,8 @@  void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
         ".p2align   4                                                   \n\t"
         "1:                                                             \n\t"
         PTR_ADDU   "%[addr0],   %[block],       %[nCoeffs]              \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDC1(%[ftmp2], %[addr0], 0x08)
         "mov.d      %[ftmp3],   %[ftmp1]                                \n\t"
         "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
         "pmullh     %[ftmp1],   %[ftmp1],       %[qmul]                 \n\t"
@@ -145,14 +143,13 @@  void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
         "pandn      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         "pandn      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
         PTR_ADDIU  "%[nCoeffs], %[nCoeffs],     0x10                    \n\t"
-        "gssdlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
-        "gssdrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gssdlc1    %[ftmp2],   0x0f(%[addr0])                          \n\t"
-        "gssdrc1    %[ftmp2],   0x08(%[addr0])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp2], %[addr0], 0x08)
         "blez       %[nCoeffs], 1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0])
         : [block]"r"((mips_reg)(block+nCoeffs)),
           [nCoeffs]"r"((mips_reg)(2*(-nCoeffs))),
@@ -170,6 +167,8 @@  void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
     double ftmp[10];
     uint64_t tmp[1];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     av_assert2(s->block_last_index[n]>=0);
     nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
@@ -192,13 +191,14 @@  void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
         "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
         "or         %[addr0],   %[nCoeffs],     $0                      \n\t"
         ".p2align   4                                                   \n\t"
+
         "1:                                                             \n\t"
-        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[block])               \n\t"
-        "gsldxc1    %[ftmp3],   0x08(%[addr0],  %[block])               \n\t"
+        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x00)
+        MMI_LDXC1(%[ftmp3], %[addr0], %[block], 0x08)
         "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
         "mov.d      %[ftmp5],   %[ftmp3]                                \n\t"
-        "gsldxc1    %[ftmp6],   0x00(%[addr0],  %[quant])               \n\t"
-        "gsldxc1    %[ftmp7],   0x08(%[addr0],  %[quant])               \n\t"
+        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x00)
+        MMI_LDXC1(%[ftmp7], %[addr0], %[quant], 0x08)
         "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
         "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
         "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
@@ -229,8 +229,8 @@  void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
         "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
         "pandn      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
-        "gssdxc1    %[ftmp6],   0x00(%[addr0],  %[block])               \n\t"
-        "gssdxc1    %[ftmp7],   0x08(%[addr0],  %[block])               \n\t"
+        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp7], %[addr0], %[block], 0x08)
         PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t"
         "bltz       %[addr0],   1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -239,6 +239,8 @@  void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0])
         : [block]"r"((mips_reg)(block+nCoeffs)),
           [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
@@ -258,6 +260,8 @@  void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
     double ftmp[10];
     uint64_t tmp[1];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     av_assert2(s->block_last_index[n] >= 0);
     nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;
@@ -273,13 +277,14 @@  void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
         "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
         "or         %[addr0],   %[nCoeffs],     $0                      \n\t"
         ".p2align   4                                                   \n\t"
+
         "1:                                                             \n\t"
-        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[block])               \n\t"
-        "gsldxc1    %[ftmp3],   0x08(%[addr0],  %[block])               \n\t"
+        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x00)
+        MMI_LDXC1(%[ftmp3], %[addr0], %[block], 0x08)
         "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
         "mov.d      %[ftmp5],   %[ftmp3]                                \n\t"
-        "gsldxc1    %[ftmp6],   0x00(%[addr0],  %[quant])               \n\t"
-        "gsldxc1    %[ftmp7],   0x08(%[addr0],  %[quant])               \n\t"
+        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x00)
+        MMI_LDXC1(%[ftmp7], %[addr0], %[quant], 0x08)
         "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
         "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
         "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
@@ -314,8 +319,8 @@  void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
         "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
         "pandn      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
-        "gssdxc1    %[ftmp6],   0x00(%[addr0],  %[block])               \n\t"
-        "gssdxc1    %[ftmp7],   0x08(%[addr0],  %[block])               \n\t"
+        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp7], %[addr0], %[block], 0x08)
         PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t"
         "bltz       %[addr0],   1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -324,6 +329,8 @@  void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0])
         : [block]"r"((mips_reg)(block+nCoeffs)),
           [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
@@ -342,6 +349,8 @@  void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
     double ftmp[10];
     uint64_t tmp[1];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     assert(s->block_last_index[n]>=0);
 
@@ -367,13 +376,14 @@  void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
         "packsswh   %[ftmp9],   %[ftmp9],       %[ftmp9]                \n\t"
         "or         %[addr0],   %[nCoeffs],     $0                      \n\t"
         ".p2align   4                                                   \n\t"
+
         "1:                                                             \n\t"
-        "gsldxc1    %[ftmp1],   0x00(%[addr0],  %[block])               \n\t"
-        "gsldxc1    %[ftmp2],   0x08(%[addr0],  %[block])               \n\t"
+        MMI_LDXC1(%[ftmp1], %[addr0], %[block], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[block], 0x08)
         "mov.d      %[ftmp3],   %[ftmp1]                                \n\t"
         "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
-        "gsldxc1    %[ftmp5],   0x00(%[addr0],  %[quant])               \n\t"
-        "gsldxc1    %[ftmp6],   0x00(%[addr0],  %[quant])               \n\t"
+        MMI_LDXC1(%[ftmp5], %[addr0], %[quant], 0x00)
+        MMI_LDXC1(%[ftmp6], %[addr0], %[quant], 0x08)
         "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
         "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp9]                \n\t"
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
@@ -401,8 +411,8 @@  void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
         "pandn      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
         "pandn      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
         PTR_ADDIU  "%[addr0],   %[addr0],       0x10                    \n\t"
-        "gssdxc1    %[ftmp5],   0x00(%[addr0],  %[block])               \n\t"
-        "gssdxc1    %[ftmp6],   0x08(%[addr0],  %[block])               \n\t"
+        MMI_SDXC1(%[ftmp5], %[addr0], %[block], 0x00)
+        MMI_SDXC1(%[ftmp6], %[addr0], %[block], 0x08)
         "blez       %[addr0],   1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
@@ -410,6 +420,8 @@  void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0])
         : [block]"r"((mips_reg)(block+nCoeffs)),
           [quant]"r"((mips_reg)(quant_matrix+nCoeffs)),
@@ -428,15 +440,16 @@  void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
     uint16_t *offset = s->dct_offset[intra];
     double ftmp[8];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     s->dct_count[intra]++;
 
     __asm__ volatile(
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "1:                                                             \n\t"
-        "ldc1       %[ftmp1],   0x00(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x00)
         "xor        %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
-        "ldc1       %[ftmp3],   0x08(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x08)
         "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
         "pcmpgth    %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
         "pcmpgth    %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
@@ -444,36 +457,36 @@  void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
         "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
-        "ldc1       %[ftmp6],   0x00(%[offset])                         \n\t"
+        MMI_LDC1(%[ftmp6], %[offset], 0x00)
         "mov.d      %[ftmp5],   %[ftmp1]                                \n\t"
         "psubush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
-        "ldc1       %[ftmp6],   0x08(%[offset])                         \n\t"
+        MMI_LDC1(%[ftmp6], %[offset], 0x08)
         "mov.d      %[ftmp7],   %[ftmp3]                                \n\t"
         "psubush    %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
         "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
         "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
-        "sdc1       %[ftmp1],   0x00(%[block])                          \n\t"
-        "sdc1       %[ftmp3],   0x08(%[block])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[block], 0x00)
+        MMI_SDC1(%[ftmp3], %[block], 0x08)
         "mov.d      %[ftmp1],   %[ftmp5]                                \n\t"
         "mov.d      %[ftmp3],   %[ftmp7]                                \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
         "punpckhhw  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
         "punpckhhw  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
-        "ldc1       %[ftmp2],   0x00(%[sum])                            \n\t"
+        MMI_LDC1(%[ftmp2], %[sum], 0x00)
         "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
-        "ldc1       %[ftmp2],   0x08(%[sum])                            \n\t"
+        MMI_LDC1(%[ftmp2], %[sum], 0x08)
         "paddw      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
-        "ldc1       %[ftmp2],   0x10(%[sum])                            \n\t"
+        MMI_LDC1(%[ftmp2], %[sum], 0x10)
         "paddw      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
-        "ldc1       %[ftmp2],   0x18(%[sum])                            \n\t"
+        MMI_LDC1(%[ftmp2], %[sum], 0x18)
         "paddw      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
-        "sdc1       %[ftmp5],   0x00(%[sum])                            \n\t"
-        "sdc1       %[ftmp1],   0x08(%[sum])                            \n\t"
-        "sdc1       %[ftmp7],   0x10(%[sum])                            \n\t"
-        "sdc1       %[ftmp3],   0x18(%[sum])                            \n\t"
+        MMI_SDC1(%[ftmp5], %[sum], 0x00)
+        MMI_SDC1(%[ftmp1], %[sum], 0x08)
+        MMI_SDC1(%[ftmp7], %[sum], 0x10)
+        MMI_SDC1(%[ftmp3], %[sum], 0x18)
         PTR_ADDIU  "%[block],   %[block],       0x10                    \n\t"
         PTR_ADDIU  "%[sum],     %[sum],         0x20                    \n\t"
         PTR_SUBU   "%[addr0],   %[block1],      %[block]                \n\t"
@@ -483,6 +496,7 @@  void ff_denoise_dct_mmi(MpegEncContext *s, int16_t *block)
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),
           [block]"+&r"(block),              [sum]"+&r"(sum),
           [offset]"+&r"(offset)
diff --git a/libavcodec/mips/pixblockdsp_mmi.c b/libavcodec/mips/pixblockdsp_mmi.c
index 3ff84c0..9f2eac3 100644
--- a/libavcodec/mips/pixblockdsp_mmi.c
+++ b/libavcodec/mips/pixblockdsp_mmi.c
@@ -23,34 +23,73 @@ 
 
 #include "pixblockdsp_mips.h"
 #include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_get_pixels_8_mmi(int16_t *av_restrict block, const uint8_t *pixels,
         ptrdiff_t line_size)
 {
-    double ftmp[6];
-    mips_reg tmp[2];
+    double ftmp[7];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
-        "li         %[tmp1],    0x08                                    \n\t"
-        "move       %[tmp0],    $0                                      \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "1:                                                             \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[pixels])                         \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[pixels])                         \n\t"
-        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]                \n\t"
-        "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]                \n\t"
-        "gssdxc1    %[ftmp2],   0x00(%[block],  %[tmp0])                \n\t"
-        "gssdxc1    %[ftmp5],   0x08(%[block],  %[tmp0])                \n\t"
-        PTR_ADDI   "%[tmp1],    %[tmp1],       -0x01                    \n\t"
-        PTR_ADDIU  "%[tmp0],    %[tmp0],        0x10                    \n\t"
-        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
-        "bnez       %[tmp1],    1b                                      \n\t"
+
+        MMI_LDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp2], %[pixels], %[line_size], 0x00)
+        "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x00)
+        MMI_SDC1(%[ftmp4], %[block], 0x08)
+        MMI_SDC1(%[ftmp5], %[block], 0x10)
+        MMI_SDC1(%[ftmp6], %[block], 0x18)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size_x2]         \n\t"
+
+        MMI_LDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp2], %[pixels], %[line_size], 0x00)
+        "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x20)
+        MMI_SDC1(%[ftmp4], %[block], 0x28)
+        MMI_SDC1(%[ftmp5], %[block], 0x30)
+        MMI_SDC1(%[ftmp6], %[block], 0x38)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size_x2]         \n\t"
+
+        MMI_LDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp2], %[pixels], %[line_size], 0x00)
+        "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x40)
+        MMI_SDC1(%[ftmp4], %[block], 0x48)
+        MMI_SDC1(%[ftmp5], %[block], 0x50)
+        MMI_SDC1(%[ftmp6], %[block], 0x58)
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size_x2]         \n\t"
+
+        MMI_LDC1(%[ftmp1], %[pixels], 0x00)
+        MMI_LDXC1(%[ftmp2], %[pixels], %[line_size], 0x00)
+        "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x60)
+        MMI_SDC1(%[ftmp4], %[block], 0x68)
+        MMI_SDC1(%[ftmp5], %[block], 0x70)
+        MMI_SDC1(%[ftmp6], %[block], 0x78)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
-          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [ftmp6]"=&f"(ftmp[6]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [pixels]"+&r"(pixels)
-        : [block]"r"((mips_reg)block),      [line_size]"r"((mips_reg)line_size)
+        : [block]"r"((mips_reg)block),      [line_size]"r"((mips_reg)line_size),
+          [line_size_x2]"r"((mips_reg)(line_size<<1))
         : "memory"
     );
 }
@@ -60,16 +99,15 @@  void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
 {
     double ftmp[5];
     mips_reg tmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "li         %[tmp0],    0x08                                    \n\t"
         "xor        %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
         "1:                                                             \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        MMI_LDC1(%[ftmp0], %[src1], 0x00)
         "or         %[ftmp1],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        MMI_LDC1(%[ftmp2], %[src2], 0x00)
         "or         %[ftmp3],   %[ftmp2],       %[ftmp2]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
@@ -77,10 +115,8 @@  void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
         "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
         "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
-        "gssdlc1    %[ftmp1],   0x0f(%[block])                          \n\t"
-        "gssdrc1    %[ftmp1],   0x08(%[block])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp1], %[block], 0x08)
         PTR_ADDI   "%[tmp0],    %[tmp0], -0x01                          \n\t"
         PTR_ADDIU  "%[block],   %[block], 0x10                          \n\t"
         PTR_ADDU   "%[src1],    %[src1],        %[stride]               \n\t"
@@ -90,6 +126,7 @@  void ff_diff_pixels_mmi(int16_t *av_restrict block, const uint8_t *src1,
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [block]"+&r"(block),              [src1]"+&r"(src1),
           [src2]"+&r"(src2)
         : [stride]"r"((mips_reg)stride)