[FFmpeg-devel,v5,1/7] avcodec/la: add LSX optimization for h264 idct.

Message ID 20230525072432.29928-2-chenhao@loongson.cn
State Accepted
Commit e1b6ecd20a01c5d902cd89c034b960ddadea778e
Series [FFmpeg-devel,v5,1/7] avcodec/la: add LSX optimization for h264 idct.

Checks

Context               Check     Description
andriy/make_x86       success   Make finished
andriy/make_fate_x86  success   Make fate finished

Commit Message

陈昊 (Chen Hao) May 25, 2023, 7:24 a.m. UTC
From: Shiyou Yin <yinshiyou-hf@loongson.cn>

loongson_asm.S is a LoongArch assembly optimization helper.
Add functions:
  ff_h264_idct_add_8_lsx
  ff_h264_idct8_add_8_lsx
  ff_h264_idct_dc_add_8_lsx
  ff_h264_idct8_dc_add_8_lsx
  ff_h264_idct_add16_8_lsx
  ff_h264_idct8_add4_8_lsx
  ff_h264_idct_add8_8_lsx
  ff_h264_idct_add8_422_8_lsx
  ff_h264_idct_add16_intra_8_lsx
  ff_h264_luma_dc_dequant_idct_8_lsx
Replaced functions (LSX is sufficient for these):
  ff_h264_idct_add_lasx
  ff_h264_idct4x4_addblk_dc_lasx
  ff_h264_idct_add16_lasx
  ff_h264_idct8_add4_lasx
  ff_h264_idct_add8_lasx
  ff_h264_idct_add8_422_lasx
  ff_h264_idct_add16_intra_lasx
  ff_h264_deq_idct_luma_dc_lasx
Renamed functions:
  ff_h264_idct8_addblk_lasx ==> ff_h264_idct8_add_8_lasx
  ff_h264_idct8_dc_addblk_lasx ==> ff_h264_idct8_dc_add_8_lasx

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 155fps
after:  161fps
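
For reference, these routines implement the standard H.264 4x4 inverse
transform. A minimal C sketch of one 1-D pass (the same butterfly that
ff_h264_idct_add_8_lsx vectorizes with vadd.h/vsub.h/vsrai.h and
LSX_BUTTERFLY_4_H) is shown below; it is illustrative only and not part
of the patch:

    /* One 1-D pass of the H.264 4x4 inverse transform (illustrative,
     * not part of the patch). b[0..3] is one row or column. */
    static void idct4_1d_sketch(int16_t b[4])
    {
        int z0 = b[0] + b[2];         /* vadd.h  vr4, vr0, vr2 */
        int z1 = b[0] - b[2];         /* vsub.h  vr5, vr0, vr2 */
        int z2 = (b[1] >> 1) - b[3];  /* vsrai.h + vsub.h      */
        int z3 = b[1] + (b[3] >> 1);  /* vsrai.h + vadd.h      */
        b[0] = z0 + z3;               /* LSX_BUTTERFLY_4_H     */
        b[1] = z1 + z2;               /* combines these four   */
        b[2] = z1 - z2;               /* results               */
        b[3] = z0 - z3;
    }

The full function runs this pass over the rows, transposes, runs it over
the columns, then adds the rounded (x + 32) >> 6 result to the destination
pixels with unsigned saturation (the vsrari.h/vssrarni.bu.h sequence).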
---
 libavcodec/loongarch/Makefile                 |   3 +-
 libavcodec/loongarch/h264_deblock_lasx.c      |   2 +-
 libavcodec/loongarch/h264dsp_init_loongarch.c |  39 +-
 libavcodec/loongarch/h264dsp_lasx.c           |   2 +-
 .../{h264dsp_lasx.h => h264dsp_loongarch.h}   |  60 +-
 libavcodec/loongarch/h264idct.S               | 658 ++++++++++++
 libavcodec/loongarch/h264idct_lasx.c          | 498 ---------
 libavcodec/loongarch/h264idct_loongarch.c     | 184 ++++
 libavcodec/loongarch/loongson_asm.S           | 945 ++++++++++++++++++
 9 files changed, 1848 insertions(+), 543 deletions(-)
 rename libavcodec/loongarch/{h264dsp_lasx.h => h264dsp_loongarch.h} (68%)
 create mode 100644 libavcodec/loongarch/h264idct.S
 delete mode 100644 libavcodec/loongarch/h264idct_lasx.c
 create mode 100644 libavcodec/loongarch/h264idct_loongarch.c
 create mode 100644 libavcodec/loongarch/loongson_asm.S

Patch

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index c1b5de5c44..34ebbbe133 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -12,7 +12,6 @@  OBJS-$(CONFIG_HEVC_DECODER)           += loongarch/hevcdsp_init_loongarch.o
 LASX-OBJS-$(CONFIG_H264CHROMA)        += loongarch/h264chroma_lasx.o
 LASX-OBJS-$(CONFIG_H264QPEL)          += loongarch/h264qpel_lasx.o
 LASX-OBJS-$(CONFIG_H264DSP)           += loongarch/h264dsp_lasx.o \
-                                         loongarch/h264idct_lasx.o \
                                          loongarch/h264_deblock_lasx.o
 LASX-OBJS-$(CONFIG_H264PRED)          += loongarch/h264_intrapred_lasx.o
 LASX-OBJS-$(CONFIG_VC1_DECODER)       += loongarch/vc1dsp_lasx.o
@@ -31,3 +30,5 @@  LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_mc_bi_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
                                          loongarch/hevc_mc_uniw_lsx.o
+LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
+                                         loongarch/h264idct_loongarch.o
diff --git a/libavcodec/loongarch/h264_deblock_lasx.c b/libavcodec/loongarch/h264_deblock_lasx.c
index c89bea9a84..eead931dcf 100644
--- a/libavcodec/loongarch/h264_deblock_lasx.c
+++ b/libavcodec/loongarch/h264_deblock_lasx.c
@@ -20,7 +20,7 @@ 
  */
 
 #include "libavcodec/bit_depth_template.c"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
 #include "libavutil/loongarch/loongson_intrinsics.h"
 
 #define H264_LOOP_FILTER_STRENGTH_ITERATION_LASX(edges, step, mask_mv, dir, \
diff --git a/libavcodec/loongarch/h264dsp_init_loongarch.c b/libavcodec/loongarch/h264dsp_init_loongarch.c
index 37633c3e51..cb07deb398 100644
--- a/libavcodec/loongarch/h264dsp_init_loongarch.c
+++ b/libavcodec/loongarch/h264dsp_init_loongarch.c
@@ -21,13 +21,32 @@ 
  */
 
 #include "libavutil/loongarch/cpu.h"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
 
 av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
                                        const int chroma_format_idc)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_lsx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->h264_idct_add     = ff_h264_idct_add_8_lsx;
+            c->h264_idct8_add    = ff_h264_idct8_add_8_lsx;
+            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_lsx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;
+
+            if (chroma_format_idc <= 1)
+                c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
+            else
+                c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;
+
+            c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
+            c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
+            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
+            c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
+        }
+    }
+#if HAVE_LASX
     if (have_lasx(cpu_flags)) {
         if (chroma_format_idc <= 1)
             c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lasx;
@@ -56,20 +75,10 @@  av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
             c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
             c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lasx;
 
-            c->h264_idct_add = ff_h264_idct_add_lasx;
-            c->h264_idct8_add = ff_h264_idct8_addblk_lasx;
-            c->h264_idct_dc_add = ff_h264_idct4x4_addblk_dc_lasx;
-            c->h264_idct8_dc_add = ff_h264_idct8_dc_addblk_lasx;
-            c->h264_idct_add16 = ff_h264_idct_add16_lasx;
-            c->h264_idct8_add4 = ff_h264_idct8_add4_lasx;
-
-            if (chroma_format_idc <= 1)
-                c->h264_idct_add8 = ff_h264_idct_add8_lasx;
-            else
-                c->h264_idct_add8 = ff_h264_idct_add8_422_lasx;
-
-            c->h264_idct_add16intra = ff_h264_idct_add16_intra_lasx;
-            c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_lasx;
+            c->h264_idct8_add    = ff_h264_idct8_add_8_lasx;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
+            c->h264_idct8_add4   = ff_h264_idct8_add4_8_lasx;
         }
     }
+#endif // #if HAVE_LASX
 }
diff --git a/libavcodec/loongarch/h264dsp_lasx.c b/libavcodec/loongarch/h264dsp_lasx.c
index 7fd4cedf7e..7b2b8ff0f0 100644
--- a/libavcodec/loongarch/h264dsp_lasx.c
+++ b/libavcodec/loongarch/h264dsp_lasx.c
@@ -23,7 +23,7 @@ 
  */
 
 #include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264dsp_lasx.h"
+#include "h264dsp_loongarch.h"
 
 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,   \
                          p1_or_q1_org_in, p2_or_q2_org_in,   \
diff --git a/libavcodec/loongarch/h264dsp_lasx.h b/libavcodec/loongarch/h264dsp_loongarch.h
similarity index 68%
rename from libavcodec/loongarch/h264dsp_lasx.h
rename to libavcodec/loongarch/h264dsp_loongarch.h
index 4cf813750b..28dca2b537 100644
--- a/libavcodec/loongarch/h264dsp_lasx.h
+++ b/libavcodec/loongarch/h264dsp_loongarch.h
@@ -1,5 +1,5 @@ 
 /*
- * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
  *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
  *
@@ -20,11 +20,34 @@ 
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
-#define AVCODEC_LOONGARCH_H264DSP_LASX_H
+#ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
 
 #include "libavcodec/h264dec.h"
+#include "config.h"
 
+void ff_h264_idct_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_idct8_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
+void ff_h264_luma_dc_dequant_idct_8_lsx(int16_t *_output, int16_t *_input, int qmul);
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8]);
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                             int16_t *block, int32_t dst_stride,
+                             const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                                 int16_t *block, int32_t dst_stride,
+                                 const uint8_t nzc[15 * 8]);
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                                    int16_t *block, int32_t dst_stride,
+                                    const uint8_t nzc[15 * 8]);
+
+#if HAVE_LASX
 void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
                                int alpha, int beta, int8_t *tc0);
 void ff_h264_v_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
@@ -65,33 +88,16 @@  void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
 void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
 
 void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
-void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
-void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
-void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
-                                    int32_t dst_stride);
-void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
+void ff_h264_idct8_add_8_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
+void ff_h264_idct8_dc_add_8_lasx(uint8_t *dst, int16_t *src,
                                   int32_t dst_stride);
-void ff_h264_idct_add16_lasx(uint8_t *dst, const int32_t *blk_offset,
-                             int16_t *block, int32_t dst_stride,
-                             const uint8_t nzc[15 * 8]);
-void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
-                             int16_t *block, int32_t dst_stride,
-                             const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add8_lasx(uint8_t **dst, const int32_t *blk_offset,
-                            int16_t *block, int32_t dst_stride,
-                            const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add8_422_lasx(uint8_t **dst, const int32_t *blk_offset,
-                                int16_t *block, int32_t dst_stride,
-                                const uint8_t nzc[15 * 8]);
-void ff_h264_idct_add16_intra_lasx(uint8_t *dst, const int32_t *blk_offset,
-                                   int16_t *block, int32_t dst_stride,
-                                   const uint8_t nzc[15 * 8]);
-void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
-                                   int32_t de_qval);
-
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8]);
 void ff_h264_loop_filter_strength_lasx(int16_t bS[2][4][4], uint8_t nnz[40],
                                        int8_t ref[2][40], int16_t mv[2][40][2],
                                        int bidir, int edges, int step,
                                        int mask_mv0, int mask_mv1, int field);
+#endif // #if HAVE_LASX
 
-#endif  // #ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
+#endif  // #ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264idct.S b/libavcodec/loongarch/h264idct.S
new file mode 100644
index 0000000000..f504cfb714
--- /dev/null
+++ b/libavcodec/loongarch/h264idct.S
@@ -0,0 +1,658 @@ 
+/*
+ * Loongson LSX/LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_idct_add_8_lsx
+    fld.d         f0,     a1,    0
+    fld.d         f1,     a1,    8
+    fld.d         f2,     a1,    16
+    fld.d         f3,     a1,    24
+    vxor.v        vr7,    vr7,   vr7
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    vst           vr7,    a1,    0
+    vst           vr7,    a1,    16
+
+    vadd.h        vr4,    vr0,   vr2
+    vsub.h        vr5,    vr0,   vr2
+    vsrai.h       vr6,    vr1,   1
+    vsrai.h       vr7,    vr3,   1
+    vsub.h        vr6,    vr6,   vr3
+    vadd.h        vr7,    vr1,   vr7
+    LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7,  vr0, vr1, vr2, vr3
+    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3,  vr0, vr1, vr2, vr3,  vr4, vr5
+    vadd.h        vr4,    vr0,   vr2
+    vsub.h        vr5,    vr0,   vr2
+    vsrai.h       vr6,    vr1,   1
+    vsrai.h       vr7,    vr3,   1
+    vsub.h        vr6,    vr6,   vr3
+    vadd.h        vr7,    vr1,   vr7
+    LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7,  vr0, vr1, vr2, vr3
+
+    fld.s         f4,     a0,    0
+    fldx.s        f5,     a0,    a2
+    fldx.s        f6,     a0,    t2
+    fldx.s        f7,     a0,    t3
+
+    vsrari.h      vr0,    vr0,   6
+    vsrari.h      vr1,    vr1,   6
+    vsrari.h      vr2,    vr2,   6
+    vsrari.h      vr3,    vr3,   6
+
+    vsllwil.hu.bu vr4,    vr4,   0
+    vsllwil.hu.bu vr5,    vr5,   0
+    vsllwil.hu.bu vr6,    vr6,   0
+    vsllwil.hu.bu vr7,    vr7,   0
+    vadd.h        vr0,    vr0,   vr4
+    vadd.h        vr1,    vr1,   vr5
+    vadd.h        vr2,    vr2,   vr6
+    vadd.h        vr3,    vr3,   vr7
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    fst.s         f1,     a0,    0
+    fstx.s        f0,     a0,    a2
+    fstx.s        f3,     a0,    t2
+    fstx.s        f2,     a0,    t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lsx
+    ld.h          t0,     a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+    addi.w        t0,     t0,    32
+    st.h          t0,     a1,    0
+
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    16
+    vld           vr2,    a1,    32
+    vld           vr3,    a1,    48
+    vld           vr4,    a1,    64
+    vld           vr5,    a1,    80
+    vld           vr6,    a1,    96
+    vld           vr7,    a1,    112
+    vxor.v        vr8,    vr8,   vr8
+    vst           vr8,    a1,    0
+    vst           vr8,    a1,    16
+    vst           vr8,    a1,    32
+    vst           vr8,    a1,    48
+    vst           vr8,    a1,    64
+    vst           vr8,    a1,    80
+    vst           vr8,    a1,    96
+    vst           vr8,    a1,    112
+
+    vadd.h        vr18,   vr0,   vr4
+    vsub.h        vr19,   vr0,   vr4
+    vsrai.h       vr20,   vr2,   1
+    vsrai.h       vr21,   vr6,   1
+    vsub.h        vr20,   vr20,  vr6
+    vadd.h        vr21,   vr21,  vr2
+    LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21,  vr10, vr12, vr14, vr16
+    vsrai.h       vr11,   vr7,   1
+    vsrai.h       vr13,   vr3,   1
+    vsrai.h       vr15,   vr5,   1
+    vsrai.h       vr17,   vr1,   1
+    vsub.h        vr11,   vr5,   vr11
+    vsub.h        vr13,   vr7,   vr13
+    vadd.h        vr15,   vr7,   vr15
+    vadd.h        vr17,   vr5,   vr17
+    vsub.h        vr11,   vr11,  vr7
+    vsub.h        vr13,   vr13,  vr3
+    vadd.h        vr15,   vr15,  vr5
+    vadd.h        vr17,   vr17,  vr1
+    vsub.h        vr11,   vr11,  vr3
+    vadd.h        vr13,   vr13,  vr1
+    vsub.h        vr15,   vr15,  vr1
+    vadd.h        vr17,   vr17,  vr3
+    vsrai.h       vr18,   vr11,  2
+    vsrai.h       vr19,   vr13,  2
+    vsrai.h       vr20,   vr15,  2
+    vsrai.h       vr21,   vr17,  2
+    vadd.h        vr11,   vr11,  vr21
+    vadd.h        vr13,   vr13,  vr20
+    vsub.h        vr15,   vr19,  vr15
+    vsub.h        vr17,   vr17,  vr18
+    LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+                      vr0,  vr3,  vr1,  vr2,  vr5,  vr6,  vr4,  vr7
+
+    LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+    vexth.w.h     vr20,   vr0
+    vexth.w.h     vr21,   vr1
+    vexth.w.h     vr22,   vr2
+    vexth.w.h     vr23,   vr3
+    vexth.w.h     vr8,    vr4
+    vexth.w.h     vr9,    vr5
+    vexth.w.h     vr18,   vr6
+    vexth.w.h     vr19,   vr7
+    vsllwil.w.h   vr0,    vr0,   0
+    vsllwil.w.h   vr1,    vr1,   0
+    vsllwil.w.h   vr2,    vr2,   0
+    vsllwil.w.h   vr3,    vr3,   0
+    vsllwil.w.h   vr4,    vr4,   0
+    vsllwil.w.h   vr5,    vr5,   0
+    vsllwil.w.h   vr6,    vr6,   0
+    vsllwil.w.h   vr7,    vr7,   0
+
+    vadd.w        vr11,   vr0,   vr4
+    vsub.w        vr13,   vr0,   vr4
+    vsrai.w       vr15,   vr2,   1
+    vsrai.w       vr17,   vr6,   1
+    vsub.w        vr15,   vr15,  vr6
+    vadd.w        vr17,   vr17,  vr2
+    LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17,  vr10, vr12, vr14, vr16
+    vsrai.w       vr11,   vr7,   1
+    vsrai.w       vr13,   vr3,   1
+    vsrai.w       vr15,   vr5,   1
+    vsrai.w       vr17,   vr1,   1
+    vsub.w        vr11,   vr5,   vr11
+    vsub.w        vr13,   vr7,   vr13
+    vadd.w        vr15,   vr7,   vr15
+    vadd.w        vr17,   vr5,   vr17
+    vsub.w        vr11,   vr11,  vr7
+    vsub.w        vr13,   vr13,  vr3
+    vadd.w        vr15,   vr15,  vr5
+    vadd.w        vr17,   vr17,  vr1
+    vsub.w        vr11,   vr11,  vr3
+    vadd.w        vr13,   vr13,  vr1
+    vsub.w        vr15,   vr15,  vr1
+    vadd.w        vr17,   vr17,  vr3
+    vsrai.w       vr0,    vr11,  2
+    vsrai.w       vr1,    vr13,  2
+    vsrai.w       vr2,    vr15,  2
+    vsrai.w       vr3,    vr17,  2
+    vadd.w        vr11,   vr11,  vr3
+    vadd.w        vr13,   vr13,  vr2
+    vsub.w        vr15,   vr1,   vr15
+    vsub.w        vr17,   vr17,  vr0
+    LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+                      vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7
+
+    vadd.w        vr11,    vr20,  vr8
+    vsub.w        vr13,    vr20,  vr8
+    vsrai.w       vr15,    vr22,  1
+    vsrai.w       vr17,    vr18,  1
+    vsub.w        vr15,    vr15,  vr18
+    vadd.w        vr17,    vr17,  vr22
+    LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17,  vr10, vr12, vr14, vr16
+    vsrai.w       vr11,   vr19,  1
+    vsrai.w       vr13,   vr23,  1
+    vsrai.w       vr15,   vr9,   1
+    vsrai.w       vr17,   vr21,  1
+    vsub.w        vr11,   vr9,   vr11
+    vsub.w        vr13,   vr19,  vr13
+    vadd.w        vr15,   vr19,  vr15
+    vadd.w        vr17,   vr9,   vr17
+    vsub.w        vr11,   vr11,  vr19
+    vsub.w        vr13,   vr13,  vr23
+    vadd.w        vr15,   vr15,  vr9
+    vadd.w        vr17,   vr17,  vr21
+    vsub.w        vr11,   vr11,  vr23
+    vadd.w        vr13,   vr13,  vr21
+    vsub.w        vr15,   vr15,  vr21
+    vadd.w        vr17,   vr17,  vr23
+    vsrai.w       vr20,   vr11,  2
+    vsrai.w       vr21,   vr13,  2
+    vsrai.w       vr22,   vr15,  2
+    vsrai.w       vr23,   vr17,  2
+    vadd.w        vr11,   vr11,  vr23
+    vadd.w        vr13,   vr13,  vr22
+    vsub.w        vr15,   vr21,  vr15
+    vsub.w        vr17,   vr17,  vr20
+    LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
+                      vr20, vr21, vr22, vr23, vr8, vr9, vr18, vr19
+
+    vld           vr10,   a0,    0
+    vldx          vr11,   a0,    a2
+    vldx          vr12,   a0,    t2
+    vldx          vr13,   a0,    t3
+    vldx          vr14,   a0,    t4
+    vldx          vr15,   a0,    t5
+    vldx          vr16,   a0,    t6
+    vldx          vr17,   a0,    t7
+    vsrani.h.w    vr20,   vr0,   6
+    vsrani.h.w    vr21,   vr1,   6
+    vsrani.h.w    vr22,   vr2,   6
+    vsrani.h.w    vr23,   vr3,   6
+    vsrani.h.w    vr8,    vr4,   6
+    vsrani.h.w    vr9,    vr5,   6
+    vsrani.h.w    vr18,   vr6,   6
+    vsrani.h.w    vr19,   vr7,   6
+    vsllwil.hu.bu vr10,   vr10,  0
+    vsllwil.hu.bu vr11,   vr11,  0
+    vsllwil.hu.bu vr12,   vr12,  0
+    vsllwil.hu.bu vr13,   vr13,  0
+    vsllwil.hu.bu vr14,   vr14,  0
+    vsllwil.hu.bu vr15,   vr15,  0
+    vsllwil.hu.bu vr16,   vr16,  0
+    vsllwil.hu.bu vr17,   vr17,  0
+
+    vadd.h        vr0,    vr20,  vr10
+    vadd.h        vr1,    vr21,  vr11
+    vadd.h        vr2,    vr22,  vr12
+    vadd.h        vr3,    vr23,  vr13
+    vadd.h        vr4,    vr8,   vr14
+    vadd.h        vr5,    vr9,   vr15
+    vadd.h        vr6,    vr18,  vr16
+    vadd.h        vr7,    vr19,  vr17
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_add_8_lasx
+    ld.h          t0,     a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+    addi.w        t0,     t0,    32
+    st.h          t0,     a1,    0
+
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    16
+    vld           vr2,    a1,    32
+    vld           vr3,    a1,    48
+    vld           vr4,    a1,    64
+    vld           vr5,    a1,    80
+    vld           vr6,    a1,    96
+    vld           vr7,    a1,    112
+    xvxor.v       xr8,    xr8,   xr8
+    xvst          xr8,    a1,    0
+    xvst          xr8,    a1,    32
+    xvst          xr8,    a1,    64
+    xvst          xr8,    a1,    96
+
+    vadd.h        vr18,   vr0,   vr4
+    vsub.h        vr19,   vr0,   vr4
+    vsrai.h       vr20,   vr2,   1
+    vsrai.h       vr21,   vr6,   1
+    vsub.h        vr20,   vr20,  vr6
+    vadd.h        vr21,   vr21,  vr2
+    LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21,  vr10, vr12, vr14, vr16
+    vsrai.h       vr11,   vr7,   1
+    vsrai.h       vr13,   vr3,   1
+    vsrai.h       vr15,   vr5,   1
+    vsrai.h       vr17,   vr1,   1
+    vsub.h        vr11,   vr5,   vr11
+    vsub.h        vr13,   vr7,   vr13
+    vadd.h        vr15,   vr7,   vr15
+    vadd.h        vr17,   vr5,   vr17
+    vsub.h        vr11,   vr11,  vr7
+    vsub.h        vr13,   vr13,  vr3
+    vadd.h        vr15,   vr15,  vr5
+    vadd.h        vr17,   vr17,  vr1
+    vsub.h        vr11,   vr11,  vr3
+    vadd.h        vr13,   vr13,  vr1
+    vsub.h        vr15,   vr15,  vr1
+    vadd.h        vr17,   vr17,  vr3
+    vsrai.h       vr18,   vr11,  2
+    vsrai.h       vr19,   vr13,  2
+    vsrai.h       vr20,   vr15,  2
+    vsrai.h       vr21,   vr17,  2
+    vadd.h        vr11,   vr11,  vr21
+    vadd.h        vr13,   vr13,  vr20
+    vsub.h        vr15,   vr19,  vr15
+    vsub.h        vr17,   vr17,  vr18
+    LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
+                      vr0,  vr3,  vr1,  vr2,  vr5,  vr6,  vr4,  vr7
+
+    LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7, \
+                       vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
+    vext2xv.w.h   xr0,    xr0
+    vext2xv.w.h   xr1,    xr1
+    vext2xv.w.h   xr2,    xr2
+    vext2xv.w.h   xr3,    xr3
+    vext2xv.w.h   xr4,    xr4
+    vext2xv.w.h   xr5,    xr5
+    vext2xv.w.h   xr6,    xr6
+    vext2xv.w.h   xr7,    xr7
+
+    xvadd.w       xr11,   xr0,   xr4
+    xvsub.w       xr13,   xr0,   xr4
+    xvsrai.w      xr15,   xr2,   1
+    xvsrai.w      xr17,   xr6,   1
+    xvsub.w       xr15,   xr15,  xr6
+    xvadd.w       xr17,   xr17,  xr2
+    LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17,  xr10, xr12, xr14, xr16
+    xvsrai.w      xr11,   xr7,   1
+    xvsrai.w      xr13,   xr3,   1
+    xvsrai.w      xr15,   xr5,   1
+    xvsrai.w      xr17,   xr1,   1
+    xvsub.w       xr11,   xr5,   xr11
+    xvsub.w       xr13,   xr7,   xr13
+    xvadd.w       xr15,   xr7,   xr15
+    xvadd.w       xr17,   xr5,   xr17
+    xvsub.w       xr11,   xr11,  xr7
+    xvsub.w       xr13,   xr13,  xr3
+    xvadd.w       xr15,   xr15,  xr5
+    xvadd.w       xr17,   xr17,  xr1
+    xvsub.w       xr11,   xr11,  xr3
+    xvadd.w       xr13,   xr13,  xr1
+    xvsub.w       xr15,   xr15,  xr1
+    xvadd.w       xr17,   xr17,  xr3
+    xvsrai.w      xr0,    xr11,  2
+    xvsrai.w      xr1,    xr13,  2
+    xvsrai.w      xr2,    xr15,  2
+    xvsrai.w      xr3,    xr17,  2
+    xvadd.w       xr11,   xr11,  xr3
+    xvadd.w       xr13,   xr13,  xr2
+    xvsub.w       xr15,   xr1,   xr15
+    xvsub.w       xr17,   xr17,  xr0
+    LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \
+                       xr0,  xr1,  xr2,  xr3,  xr4,  xr5,  xr6,  xr7
+
+    vld           vr10,   a0,    0
+    vldx          vr11,   a0,    a2
+    vldx          vr12,   a0,    t2
+    vldx          vr13,   a0,    t3
+    vldx          vr14,   a0,    t4
+    vldx          vr15,   a0,    t5
+    vldx          vr16,   a0,    t6
+    vldx          vr17,   a0,    t7
+    xvldi         xr8,    0x806     //"xvldi.w xr8 6"
+    xvsran.h.w    xr0,    xr0,   xr8
+    xvsran.h.w    xr1,    xr1,   xr8
+    xvsran.h.w    xr2,    xr2,   xr8
+    xvsran.h.w    xr3,    xr3,   xr8
+    xvsran.h.w    xr4,    xr4,   xr8
+    xvsran.h.w    xr5,    xr5,   xr8
+    xvsran.h.w    xr6,    xr6,   xr8
+    xvsran.h.w    xr7,    xr7,   xr8
+    xvpermi.d     xr0,    xr0,   0x08
+    xvpermi.d     xr1,    xr1,   0x08
+    xvpermi.d     xr2,    xr2,   0x08
+    xvpermi.d     xr3,    xr3,   0x08
+    xvpermi.d     xr4,    xr4,   0x08
+    xvpermi.d     xr5,    xr5,   0x08
+    xvpermi.d     xr6,    xr6,   0x08
+    xvpermi.d     xr7,    xr7,   0x08
+
+    vsllwil.hu.bu vr10,   vr10,  0
+    vsllwil.hu.bu vr11,   vr11,  0
+    vsllwil.hu.bu vr12,   vr12,  0
+    vsllwil.hu.bu vr13,   vr13,  0
+    vsllwil.hu.bu vr14,   vr14,  0
+    vsllwil.hu.bu vr15,   vr15,  0
+    vsllwil.hu.bu vr16,   vr16,  0
+    vsllwil.hu.bu vr17,   vr17,  0
+
+    vadd.h        vr0,    vr0,   vr10
+    vadd.h        vr1,    vr1,   vr11
+    vadd.h        vr2,    vr2,   vr12
+    vadd.h        vr3,    vr3,   vr13
+    vadd.h        vr4,    vr4,   vr14
+    vadd.h        vr5,    vr5,   vr15
+    vadd.h        vr6,    vr6,   vr16
+    vadd.h        vr7,    vr7,   vr17
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_idct_dc_add_8_lsx
+    vldrepl.h     vr4,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    fld.s         f0,     a0,    0
+    fldx.s        f1,     a0,    a2
+    fldx.s        f2,     a0,    t2
+    fldx.s        f3,     a0,    t3
+    st.h          zero,   a1,    0
+
+    vsrari.h      vr4,    vr4,   6
+    vilvl.w       vr0,    vr1,   vr0
+    vilvl.w       vr1,    vr3,   vr2
+    vsllwil.hu.bu vr0,    vr0,   0
+    vsllwil.hu.bu vr1,    vr1,   0
+    vadd.h        vr0,    vr0,   vr4
+    vadd.h        vr1,    vr1,   vr4
+    vssrarni.bu.h vr1,    vr0,   0
+
+    vbsrl.v       vr2,    vr1,   4
+    vbsrl.v       vr3,    vr1,   8
+    vbsrl.v       vr4,    vr1,   12
+    fst.s         f1,     a0,    0
+    fstx.s        f2,     a0,    a2
+    fstx.s        f3,     a0,    t2
+    fstx.s        f4,     a0,    t3
+endfunc
+
+/*
+ * #define FUNC2(a, b, c)  FUNC3(a, b, c)
+ * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+ * void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
+ */
+function ff_h264_idct8_dc_add_8_lsx
+    vldrepl.h     vr8,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+
+    fld.d         f0,     a0,    0
+    fldx.d        f1,     a0,    a2
+    fldx.d        f2,     a0,    t2
+    fldx.d        f3,     a0,    t3
+    fldx.d        f4,     a0,    t4
+    fldx.d        f5,     a0,    t5
+    fldx.d        f6,     a0,    t6
+    fldx.d        f7,     a0,    t7
+    st.h          zero,   a1,    0
+
+    vsrari.h      vr8,    vr8,   6
+    vsllwil.hu.bu vr0,    vr0,   0
+    vsllwil.hu.bu vr1,    vr1,   0
+    vsllwil.hu.bu vr2,    vr2,   0
+    vsllwil.hu.bu vr3,    vr3,   0
+    vsllwil.hu.bu vr4,    vr4,   0
+    vsllwil.hu.bu vr5,    vr5,   0
+    vsllwil.hu.bu vr6,    vr6,   0
+    vsllwil.hu.bu vr7,    vr7,   0
+    vadd.h        vr0,    vr0,   vr8
+    vadd.h        vr1,    vr1,   vr8
+    vadd.h        vr2,    vr2,   vr8
+    vadd.h        vr3,    vr3,   vr8
+    vadd.h        vr4,    vr4,   vr8
+    vadd.h        vr5,    vr5,   vr8
+    vadd.h        vr6,    vr6,   vr8
+    vadd.h        vr7,    vr7,   vr8
+    vssrarni.bu.h vr1,    vr0,   0
+    vssrarni.bu.h vr3,    vr2,   0
+    vssrarni.bu.h vr5,    vr4,   0
+    vssrarni.bu.h vr7,    vr6,   0
+
+    vbsrl.v       vr0,    vr1,   8
+    vbsrl.v       vr2,    vr3,   8
+    vbsrl.v       vr4,    vr5,   8
+    vbsrl.v       vr6,    vr7,   8
+    fst.d         f1,     a0,    0
+    fstx.d        f0,     a0,    a2
+    fstx.d        f3,     a0,    t2
+    fstx.d        f2,     a0,    t3
+    fstx.d        f5,     a0,    t4
+    fstx.d        f4,     a0,    t5
+    fstx.d        f7,     a0,    t6
+    fstx.d        f6,     a0,    t7
+endfunc
+function ff_h264_idct8_dc_add_8_lasx
+    xvldrepl.h    xr8,    a1,    0
+    add.d         t2,     a2,    a2
+    add.d         t3,     t2,    a2
+    add.d         t4,     t3,    a2
+    add.d         t5,     t4,    a2
+    add.d         t6,     t5,    a2
+    add.d         t7,     t6,    a2
+
+    fld.d         f0,     a0,    0
+    fldx.d        f1,     a0,    a2
+    fldx.d        f2,     a0,    t2
+    fldx.d        f3,     a0,    t3
+    fldx.d        f4,     a0,    t4
+    fldx.d        f5,     a0,    t5
+    fldx.d        f6,     a0,    t6
+    fldx.d        f7,     a0,    t7
+    st.h          zero,   a1,    0
+
+    xvsrari.h     xr8,    xr8,   6
+    xvpermi.q     xr1,    xr0,   0x20
+    xvpermi.q     xr3,    xr2,   0x20
+    xvpermi.q     xr5,    xr4,   0x20
+    xvpermi.q     xr7,    xr6,   0x20
+    xvsllwil.hu.bu xr1,   xr1,   0
+    xvsllwil.hu.bu xr3,   xr3,   0
+    xvsllwil.hu.bu xr5,   xr5,   0
+    xvsllwil.hu.bu xr7,   xr7,   0
+    xvadd.h       xr1,    xr1,   xr8
+    xvadd.h       xr3,    xr3,   xr8
+    xvadd.h       xr5,    xr5,   xr8
+    xvadd.h       xr7,    xr7,   xr8
+
+    xvssrarni.bu.h xr3,   xr1,   0
+    xvssrarni.bu.h xr7,   xr5,   0
+
+    xvpermi.q     xr1,    xr3,   0x11
+    xvpermi.q     xr5,    xr7,   0x11
+    xvbsrl.v      xr0,    xr1,   8
+    xvbsrl.v      xr2,    xr3,   8
+    xvbsrl.v      xr4,    xr5,   8
+    xvbsrl.v      xr6,    xr7,   8
+
+    fst.d         f3,     a0,    0
+    fstx.d        f1,     a0,    a2
+    fstx.d        f2,     a0,    t2
+    fstx.d        f0,     a0,    t3
+    fstx.d        f7,     a0,    t4
+    fstx.d        f5,     a0,    t5
+    fstx.d        f6,     a0,    t6
+    fstx.d        f4,     a0,    t7
+endfunc
+
+/**
+ * IDCT transforms the 16 dc values and dequantizes them.
+ * @param qmul quantization parameter
+ * void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul){
+ * LSX optimization is enough for this function.
+ */
+function ff_h264_luma_dc_dequant_idct_8_lsx
+    vld           vr0,    a1,    0
+    vld           vr1,    a1,    8
+    vld           vr2,    a1,    16
+    vld           vr3,    a1,    24
+    vreplgr2vr.w  vr8,    a2
+    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10
+    LSX_BUTTERFLY_4_H  vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1
+    LSX_BUTTERFLY_4_H  vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5
+    LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10
+    LSX_BUTTERFLY_4_H  vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5
+    LSX_BUTTERFLY_4_H  vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
+    vsllwil.w.h   vr0,    vr0,   0
+    vsllwil.w.h   vr1,    vr1,   0
+    vsllwil.w.h   vr2,    vr2,   0
+    vsllwil.w.h   vr3,    vr3,   0
+    vmul.w        vr0,    vr0,   vr8
+    vmul.w        vr1,    vr1,   vr8
+    vmul.w        vr2,    vr2,   vr8
+    vmul.w        vr3,    vr3,   vr8
+    vsrarni.h.w   vr1,    vr0,   8
+    vsrarni.h.w   vr3,    vr2,   8
+
+    vstelm.h      vr1,    a0,    0,   0
+    vstelm.h      vr1,    a0,    32,  4
+    vstelm.h      vr1,    a0,    64,  1
+    vstelm.h      vr1,    a0,    96,  5
+    vstelm.h      vr3,    a0,    128, 0
+    vstelm.h      vr3,    a0,    160, 4
+    vstelm.h      vr3,    a0,    192, 1
+    vstelm.h      vr3,    a0,    224, 5
+    addi.d        a0,     a0,    256
+    vstelm.h      vr1,    a0,    0,   2
+    vstelm.h      vr1,    a0,    32,  6
+    vstelm.h      vr1,    a0,    64,  3
+    vstelm.h      vr1,    a0,    96,  7
+    vstelm.h      vr3,    a0,    128, 2
+    vstelm.h      vr3,    a0,    160, 6
+    vstelm.h      vr3,    a0,    192, 3
+    vstelm.h      vr3,    a0,    224, 7
+endfunc
diff --git a/libavcodec/loongarch/h264idct_lasx.c b/libavcodec/loongarch/h264idct_lasx.c
deleted file mode 100644
index 46bd3b74d5..0000000000
--- a/libavcodec/loongarch/h264idct_lasx.c
+++ /dev/null
@@ -1,498 +0,0 @@ 
-/*
- * Loongson LASX optimized h264dsp
- *
- * Copyright (c) 2021 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/loongarch/loongson_intrinsics.h"
-#include "h264dsp_lasx.h"
-#include "libavcodec/bit_depth_template.c"
-
-#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)     \
-{                                                                    \
-   __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
-                                                                     \
-    tmp0_m = __lasx_xvadd_h(in0, in2);                               \
-    tmp1_m = __lasx_xvsub_h(in0, in2);                               \
-    tmp2_m = __lasx_xvsrai_h(in1, 1);                                \
-    tmp2_m = __lasx_xvsub_h(tmp2_m, in3);                            \
-    tmp3_m = __lasx_xvsrai_h(in3, 1);                                \
-    tmp3_m = __lasx_xvadd_h(in1, tmp3_m);                            \
-                                                                     \
-    LASX_BUTTERFLY_4_H(tmp0_m, tmp1_m, tmp2_m, tmp3_m,               \
-                       out0, out1, out2, out3);                      \
-}
-
-void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride)
-{
-    __m256i src0_m, src1_m, src2_m, src3_m;
-    __m256i dst0_m, dst1_m;
-    __m256i hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
-    __m256i inp0_m, inp1_m, res0_m, src1, src3;
-    __m256i src0 = __lasx_xvld(src, 0);
-    __m256i src2 = __lasx_xvld(src, 16);
-    __m256i zero = __lasx_xvldi(0);
-    int32_t dst_stride_2x = dst_stride << 1;
-    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-
-    __lasx_xvst(zero, src, 0);
-    DUP2_ARG2(__lasx_xvilvh_d, src0, src0, src2, src2, src1, src3);
-    AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
-    LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
-    AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, src0_m, src1_m, src2_m, src3_m);
-    DUP4_ARG2(__lasx_xvld, dst, 0, dst + dst_stride, 0, dst + dst_stride_2x,
-              0, dst + dst_stride_3x, 0, src0_m, src1_m, src2_m, src3_m);
-    DUP2_ARG2(__lasx_xvilvl_d, vres1, vres0, vres3, vres2, inp0_m, inp1_m);
-    inp0_m = __lasx_xvpermi_q(inp1_m, inp0_m, 0x20);
-    inp0_m = __lasx_xvsrari_h(inp0_m, 6);
-    DUP2_ARG2(__lasx_xvilvl_w, src1_m, src0_m, src3_m, src2_m, dst0_m, dst1_m);
-    dst0_m = __lasx_xvilvl_d(dst1_m, dst0_m);
-    res0_m = __lasx_vext2xv_hu_bu(dst0_m);
-    res0_m = __lasx_xvadd_h(res0_m, inp0_m);
-    res0_m = __lasx_xvclip255_h(res0_m);
-    dst0_m = __lasx_xvpickev_b(res0_m, res0_m);
-    __lasx_xvstelm_w(dst0_m, dst, 0, 0);
-    __lasx_xvstelm_w(dst0_m, dst + dst_stride, 0, 1);
-    __lasx_xvstelm_w(dst0_m, dst + dst_stride_2x, 0, 4);
-    __lasx_xvstelm_w(dst0_m, dst + dst_stride_3x, 0, 5);
-}
-
-void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src,
-                               int32_t dst_stride)
-{
-    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
-    __m256i vec0, vec1, vec2, vec3;
-    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-    __m256i res0, res1, res2, res3, res4, res5, res6, res7;
-    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-    __m256i zero = __lasx_xvldi(0);
-    int32_t dst_stride_2x = dst_stride << 1;
-    int32_t dst_stride_4x = dst_stride << 2;
-    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-
-    src[0] += 32;
-    DUP4_ARG2(__lasx_xvld, src, 0, src, 16, src, 32, src, 48,
-              src0, src1, src2, src3);
-    DUP4_ARG2(__lasx_xvld, src, 64, src, 80, src, 96, src, 112,
-              src4, src5, src6, src7);
-    __lasx_xvst(zero, src, 0);
-    __lasx_xvst(zero, src, 32);
-    __lasx_xvst(zero, src, 64);
-    __lasx_xvst(zero, src, 96);
-
-    vec0 = __lasx_xvadd_h(src0, src4);
-    vec1 = __lasx_xvsub_h(src0, src4);
-    vec2 = __lasx_xvsrai_h(src2, 1);
-    vec2 = __lasx_xvsub_h(vec2, src6);
-    vec3 = __lasx_xvsrai_h(src6, 1);
-    vec3 = __lasx_xvadd_h(src2, vec3);
-
-    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
-
-    vec0 = __lasx_xvsrai_h(src7, 1);
-    vec0 = __lasx_xvsub_h(src5, vec0);
-    vec0 = __lasx_xvsub_h(vec0, src3);
-    vec0 = __lasx_xvsub_h(vec0, src7);
-
-    vec1 = __lasx_xvsrai_h(src3, 1);
-    vec1 = __lasx_xvsub_h(src1, vec1);
-    vec1 = __lasx_xvadd_h(vec1, src7);
-    vec1 = __lasx_xvsub_h(vec1, src3);
-
-    vec2 = __lasx_xvsrai_h(src5, 1);
-    vec2 = __lasx_xvsub_h(vec2, src1);
-    vec2 = __lasx_xvadd_h(vec2, src7);
-    vec2 = __lasx_xvadd_h(vec2, src5);
-
-    vec3 = __lasx_xvsrai_h(src1, 1);
-    vec3 = __lasx_xvadd_h(src3, vec3);
-    vec3 = __lasx_xvadd_h(vec3, src5);
-    vec3 = __lasx_xvadd_h(vec3, src1);
-
-    tmp4 = __lasx_xvsrai_h(vec3, 2);
-    tmp4 = __lasx_xvadd_h(tmp4, vec0);
-    tmp5 = __lasx_xvsrai_h(vec2, 2);
-    tmp5 = __lasx_xvadd_h(tmp5, vec1);
-    tmp6 = __lasx_xvsrai_h(vec1, 2);
-    tmp6 = __lasx_xvsub_h(tmp6, vec2);
-    tmp7 = __lasx_xvsrai_h(vec0, 2);
-    tmp7 = __lasx_xvsub_h(vec3, tmp7);
-
-    LASX_BUTTERFLY_8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
-                       res0, res1, res2, res3, res4, res5, res6, res7);
-    LASX_TRANSPOSE8x8_H(res0, res1, res2, res3, res4, res5, res6, res7,
-                        res0, res1, res2, res3, res4, res5, res6, res7);
-
-    DUP4_ARG1(__lasx_vext2xv_w_h, res0, res1, res2, res3,
-              tmp0, tmp1, tmp2, tmp3);
-    DUP4_ARG1(__lasx_vext2xv_w_h, res4, res5, res6, res7,
-              tmp4, tmp5, tmp6, tmp7);
-    vec0 = __lasx_xvadd_w(tmp0, tmp4);
-    vec1 = __lasx_xvsub_w(tmp0, tmp4);
-
-    vec2 = __lasx_xvsrai_w(tmp2, 1);
-    vec2 = __lasx_xvsub_w(vec2, tmp6);
-    vec3 = __lasx_xvsrai_w(tmp6, 1);
-    vec3 = __lasx_xvadd_w(vec3, tmp2);
-
-    tmp0 = __lasx_xvadd_w(vec0, vec3);
-    tmp2 = __lasx_xvadd_w(vec1, vec2);
-    tmp4 = __lasx_xvsub_w(vec1, vec2);
-    tmp6 = __lasx_xvsub_w(vec0, vec3);
-
-    vec0 = __lasx_xvsrai_w(tmp7, 1);
-    vec0 = __lasx_xvsub_w(tmp5, vec0);
-    vec0 = __lasx_xvsub_w(vec0, tmp3);
-    vec0 = __lasx_xvsub_w(vec0, tmp7);
-
-    vec1 = __lasx_xvsrai_w(tmp3, 1);
-    vec1 = __lasx_xvsub_w(tmp1, vec1);
-    vec1 = __lasx_xvadd_w(vec1, tmp7);
-    vec1 = __lasx_xvsub_w(vec1, tmp3);
-
-    vec2 = __lasx_xvsrai_w(tmp5, 1);
-    vec2 = __lasx_xvsub_w(vec2, tmp1);
-    vec2 = __lasx_xvadd_w(vec2, tmp7);
-    vec2 = __lasx_xvadd_w(vec2, tmp5);
-
-    vec3 = __lasx_xvsrai_w(tmp1, 1);
-    vec3 = __lasx_xvadd_w(tmp3, vec3);
-    vec3 = __lasx_xvadd_w(vec3, tmp5);
-    vec3 = __lasx_xvadd_w(vec3, tmp1);
-
-    tmp1 = __lasx_xvsrai_w(vec3, 2);
-    tmp1 = __lasx_xvadd_w(tmp1, vec0);
-    tmp3 = __lasx_xvsrai_w(vec2, 2);
-    tmp3 = __lasx_xvadd_w(tmp3, vec1);
-    tmp5 = __lasx_xvsrai_w(vec1, 2);
-    tmp5 = __lasx_xvsub_w(tmp5, vec2);
-    tmp7 = __lasx_xvsrai_w(vec0, 2);
-    tmp7 = __lasx_xvsub_w(vec3, tmp7);
-
-    LASX_BUTTERFLY_4_W(tmp0, tmp2, tmp5, tmp7, res0, res1, res6, res7);
-    LASX_BUTTERFLY_4_W(tmp4, tmp6, tmp1, tmp3, res2, res3, res4, res5);
-
-    DUP4_ARG2(__lasx_xvsrai_w, res0, 6, res1, 6, res2, 6, res3, 6,
-              res0, res1, res2, res3);
-    DUP4_ARG2(__lasx_xvsrai_w, res4, 6, res5, 6, res6, 6, res7, 6,
-              res4, res5, res6, res7);
-    DUP4_ARG2(__lasx_xvpickev_h, res1, res0, res3, res2, res5, res4, res7,
-              res6, res0, res1, res2, res3);
-    DUP4_ARG2(__lasx_xvpermi_d, res0, 0xd8, res1, 0xd8, res2, 0xd8, res3, 0xd8,
-              res0, res1, res2, res3);
-
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, dst0, dst1, dst2, dst3);
-    dst += dst_stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, dst4, dst5, dst6, dst7);
-    dst -= dst_stride_4x;
-    DUP4_ARG2(__lasx_xvilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
-              dst0, dst1, dst2, dst3);
-    DUP4_ARG2(__lasx_xvilvl_b, zero, dst4, zero, dst5, zero, dst6, zero, dst7,
-              dst4, dst5, dst6, dst7);
-    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
-              dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
-    res0 = __lasx_xvadd_h(res0, dst0);
-    res1 = __lasx_xvadd_h(res1, dst1);
-    res2 = __lasx_xvadd_h(res2, dst2);
-    res3 = __lasx_xvadd_h(res3, dst3);
-    DUP4_ARG1(__lasx_xvclip255_h, res0, res1, res2, res3, res0, res1,
-              res2, res3);
-    DUP2_ARG2(__lasx_xvpickev_b, res1, res0, res3, res2, res0, res1);
-    __lasx_xvstelm_d(res0, dst, 0, 0);
-    __lasx_xvstelm_d(res0, dst + dst_stride, 0, 2);
-    __lasx_xvstelm_d(res0, dst + dst_stride_2x, 0, 1);
-    __lasx_xvstelm_d(res0, dst + dst_stride_3x, 0, 3);
-    dst += dst_stride_4x;
-    __lasx_xvstelm_d(res1, dst, 0, 0);
-    __lasx_xvstelm_d(res1, dst + dst_stride, 0, 2);
-    __lasx_xvstelm_d(res1, dst + dst_stride_2x, 0, 1);
-    __lasx_xvstelm_d(res1, dst + dst_stride_3x, 0, 3);
-}
-
-void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
-                                    int32_t dst_stride)
-{
-    const int16_t dc = (src[0] + 32) >> 6;
-    int32_t dst_stride_2x = dst_stride << 1;
-    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-    __m256i pred, out;
-    __m256i src0, src1, src2, src3;
-    __m256i input_dc = __lasx_xvreplgr2vr_h(dc);
-
-    src[0] = 0;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, src0, src1, src2, src3);
-    DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);
-
-    pred = __lasx_xvpermi_q(src0, src1, 0x02);
-    pred = __lasx_xvaddw_h_h_bu(input_dc, pred);
-    pred = __lasx_xvclip255_h(pred);
-    out = __lasx_xvpickev_b(pred, pred);
-    __lasx_xvstelm_w(out, dst, 0, 0);
-    __lasx_xvstelm_w(out, dst + dst_stride, 0, 1);
-    __lasx_xvstelm_w(out, dst + dst_stride_2x, 0, 4);
-    __lasx_xvstelm_w(out, dst + dst_stride_3x, 0, 5);
-}
-
-void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
-                                  int32_t dst_stride)
-{
-    int32_t dc_val;
-    int32_t dst_stride_2x = dst_stride << 1;
-    int32_t dst_stride_4x = dst_stride << 2;
-    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
-    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-    __m256i dc;
-
-    dc_val = (src[0] + 32) >> 6;
-    dc = __lasx_xvreplgr2vr_h(dc_val);
-
-    src[0] = 0;
-
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, dst0, dst1, dst2, dst3);
-    dst += dst_stride_4x;
-    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
-              dst, dst_stride_3x, dst4, dst5, dst6, dst7);
-    dst -= dst_stride_4x;
-    DUP4_ARG1(__lasx_vext2xv_hu_bu, dst0, dst1, dst2, dst3,
-              dst0, dst1, dst2, dst3);
-    DUP4_ARG1(__lasx_vext2xv_hu_bu, dst4, dst5, dst6, dst7,
-              dst4, dst5, dst6, dst7);
-    DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
-              dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
-    dst0 = __lasx_xvadd_h(dst0, dc);
-    dst1 = __lasx_xvadd_h(dst1, dc);
-    dst2 = __lasx_xvadd_h(dst2, dc);
-    dst3 = __lasx_xvadd_h(dst3, dc);
-    DUP4_ARG1(__lasx_xvclip255_h, dst0, dst1, dst2, dst3,
-              dst0, dst1, dst2, dst3);
-    DUP2_ARG2(__lasx_xvpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
-    __lasx_xvstelm_d(dst0, dst, 0, 0);
-    __lasx_xvstelm_d(dst0, dst + dst_stride, 0, 2);
-    __lasx_xvstelm_d(dst0, dst + dst_stride_2x, 0, 1);
-    __lasx_xvstelm_d(dst0, dst + dst_stride_3x, 0, 3);
-    dst += dst_stride_4x;
-    __lasx_xvstelm_d(dst1, dst, 0, 0);
-    __lasx_xvstelm_d(dst1, dst + dst_stride, 0, 2);
-    __lasx_xvstelm_d(dst1, dst + dst_stride_2x, 0, 1);
-    __lasx_xvstelm_d(dst1, dst + dst_stride_3x, 0, 3);
-}
-
-void ff_h264_idct_add16_lasx(uint8_t *dst,
-                             const int32_t *blk_offset,
-                             int16_t *block, int32_t dst_stride,
-                             const uint8_t nzc[15 * 8])
-{
-    int32_t i;
-
-    for (i = 0; i < 16; i++) {
-        int32_t nnz = nzc[scan8[i]];
-
-        if (nnz) {
-            if (nnz == 1 && ((dctcoef *) block)[i * 16])
-                ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
-                                               block + i * 16 * sizeof(pixel),
-                                               dst_stride);
-            else
-                ff_h264_idct_add_lasx(dst + blk_offset[i],
-                                      block + i * 16 * sizeof(pixel),
-                                      dst_stride);
-        }
-    }
-}
-
-void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
-                             int16_t *block, int32_t dst_stride,
-                             const uint8_t nzc[15 * 8])
-{
-    int32_t cnt;
-
-    for (cnt = 0; cnt < 16; cnt += 4) {
-        int32_t nnz = nzc[scan8[cnt]];
-
-        if (nnz) {
-            if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
-                ff_h264_idct8_dc_addblk_lasx(dst + blk_offset[cnt],
-                                             block + cnt * 16 * sizeof(pixel),
-                                             dst_stride);
-            else
-                ff_h264_idct8_addblk_lasx(dst + blk_offset[cnt],
-                                          block + cnt * 16 * sizeof(pixel),
-                                          dst_stride);
-        }
-    }
-}
-
-
-void ff_h264_idct_add8_lasx(uint8_t **dst,
-                            const int32_t *blk_offset,
-                            int16_t *block, int32_t dst_stride,
-                            const uint8_t nzc[15 * 8])
-{
-    int32_t i;
-
-    for (i = 16; i < 20; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-    for (i = 32; i < 36; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-}
-
-void ff_h264_idct_add8_422_lasx(uint8_t **dst,
-                                const int32_t *blk_offset,
-                                int16_t *block, int32_t dst_stride,
-                                const uint8_t nzc[15 * 8])
-{
-    int32_t i;
-
-    for (i = 16; i < 20; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-    for (i = 32; i < 36; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-    for (i = 20; i < 24; i++) {
-        if (nzc[scan8[i + 4]])
-            ff_h264_idct_add_lasx(dst[0] + blk_offset[i + 4],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i + 4],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-    for (i = 36; i < 40; i++) {
-        if (nzc[scan8[i + 4]])
-            ff_h264_idct_add_lasx(dst[1] + blk_offset[i + 4],
-                                  block + i * 16 * sizeof(pixel),
-                                  dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i + 4],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-}
-
-void ff_h264_idct_add16_intra_lasx(uint8_t *dst,
-                                   const int32_t *blk_offset,
-                                   int16_t *block,
-                                   int32_t dst_stride,
-                                   const uint8_t nzc[15 * 8])
-{
-    int32_t i;
-
-    for (i = 0; i < 16; i++) {
-        if (nzc[scan8[i]])
-            ff_h264_idct_add_lasx(dst + blk_offset[i],
-                                  block + i * 16 * sizeof(pixel), dst_stride);
-        else if (((dctcoef *) block)[i * 16])
-            ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
-                                           block + i * 16 * sizeof(pixel),
-                                           dst_stride);
-    }
-}
-
-void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
-                                   int32_t de_qval)
-{
-#define DC_DEST_STRIDE 16
-
-    __m256i src0, src1, src2, src3;
-    __m256i vec0, vec1, vec2, vec3;
-    __m256i tmp0, tmp1, tmp2, tmp3;
-    __m256i hres0, hres1, hres2, hres3;
-    __m256i vres0, vres1, vres2, vres3;
-    __m256i de_q_vec = __lasx_xvreplgr2vr_w(de_qval);
-
-    DUP4_ARG2(__lasx_xvld, src, 0, src, 8, src, 16, src, 24,
-              src0, src1, src2, src3);
-    LASX_TRANSPOSE4x4_H(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
-    LASX_BUTTERFLY_4_H(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
-    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
-    LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3,
-                        hres0, hres1, hres2, hres3);
-    LASX_BUTTERFLY_4_H(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
-    LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
-    DUP4_ARG1(__lasx_vext2xv_w_h, vres0, vres1, vres2, vres3,
-              vres0, vres1, vres2, vres3);
-    DUP2_ARG3(__lasx_xvpermi_q, vres1, vres0, 0x20, vres3, vres2, 0x20,
-              vres0, vres1);
-
-    vres0 = __lasx_xvmul_w(vres0, de_q_vec);
-    vres1 = __lasx_xvmul_w(vres1, de_q_vec);
-
-    vres0 = __lasx_xvsrari_w(vres0, 8);
-    vres1 = __lasx_xvsrari_w(vres1, 8);
-    vec0 = __lasx_xvpickev_h(vres1, vres0);
-    vec0 = __lasx_xvpermi_d(vec0, 0xd8);
-    __lasx_xvstelm_h(vec0, dst + 0  * DC_DEST_STRIDE, 0, 0);
-    __lasx_xvstelm_h(vec0, dst + 2  * DC_DEST_STRIDE, 0, 1);
-    __lasx_xvstelm_h(vec0, dst + 8  * DC_DEST_STRIDE, 0, 2);
-    __lasx_xvstelm_h(vec0, dst + 10 * DC_DEST_STRIDE, 0, 3);
-    __lasx_xvstelm_h(vec0, dst + 1  * DC_DEST_STRIDE, 0, 4);
-    __lasx_xvstelm_h(vec0, dst + 3  * DC_DEST_STRIDE, 0, 5);
-    __lasx_xvstelm_h(vec0, dst + 9  * DC_DEST_STRIDE, 0, 6);
-    __lasx_xvstelm_h(vec0, dst + 11 * DC_DEST_STRIDE, 0, 7);
-    __lasx_xvstelm_h(vec0, dst + 4  * DC_DEST_STRIDE, 0, 8);
-    __lasx_xvstelm_h(vec0, dst + 6  * DC_DEST_STRIDE, 0, 9);
-    __lasx_xvstelm_h(vec0, dst + 12 * DC_DEST_STRIDE, 0, 10);
-    __lasx_xvstelm_h(vec0, dst + 14 * DC_DEST_STRIDE, 0, 11);
-    __lasx_xvstelm_h(vec0, dst + 5  * DC_DEST_STRIDE, 0, 12);
-    __lasx_xvstelm_h(vec0, dst + 7  * DC_DEST_STRIDE, 0, 13);
-    __lasx_xvstelm_h(vec0, dst + 13 * DC_DEST_STRIDE, 0, 14);
-    __lasx_xvstelm_h(vec0, dst + 15 * DC_DEST_STRIDE, 0, 15);
-
-#undef DC_DEST_STRIDE
-}
diff --git a/libavcodec/loongarch/h264idct_loongarch.c b/libavcodec/loongarch/h264idct_loongarch.c
new file mode 100644
index 0000000000..26af45503f
--- /dev/null
+++ b/libavcodec/loongarch/h264idct_loongarch.c
@@ -0,0 +1,184 @@ 
+/*
+ * Loongson LSX/LASX optimized h264idct
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei  Gu  <guxiwei-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264dsp_loongarch.h"
+#include "libavcodec/bit_depth_template.c"
+
+void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        int32_t nnz = nzc[scan8[i]];
+
+        if (nnz == 1 && ((dctcoef *) block)[i * 16]) {
+            ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+        } else if (nnz) {
+            ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        }
+    }
+}
+
+void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                              int16_t *block, int32_t dst_stride,
+                              const uint8_t nzc[15 * 8])
+{
+    int32_t cnt;
+
+    for (cnt = 0; cnt < 16; cnt += 4) {
+        int32_t nnz = nzc[scan8[cnt]];
+
+        if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+            ff_h264_idct8_dc_add_8_lsx(dst + blk_offset[cnt],
+                                        block + cnt * 16 * sizeof(pixel),
+                                        dst_stride);
+        } else if (nnz) {
+            ff_h264_idct8_add_8_lsx(dst + blk_offset[cnt],
+                                     block + cnt * 16 * sizeof(pixel),
+                                     dst_stride);
+        }
+    }
+}
+
+#if HAVE_LASX
+void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
+                               int16_t *block, int32_t dst_stride,
+                               const uint8_t nzc[15 * 8])
+{
+    int32_t cnt;
+
+    for (cnt = 0; cnt < 16; cnt += 4) {
+        int32_t nnz = nzc[scan8[cnt]];
+
+        if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
+            ff_h264_idct8_dc_add_8_lasx(dst + blk_offset[cnt],
+                                        block + cnt * 16 * sizeof(pixel),
+                                        dst_stride);
+        } else if (nnz) {
+            ff_h264_idct8_add_8_lasx(dst + blk_offset[cnt],
+                                     block + cnt * 16 * sizeof(pixel),
+                                     dst_stride);
+        }
+    }
+}
+#endif // #if HAVE_LASX
+
+void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                             int16_t *block, int32_t dst_stride,
+                             const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 16; i < 20; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 32; i < 36; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
+
+void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
+                                 int16_t *block, int32_t dst_stride,
+                                 const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 16; i < 20; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 20; i < 24; i++) {
+        if (nzc[scan8[i + 4]])
+            ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i + 4],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i + 4],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 32; i < 36; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+    for (i = 36; i < 40; i++) {
+        if (nzc[scan8[i + 4]])
+            ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i + 4],
+                                   block + i * 16 * sizeof(pixel),
+                                   dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i + 4],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
+
+void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
+                                    int16_t *block, int32_t dst_stride,
+                                    const uint8_t nzc[15 * 8])
+{
+    int32_t i;
+
+    for (i = 0; i < 16; i++) {
+        if (nzc[scan8[i]])
+            ff_h264_idct_add_8_lsx(dst + blk_offset[i],
+                                   block + i * 16 * sizeof(pixel), dst_stride);
+        else if (((dctcoef *) block)[i * 16])
+            ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
+                                      block + i * 16 * sizeof(pixel),
+                                      dst_stride);
+    }
+}
diff --git a/libavcodec/loongarch/loongson_asm.S b/libavcodec/loongarch/loongson_asm.S
new file mode 100644
index 0000000000..0a649f51c7
--- /dev/null
+++ b/libavcodec/loongarch/loongson_asm.S
@@ -0,0 +1,945 @@ 
+/*
+ * Loongson asm helper.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
+ *                Shiyou Yin(yinshiyou-hf@loongson.cn)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LML_VERSION_MAJOR 0
+#define LML_VERSION_MINOR 2
+#define LML_VERSION_MICRO 0
+
+/*
+ *============================================================================
+ * Macros for a specific project; set them as needed.
+ * The following LoongML macros are provided for reference.
+ *============================================================================
+ */
+#define ASM_PREF
+#define DEFAULT_ALIGN    5
+
+.macro function name, align=DEFAULT_ALIGN
+.macro endfunc
+    jirl    $r0, $r1, 0x0
+    .size ASM_PREF\name, . - ASM_PREF\name
+    .purgem endfunc
+.endm
+.text ;
+.align \align ;
+.globl ASM_PREF\name ;
+.type  ASM_PREF\name, @function ;
+ASM_PREF\name: ;
+.endm
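+
+/*
+ * Usage sketch (illustrative only; the function name and register choices
+ * below are hypothetical): a small leaf routine wrapped with the
+ * function/endfunc pair.
+ *
+ *     function ff_example_copy_lsx
+ *         vld     vr0,    a0,    0
+ *         vst     vr0,    a1,    0
+ *     endfunc
+ *
+ * endfunc emits the return (jirl $r0, $r1, 0x0) and the .size directive,
+ * so the body does not need an explicit return.
+ */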
+
+/**
+ *  Attention: If align is not zero, the macro will use
+ *  t7 until the end of the function.
+ */
+.macro alloc_stack size, align=0
+.if \align
+    .macro clean_stack
+        add.d   sp, sp, t7
+    .endm
+    addi.d  sp, sp, - \size
+    andi.d  t7, sp, \align - 1
+    sub.d   sp, sp, t7
+    addi.d  t7, t7, \size
+.else
+    .macro clean_stack
+        addi.d  sp, sp, \size
+    .endm
+    addi.d  sp, sp, - \size
+.endif
+.endm
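+
+/*
+ * Usage sketch (illustrative only): reserving 32 bytes with 16-byte
+ * alignment and releasing them before returning.  With a non-zero align,
+ * t7 holds the total adjustment and must be left untouched until
+ * clean_stack runs.
+ *
+ *     alloc_stack 32, 16
+ *     ...
+ *     clean_stack
+ */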
+
+.macro  const name, align=DEFAULT_ALIGN
+    .macro endconst
+    .size  \name, . - \name
+    .purgem endconst
+    .endm
+.section .rodata
+.align   \align
+\name:
+.endm
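+
+/*
+ * Usage sketch (illustrative only; the table name and contents are
+ * hypothetical): a read-only constant table bracketed by const/endconst.
+ *
+ *     const shuf_even_bytes
+ *         .byte  0, 2, 4, 6, 8, 10, 12, 14
+ *     endconst
+ */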
+
+/*
+ *============================================================================
+ * LoongArch register alias
+ *============================================================================
+ */
+
+#define a0 $a0
+#define a1 $a1
+#define a2 $a2
+#define a3 $a3
+#define a4 $a4
+#define a5 $a5
+#define a6 $a6
+#define a7 $a7
+
+#define t0 $t0
+#define t1 $t1
+#define t2 $t2
+#define t3 $t3
+#define t4 $t4
+#define t5 $t5
+#define t6 $t6
+#define t7 $t7
+#define t8 $t8
+
+#define s0 $s0
+#define s1 $s1
+#define s2 $s2
+#define s3 $s3
+#define s4 $s4
+#define s5 $s5
+#define s6 $s6
+#define s7 $s7
+#define s8 $s8
+
+#define zero $zero
+#define sp   $sp
+#define ra   $ra
+
+#define f0  $f0
+#define f1  $f1
+#define f2  $f2
+#define f3  $f3
+#define f4  $f4
+#define f5  $f5
+#define f6  $f6
+#define f7  $f7
+#define f8  $f8
+#define f9  $f9
+#define f10 $f10
+#define f11 $f11
+#define f12 $f12
+#define f13 $f13
+#define f14 $f14
+#define f15 $f15
+#define f16 $f16
+#define f17 $f17
+#define f18 $f18
+#define f19 $f19
+#define f20 $f20
+#define f21 $f21
+#define f22 $f22
+#define f23 $f23
+#define f24 $f24
+#define f25 $f25
+#define f26 $f26
+#define f27 $f27
+#define f28 $f28
+#define f29 $f29
+#define f30 $f30
+#define f31 $f31
+
+#define vr0 $vr0
+#define vr1 $vr1
+#define vr2 $vr2
+#define vr3 $vr3
+#define vr4 $vr4
+#define vr5 $vr5
+#define vr6 $vr6
+#define vr7 $vr7
+#define vr8 $vr8
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
+
+#define xr0 $xr0
+#define xr1 $xr1
+#define xr2 $xr2
+#define xr3 $xr3
+#define xr4 $xr4
+#define xr5 $xr5
+#define xr6 $xr6
+#define xr7 $xr7
+#define xr8 $xr8
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
+
+/*
+ *============================================================================
+ * LSX/LASX synthesize instructions
+ *============================================================================
+ */
+
+/*
+ * Description : Dot product of byte/halfword vector elements
+ * Arguments   : Inputs  - vj, vk
+ *               Outputs - vd
+ *               Return Type - twice the size of the input elements
+ */
+.macro vdp2.h.bu vd, vj, vk
+    vmulwev.h.bu      \vd,    \vj,    \vk
+    vmaddwod.h.bu     \vd,    \vj,    \vk
+.endm
+
+.macro vdp2.h.bu.b vd, vj, vk
+    vmulwev.h.bu.b    \vd,    \vj,    \vk
+    vmaddwod.h.bu.b   \vd,    \vj,    \vk
+.endm
+
+.macro vdp2.w.h vd, vj, vk
+    vmulwev.w.h       \vd,    \vj,    \vk
+    vmaddwod.w.h      \vd,    \vj,    \vk
+.endm
+
+.macro xvdp2.h.bu xd, xj, xk
+    xvmulwev.h.bu    \xd,    \xj,    \xk
+    xvmaddwod.h.bu   \xd,    \xj,    \xk
+.endm
+
+.macro xvdp2.h.bu.b xd, xj, xk
+    xvmulwev.h.bu.b    \xd,  \xj,    \xk
+    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
+.endm
+
+.macro xvdp2.w.h xd, xj, xk
+    xvmulwev.w.h       \xd,  \xj,    \xk
+    xvmaddwod.w.h      \xd,  \xj,    \xk
+.endm
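+
+/*
+ * Usage sketch (illustrative only, register numbers chosen arbitrarily):
+ *
+ *     vdp2.h.bu    vr0,    vr1,    vr2
+ *
+ * expands to vmulwev.h.bu followed by vmaddwod.h.bu, so each halfword of
+ * vr0 receives the widened product of an even byte pair plus that of the
+ * neighbouring odd byte pair.  Since the first instruction overwrites vd,
+ * vd should not alias vj or vk.
+ */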
+
+/*
+ * Description : Dot product & addition of byte/halfword vector elements
+ * Arguments   : Inputs  - vj, vk
+ *               Outputs - vd
+ *               Return Type - twice the size of the input elements
+ */
+.macro vdp2add.h.bu vd, vj, vk
+    vmaddwev.h.bu     \vd,    \vj,    \vk
+    vmaddwod.h.bu     \vd,    \vj,    \vk
+.endm
+
+.macro vdp2add.h.bu.b vd, vj, vk
+    vmaddwev.h.bu.b   \vd,    \vj,    \vk
+    vmaddwod.h.bu.b   \vd,    \vj,    \vk
+.endm
+
+.macro vdp2add.w.h vd, vj, vk
+    vmaddwev.w.h      \vd,    \vj,    \vk
+    vmaddwod.w.h      \vd,    \vj,    \vk
+.endm
+
+.macro xvdp2add.h.bu.b xd, xj, xk
+    xvmaddwev.h.bu.b   \xd,  \xj,    \xk
+    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
+.endm
+
+.macro xvdp2add.w.h xd, xj, xk
+    xvmaddwev.w.h      \xd,  \xj,    \xk
+    xvmaddwod.w.h      \xd,  \xj,    \xk
+.endm
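+
+/*
+ * Usage sketch (illustrative only): accumulating a halfword dot product
+ * into an existing 32-bit accumulator vr0.
+ *
+ *     vxor.v         vr0,    vr0,    vr0
+ *     vdp2add.w.h    vr0,    vr1,    vr2
+ *
+ * Both the even and the odd lane products are multiply-accumulated into
+ * vr0, so vr0 must hold a valid accumulator before the first use.
+ */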
+
+/*
+ * Description : Clamp each element of a vector to a range
+ * clip:    vd = min(max(vj, vk), va)
+ * clip255: vd = min(max(vj, 0), 255)
+ */
+.macro vclip.h  vd,  vj, vk, va
+    vmax.h    \vd,  \vj,   \vk
+    vmin.h    \vd,  \vd,   \va
+.endm
+
+.macro vclip255.w  vd, vj
+    vmaxi.w   \vd,   \vj,  0
+    vsat.wu   \vd,   \vd,  7
+.endm
+
+.macro vclip255.h  vd, vj
+    vmaxi.h   \vd,   \vj,  0
+    vsat.hu   \vd,   \vd,  7
+.endm
+
+.macro xvclip.h  xd,  xj, xk, xa
+    xvmax.h    \xd,  \xj,   \xk
+    xvmin.h    \xd,  \xd,   \xa
+.endm
+
+.macro xvclip255.h  xd, xj
+    xvmaxi.h   \xd,   \xj,  0
+    xvsat.hu   \xd,   \xd,  7
+.endm
+
+.macro xvclip255.w  xd, xj
+    xvmaxi.w   \xd,   \xj,  0
+    xvsat.wu   \xd,   \xd,  7
+.endm
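+
+/*
+ * Usage sketch (illustrative only): clamping halfword elements to a
+ * caller-provided [vr2, vr3] range, and clamping widened word results to
+ * the unsigned 8-bit range.
+ *
+ *     vclip.h      vr0,    vr1,    vr2,    vr3
+ *     vclip255.w   vr4,    vr5
+ */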
+
+/*
+ * Description : Store elements of vector
+ * vd : Data vector to be stored
+ * rk : Address of data storage
+ * ra : Offset of address
+ * si : Index of data in vd
+ */
+.macro vstelmx.b vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.b   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.h vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.h   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.w vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.w   \vd,  \rk,  0, \si
+.endm
+
+.macro vstelmx.d  vd, rk, ra, si
+    add.d      \rk,  \rk,  \ra
+    vstelm.d   \vd,  \rk,  0, \si
+.endm
+
+.macro vmov xd, xj
+    vor.v  \xd,  \xj,  \xj
+.endm
+
+.macro xmov xd, xj
+    xvor.v  \xd,  \xj,  \xj
+.endm
+
+.macro xvstelmx.d  xd, rk, ra, si
+    add.d      \rk, \rk,  \ra
+    xvstelm.d  \xd, \rk,  0, \si
+.endm
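+
+/*
+ * Usage sketch (illustrative only): vstelmx.* advances the address register
+ * by the offset first and stores one element afterwards, which suits
+ * walking a destination with a constant stride:
+ *
+ *     vstelmx.d    vr0,    a0,    a1,    0
+ *     vstelmx.d    vr0,    a0,    a1,    1
+ *
+ * stores the two double-word lanes of vr0 to rows spaced a1 bytes apart;
+ * note that a0 is modified (advanced by 2 * a1 in total).  vmov/xmov are
+ * plain register moves implemented as vor.v/xvor.v of a register with
+ * itself.
+ */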
+
+/*
+ *============================================================================
+ * LSX/LASX custom macros
+ *============================================================================
+ */
+
+/*
+ * Load 4 float, double, V128 or V256 elements with stride.
+ */
+.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    fld.s     \out0,    \src,    0
+    fldx.s    \out1,    \src,    \stride
+    fldx.s    \out2,    \src,    \stride2
+    fldx.s    \out3,    \src,    \stride3
+.endm
+
+.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    fld.d     \out0,    \src,    0
+    fldx.d    \out1,    \src,    \stride
+    fldx.d    \out2,    \src,    \stride2
+    fldx.d    \out3,    \src,    \stride3
+.endm
+
+.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    vld     \out0,    \src,    0
+    vldx    \out1,    \src,    \stride
+    vldx    \out2,    \src,    \stride2
+    vldx    \out3,    \src,    \stride3
+.endm
+
+.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
+    xvld    \out0,    \src,    0
+    xvldx   \out1,    \src,    \stride
+    xvldx   \out2,    \src,    \stride2
+    xvldx   \out3,    \src,    \stride3
+.endm
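+
+/*
+ * Usage sketch (illustrative only): loading four LSX rows whose start
+ * addresses are a2 bytes apart; stride2/stride3 are expected to be the
+ * precomputed values 2*stride and 3*stride.
+ *
+ *     slli.d    t1,    a2,    1
+ *     add.d     t2,    t1,    a2
+ *     LSX_LOADX_4    a0,    a2,    t1,    t2,    vr0,    vr1,    vr2,    vr3
+ */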
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                          tmp0, tmp1
+    vilvl.h   \tmp0,  \in1,   \in0
+    vilvl.h   \tmp1,  \in3,   \in2
+    vilvl.w   \out0,  \tmp1,  \tmp0
+    vilvh.w   \out2,  \tmp1,  \tmp0
+    vilvh.d   \out1,  \out0,  \out0
+    vilvh.d   \out3,  \out0,  \out2
+.endm
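+
+/*
+ * Usage sketch (illustrative only): transposing the 4x4 halfword block held
+ * in the low halves of vr0-vr3, with vr8/vr9 as scratch.
+ *
+ *     LSX_TRANSPOSE4x4_H  vr0, vr1, vr2, vr3,  vr0, vr1, vr2, vr3,  vr8, vr9
+ *
+ * Reusing the input registers as outputs is fine here, because the inputs
+ * are consumed into the temporaries before any output is written.
+ */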
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4            1, 5, 9,13
+ *               5, 6, 7, 8    to      2, 6,10,14
+ *               9,10,11,12  =====>    3, 7,11,15
+ *              13,14,15,16            4, 8,12,16
+ */
+.macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                          _tmp0, _tmp1
+
+    vilvl.w    \_tmp0,   \_in1,    \_in0
+    vilvh.w    \_out1,   \_in1,    \_in0
+    vilvl.w    \_tmp1,   \_in3,    \_in2
+    vilvh.w    \_out3,   \_in3,    \_in2
+
+    vilvl.d    \_out0,   \_tmp1,   \_tmp0
+    vilvl.d    \_out2,   \_out3,   \_out1
+    vilvh.d    \_out3,   \_out3,   \_out1
+    vilvh.d    \_out1,   \_tmp1,   \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
+                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
+                          tmp3, tmp4, tmp5, tmp6, tmp7
+    vilvl.h      \tmp0,    \in6,   \in4
+    vilvl.h      \tmp1,    \in7,   \in5
+    vilvl.h      \tmp2,    \in2,   \in0
+    vilvl.h      \tmp3,    \in3,   \in1
+
+    vilvl.h      \tmp4,    \tmp1,  \tmp0
+    vilvh.h      \tmp5,    \tmp1,  \tmp0
+    vilvl.h      \tmp6,    \tmp3,  \tmp2
+    vilvh.h      \tmp7,    \tmp3,  \tmp2
+
+    vilvh.h      \tmp0,    \in6,   \in4
+    vilvh.h      \tmp1,    \in7,   \in5
+    vilvh.h      \tmp2,    \in2,   \in0
+    vilvh.h      \tmp3,    \in3,   \in1
+
+    vpickev.d    \out0,    \tmp4,  \tmp6
+    vpickod.d    \out1,    \tmp4,  \tmp6
+    vpickev.d    \out2,    \tmp5,  \tmp7
+    vpickod.d    \out3,    \tmp5,  \tmp7
+
+    vilvl.h      \tmp4,    \tmp1,  \tmp0
+    vilvh.h      \tmp5,    \tmp1,  \tmp0
+    vilvl.h      \tmp6,    \tmp3,  \tmp2
+    vilvh.h      \tmp7,    \tmp3,  \tmp2
+
+    vpickev.d    \out4,    \tmp4,  \tmp6
+    vpickod.d    \out5,    \tmp4,  \tmp6
+    vpickev.d    \out6,    \tmp5,  \tmp7
+    vpickod.d    \out7,    \tmp5,  \tmp7
+.endm
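+
+/*
+ * Usage sketch (illustrative only): an 8x8 halfword transpose of vr0-vr7,
+ * written back into the same registers, with vr8-vr15 as temporaries.
+ * The temporaries must be distinct from both inputs and outputs; the
+ * invocation is a single line, wrapped here for readability.
+ *
+ *     LSX_TRANSPOSE8x8_H  vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,
+ *                         vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,
+ *                         vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+ */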
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3, out4, out5, out6, out7,\
+                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    xvilvl.b   \tmp0,    \in2,     \in0
+    xvilvl.b   \tmp1,    \in3,     \in1
+    xvilvl.b   \tmp2,    \in6,     \in4
+    xvilvl.b   \tmp3,    \in7,     \in5
+    xvilvl.b   \tmp4,    \in10,    \in8
+    xvilvl.b   \tmp5,    \in11,    \in9
+    xvilvl.b   \tmp6,    \in14,    \in12
+    xvilvl.b   \tmp7,    \in15,    \in13
+    xvilvl.b   \out0,    \tmp1,    \tmp0
+    xvilvh.b   \out1,    \tmp1,    \tmp0
+    xvilvl.b   \out2,    \tmp3,    \tmp2
+    xvilvh.b   \out3,    \tmp3,    \tmp2
+    xvilvl.b   \out4,    \tmp5,    \tmp4
+    xvilvh.b   \out5,    \tmp5,    \tmp4
+    xvilvl.b   \out6,    \tmp7,    \tmp6
+    xvilvh.b   \out7,    \tmp7,    \tmp6
+    xvilvl.w   \tmp0,    \out2,    \out0
+    xvilvh.w   \tmp2,    \out2,    \out0
+    xvilvl.w   \tmp4,    \out3,    \out1
+    xvilvh.w   \tmp6,    \out3,    \out1
+    xvilvl.w   \tmp1,    \out6,    \out4
+    xvilvh.w   \tmp3,    \out6,    \out4
+    xvilvl.w   \tmp5,    \out7,    \out5
+    xvilvh.w   \tmp7,    \out7,    \out5
+    xvilvl.d   \out0,    \tmp1,    \tmp0
+    xvilvh.d   \out1,    \tmp1,    \tmp0
+    xvilvl.d   \out2,    \tmp3,    \tmp2
+    xvilvh.d   \out3,    \tmp3,    \tmp2
+    xvilvl.d   \out4,    \tmp5,    \tmp4
+    xvilvh.d   \out5,    \tmp5,    \tmp4
+    xvilvl.d   \out6,    \tmp7,    \tmp6
+    xvilvh.d   \out7,    \tmp7,    \tmp6
+.endm
+
+/*
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
+                           in8, in9, in10, in11, in12, in13, in14, in15,  \
+                           out0, out1, out2, out3, out4, out5, out6, out7,\
+                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    vilvl.b   \tmp0,    \in2,     \in0
+    vilvl.b   \tmp1,    \in3,     \in1
+    vilvl.b   \tmp2,    \in6,     \in4
+    vilvl.b   \tmp3,    \in7,     \in5
+    vilvl.b   \tmp4,    \in10,    \in8
+    vilvl.b   \tmp5,    \in11,    \in9
+    vilvl.b   \tmp6,    \in14,    \in12
+    vilvl.b   \tmp7,    \in15,    \in13
+
+    vilvl.b   \out0,    \tmp1,    \tmp0
+    vilvh.b   \out1,    \tmp1,    \tmp0
+    vilvl.b   \out2,    \tmp3,    \tmp2
+    vilvh.b   \out3,    \tmp3,    \tmp2
+    vilvl.b   \out4,    \tmp5,    \tmp4
+    vilvh.b   \out5,    \tmp5,    \tmp4
+    vilvl.b   \out6,    \tmp7,    \tmp6
+    vilvh.b   \out7,    \tmp7,    \tmp6
+    vilvl.w   \tmp0,    \out2,    \out0
+    vilvh.w   \tmp2,    \out2,    \out0
+    vilvl.w   \tmp4,    \out3,    \out1
+    vilvh.w   \tmp6,    \out3,    \out1
+    vilvl.w   \tmp1,    \out6,    \out4
+    vilvh.w   \tmp3,    \out6,    \out4
+    vilvl.w   \tmp5,    \out7,    \out5
+    vilvh.w   \tmp7,    \out7,    \out5
+    vilvl.d   \out0,    \tmp1,    \tmp0
+    vilvh.d   \out1,    \tmp1,    \tmp0
+    vilvl.d   \out2,    \tmp3,    \tmp2
+    vilvh.d   \out3,    \tmp3,    \tmp2
+    vilvl.d   \out4,    \tmp5,    \tmp4
+    vilvh.d   \out5,    \tmp5,    \tmp4
+    vilvl.d   \out6,    \tmp7,    \tmp6
+    vilvh.d   \out7,    \tmp7,    \tmp6
+.endm
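+
+/*
+ * Usage sketch (illustrative only): both 16x8 byte transposes take sixteen
+ * input rows, eight output vectors and eight distinct temporaries, i.e. a
+ * call names 32 register operands (e.g. vr0-vr15 as inputs, vr16-vr23 as
+ * outputs and vr24-vr31 as temporaries for the LSX variant).
+ */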
+
+/*
+ * Description : Transpose 4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                           tmp0, tmp1
+    xvilvl.h   \tmp0,  \in1,   \in0
+    xvilvl.h   \tmp1,  \in3,   \in2
+    xvilvl.w   \out0,  \tmp1,  \tmp0
+    xvilvh.w   \out2,  \tmp1,  \tmp0
+    xvilvh.d   \out1,  \out0,  \out0
+    xvilvh.d   \out3,  \out0,  \out2
+.endm
+
+/*
+ * Description : Transpose 4x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                           tmp0, tmp1
+    xvilvl.h      \tmp0,    \in2,   \in0
+    xvilvl.h      \tmp1,    \in3,   \in1
+    xvilvl.h      \out2,    \tmp1,  \tmp0
+    xvilvh.h      \out3,    \tmp1,  \tmp0
+
+    xvilvl.d      \out0,    \out2,  \out2
+    xvilvh.d      \out1,    \out2,  \out2
+    xvilvl.d      \out2,    \out3,  \out3
+    xvilvh.d      \out3,    \out3,  \out3
+.endm
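+
+/*
+ * Usage sketch (illustrative only, registers chosen arbitrarily): four
+ * inputs, four outputs and two temporaries.
+ *
+ *     LASX_TRANSPOSE4x8_H  xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, xr8, xr9
+ */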
+
+/*
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ */
+.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7,         \
+                           out0, out1, out2, out3, out4, out5, out6, out7, \
+                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
+    xvilvl.h     \tmp0,   \in6,     \in4
+    xvilvl.h     \tmp1,   \in7,     \in5
+    xvilvl.h     \tmp2,   \in2,     \in0
+    xvilvl.h     \tmp3,   \in3,     \in1
+
+    xvilvl.h     \tmp4,   \tmp1,    \tmp0
+    xvilvh.h     \tmp5,   \tmp1,    \tmp0
+    xvilvl.h     \tmp6,   \tmp3,    \tmp2
+    xvilvh.h     \tmp7,   \tmp3,    \tmp2
+
+    xvilvh.h     \tmp0,   \in6,     \in4
+    xvilvh.h     \tmp1,   \in7,     \in5
+    xvilvh.h     \tmp2,   \in2,     \in0
+    xvilvh.h     \tmp3,   \in3,     \in1
+
+    xvpickev.d   \out0,   \tmp4,    \tmp6
+    xvpickod.d   \out1,   \tmp4,    \tmp6
+    xvpickev.d   \out2,   \tmp5,    \tmp7
+    xvpickod.d   \out3,   \tmp5,    \tmp7
+
+    xvilvl.h     \tmp4,   \tmp1,    \tmp0
+    xvilvh.h     \tmp5,   \tmp1,    \tmp0
+    xvilvl.h     \tmp6,   \tmp3,    \tmp2
+    xvilvh.h     \tmp7,   \tmp3,    \tmp2
+
+    xvpickev.d   \out4,   \tmp4,    \tmp6
+    xvpickod.d   \out5,   \tmp4,    \tmp6
+    xvpickev.d   \out6,   \tmp5,    \tmp7
+    xvpickod.d   \out7,   \tmp5,    \tmp7
+.endm
+
+/*
+ * Description : Transpose 2x4x4 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ */
+.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
+                             tmp0, tmp1, tmp2
+    xvilvh.h   \tmp1,    \in0,     \in1
+    xvilvl.h   \out1,    \in0,     \in1
+    xvilvh.h   \tmp0,    \in2,     \in3
+    xvilvl.h   \out3,    \in2,     \in3
+
+    xvilvh.w   \tmp2,    \out3,    \out1
+    xvilvl.w   \out3,    \out3,    \out1
+
+    xvilvl.w   \out2,    \tmp0,    \tmp1
+    xvilvh.w   \tmp1,    \tmp0,    \tmp1
+
+    xvilvh.d   \out0,    \out2,    \out3
+    xvilvl.d   \out2,    \out2,    \out3
+    xvilvh.d   \out1,    \tmp1,    \tmp2
+    xvilvl.d   \out3,    \tmp1,    \tmp2
+.endm
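+
+/*
+ * Usage sketch (illustrative only): since the macro uses only in-lane
+ * interleaves, it transposes the 4x4 halfword block held in each 128-bit
+ * lane of xr0-xr3 independently, i.e. two 4x4 sub-blocks per call
+ * (single invocation line, wrapped here for readability).
+ *
+ *     LASX_TRANSPOSE2x4x4_H  xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7,
+ *                            xr8, xr9, xr10
+ */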
+
+/*
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4,  1, 2, 3, 4        1,5, 9,13, 1,5, 9,13
+ *               5, 6, 7, 8,  5, 6, 7, 8   to   2,6,10,14, 2,6,10,14
+ *               9,10,11,12,  9,10,11,12 =====> 3,7,11,15, 3,7,11,15
+ *              13,14,15,16, 13,14,15,16        4,8,12,16, 4,8,12,16
+ */
+.macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                           _tmp0, _tmp1
+
+    xvilvl.w    \_tmp0,   \_in1,    \_in0
+    xvilvh.w    \_out1,   \_in1,    \_in0
+    xvilvl.w    \_tmp1,   \_in3,    \_in2
+    xvilvh.w    \_out3,   \_in3,    \_in2
+
+    xvilvl.d    \_out0,   \_tmp1,   \_tmp0
+    xvilvl.d    \_out2,   \_out3,   \_out1
+    xvilvh.d    \_out3,   \_out3,   \_out1
+    xvilvh.d    \_out1,   \_tmp1,   \_tmp0
+.endm
+
+/*
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Example     : LASX_TRANSPOSE8x8_W
+ *        _in0 : 1,2,3,4,5,6,7,8
+ *        _in1 : 2,2,3,4,5,6,7,8
+ *        _in2 : 3,2,3,4,5,6,7,8
+ *        _in3 : 4,2,3,4,5,6,7,8
+ *        _in4 : 5,2,3,4,5,6,7,8
+ *        _in5 : 6,2,3,4,5,6,7,8
+ *        _in6 : 7,2,3,4,5,6,7,8
+ *        _in7 : 8,2,3,4,5,6,7,8
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8
+ *       _out1 : 2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8
+ */
+.macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,\
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7,\
+                           _tmp0, _tmp1, _tmp2, _tmp3
+    xvilvl.w    \_tmp0,   \_in2,    \_in0
+    xvilvl.w    \_tmp1,   \_in3,    \_in1
+    xvilvh.w    \_tmp2,   \_in2,    \_in0
+    xvilvh.w    \_tmp3,   \_in3,    \_in1
+    xvilvl.w    \_out0,   \_tmp1,   \_tmp0
+    xvilvh.w    \_out1,   \_tmp1,   \_tmp0
+    xvilvl.w    \_out2,   \_tmp3,   \_tmp2
+    xvilvh.w    \_out3,   \_tmp3,   \_tmp2
+
+    xvilvl.w    \_tmp0,   \_in6,    \_in4
+    xvilvl.w    \_tmp1,   \_in7,    \_in5
+    xvilvh.w    \_tmp2,   \_in6,    \_in4
+    xvilvh.w    \_tmp3,   \_in7,    \_in5
+    xvilvl.w    \_out4,   \_tmp1,   \_tmp0
+    xvilvh.w    \_out5,   \_tmp1,   \_tmp0
+    xvilvl.w    \_out6,   \_tmp3,   \_tmp2
+    xvilvh.w    \_out7,   \_tmp3,   \_tmp2
+
+    xmov        \_tmp0,   \_out0
+    xmov        \_tmp1,   \_out1
+    xmov        \_tmp2,   \_out2
+    xmov        \_tmp3,   \_out3
+    xvpermi.q   \_out0,   \_out4,   0x02
+    xvpermi.q   \_out1,   \_out5,   0x02
+    xvpermi.q   \_out2,   \_out6,   0x02
+    xvpermi.q   \_out3,   \_out7,   0x02
+    xvpermi.q   \_out4,   \_tmp0,   0x31
+    xvpermi.q   \_out5,   \_tmp1,   0x31
+    xvpermi.q   \_out6,   \_tmp2,   0x31
+    xvpermi.q   \_out7,   \_tmp3,   0x31
+.endm
+
+/*
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Example     : LASX_TRANSPOSE4x4_D
+ *        _in0 : 1,2,3,4
+ *        _in1 : 1,2,3,4
+ *        _in2 : 1,2,3,4
+ *        _in3 : 1,2,3,4
+ *
+ *       _out0 : 1,1,1,1
+ *       _out1 : 2,2,2,2
+ *       _out2 : 3,3,3,3
+ *       _out3 : 4,4,4,4
+ */
+.macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
+                           _tmp0, _tmp1
+    xvilvl.d    \_tmp0,   \_in1,    \_in0
+    xvilvh.d    \_out1,   \_in1,    \_in0
+    xvilvh.d    \_tmp1,   \_in3,    \_in2
+    xvilvl.d    \_out2,   \_in3,    \_in2
+
+    xvor.v      \_out0,   \_tmp0,   \_tmp0
+    xvor.v      \_out3,   \_tmp1,   \_tmp1
+
+    xvpermi.q   \_out0,   \_out2,   0x02
+    xvpermi.q   \_out2,   \_tmp0,   0x31
+    xvpermi.q   \_out3,   \_out1,   0x31
+    xvpermi.q   \_out1,   \_tmp1,   0x02
+.endm
+
+/*
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Details     : Butterfly operation
+ * Example     : LSX_BUTTERFLY_4
+ *               _out0 = _in0 + _in3;
+ *               _out1 = _in1 + _in2;
+ *               _out2 = _in1 - _in2;
+ *               _out3 = _in0 - _in3;
+ */
+.macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.b   \_out0,   \_in0,   \_in3
+    vadd.b   \_out1,   \_in1,   \_in2
+    vsub.b   \_out2,   \_in1,   \_in2
+    vsub.b   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.h   \_out0,   \_in0,   \_in3
+    vadd.h   \_out1,   \_in1,   \_in2
+    vsub.h   \_out2,   \_in1,   \_in2
+    vsub.h   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.w   \_out0,   \_in0,   \_in3
+    vadd.w   \_out1,   \_in1,   \_in2
+    vsub.w   \_out2,   \_in1,   \_in2
+    vsub.w   \_out3,   \_in0,   \_in3
+.endm
+.macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    vadd.d   \_out0,   \_in0,   \_in3
+    vadd.d   \_out1,   \_in1,   \_in2
+    vsub.d   \_out2,   \_in1,   \_in2
+    vsub.d   \_out3,   \_in0,   \_in3
+.endm
+
+.macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.b   \_out0,   \_in0,   \_in3
+    xvadd.b   \_out1,   \_in1,   \_in2
+    xvsub.b   \_out2,   \_in1,   \_in2
+    xvsub.b   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.h   \_out0,   \_in0,   \_in3
+    xvadd.h   \_out1,   \_in1,   \_in2
+    xvsub.h   \_out2,   \_in1,   \_in2
+    xvsub.h   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.w   \_out0,   \_in0,   \_in3
+    xvadd.w   \_out1,   \_in1,   \_in2
+    xvsub.w   \_out2,   \_in1,   \_in2
+    xvsub.w   \_out3,   \_in0,   \_in3
+.endm
+.macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
+    xvadd.d   \_out0,   \_in0,   \_in3
+    xvadd.d   \_out1,   \_in1,   \_in2
+    xvsub.d   \_out2,   \_in1,   \_in2
+    xvsub.d   \_out3,   \_in0,   \_in3
+.endm
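+
+/*
+ * Usage sketch (illustrative only): one butterfly stage over four halfword
+ * vectors, as used in 4-point IDCT-style transforms.
+ *
+ *     LSX_BUTTERFLY_4_H  vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
+ *
+ * gives vr4 = vr0+vr3, vr5 = vr1+vr2, vr6 = vr1-vr2, vr7 = vr0-vr3.  The
+ * outputs here are distinct from the inputs; overlapping them is only safe
+ * if checked against the macro body, since every input is read again after
+ * the first result is written.
+ */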
+
+/*
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ..., _in7
+ *               Outputs - _out0, _out1, _out2, _out3, ..., _out7
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_8
+ *               _out0 = _in0 + _in7;
+ *               _out1 = _in1 + _in6;
+ *               _out2 = _in2 + _in5;
+ *               _out3 = _in3 + _in4;
+ *               _out4 = _in3 - _in4;
+ *               _out5 = _in2 - _in5;
+ *               _out6 = _in1 - _in6;
+ *               _out7 = _in0 - _in7;
+ */
+.macro LSX_BUTTERFLY_8_B _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.b    \_out0,    \_in0,    \_in7
+    vadd.b    \_out1,    \_in1,    \_in6
+    vadd.b    \_out2,    \_in2,    \_in5
+    vadd.b    \_out3,    \_in3,    \_in4
+    vsub.b    \_out4,    \_in3,    \_in4
+    vsub.b    \_out5,    \_in2,    \_in5
+    vsub.b    \_out6,    \_in1,    \_in6
+    vsub.b    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_H _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.h    \_out0,    \_in0,    \_in7
+    vadd.h    \_out1,    \_in1,    \_in6
+    vadd.h    \_out2,    \_in2,    \_in5
+    vadd.h    \_out3,    \_in3,    \_in4
+    vsub.h    \_out4,    \_in3,    \_in4
+    vsub.h    \_out5,    \_in2,    \_in5
+    vsub.h    \_out6,    \_in1,    \_in6
+    vsub.h    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_W _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.w    \_out0,    \_in0,    \_in7
+    vadd.w    \_out1,    \_in1,    \_in6
+    vadd.w    \_out2,    \_in2,    \_in5
+    vadd.w    \_out3,    \_in3,    \_in4
+    vsub.w    \_out4,    \_in3,    \_in4
+    vsub.w    \_out5,    \_in2,    \_in5
+    vsub.w    \_out6,    \_in1,    \_in6
+    vsub.w    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LSX_BUTTERFLY_8_D _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    vadd.d    \_out0,    \_in0,    \_in7
+    vadd.d    \_out1,    \_in1,    \_in6
+    vadd.d    \_out2,    \_in2,    \_in5
+    vadd.d    \_out3,    \_in3,    \_in4
+    vsub.d    \_out4,    \_in3,    \_in4
+    vsub.d    \_out5,    \_in2,    \_in5
+    vsub.d    \_out6,    \_in1,    \_in6
+    vsub.d    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_B _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.b    \_out0,    \_in0,    \_in7
+    xvadd.b    \_out1,    \_in1,    \_in6
+    xvadd.b    \_out2,    \_in2,    \_in5
+    xvadd.b    \_out3,    \_in3,    \_in4
+    xvsub.b    \_out4,    \_in3,    \_in4
+    xvsub.b    \_out5,    \_in2,    \_in5
+    xvsub.b    \_out6,    \_in1,    \_in6
+    xvsub.b    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_H _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.h    \_out0,    \_in0,    \_in7
+    xvadd.h    \_out1,    \_in1,    \_in6
+    xvadd.h    \_out2,    \_in2,    \_in5
+    xvadd.h    \_out3,    \_in3,    \_in4
+    xvsub.h    \_out4,    \_in3,    \_in4
+    xvsub.h    \_out5,    \_in2,    \_in5
+    xvsub.h    \_out6,    \_in1,    \_in6
+    xvsub.h    \_out7,    \_in0,    \_in7
+.endm
+
+.macro LASX_BUTTERFLY_8_W _in0,  _in1,  _in2,  _in3,  _in4,  _in5,  _in6,  _in7, \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+    xvadd.w    \_out0,    \_in0,    \_in7
+    xvadd.w    \_out1,    \_in1,    \_in6
+    xvadd.w    \_out2,    \_in2,    \_in5
+    xvadd.w    \_out3,    \_in3,    \_in4
+    xvsub.w    \_out4,    \_in3,    \_in4
+    xvsub.w    \_out5,    \_in2,    \_in5
+    xvsub.w    \_out6,    \_in1,    \_in6
+    xvsub.w    \_out7,    \_in0,    \_in7
+.endm
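+
+/*
+ * Usage sketch (illustrative only): the 8-input butterfly pairs the rows as
+ * (0,7), (1,6), (2,5), (3,4); with halfword data, e.g.
+ *
+ *     LSX_BUTTERFLY_8_H  vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,
+ *                        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
+ *
+ * (single invocation line, wrapped here for readability) produces
+ * vr8 = vr0+vr7 through vr15 = vr0-vr7, i.e. the first stage of an 8-point
+ * transform such as the 8x8 H.264 IDCT.
+ */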