[FFmpeg-devel,v2,2/7] avcodec: [loongarch] Optimize h264_chroma_mc with LASX.

Message ID 20211214133316.8978-3-chenhao@loongson.cn
State Superseded
Headers show
Series [FFmpeg-devel,v2,1/7] avutil: [loongarch] Add support for loongarch SIMD.

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Hao Chen Dec. 14, 2021, 1:33 p.m. UTC
From: Shiyou Yin <yinshiyou-hf@loongson.cn>

./ffmpeg -i ../1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 170
after:  183

Change-Id: I42ff23cc2dc7c32bd1b7e4274da9d9ec87065f20
---
 libavcodec/h264chroma.c                       |    2 +
 libavcodec/h264chroma.h                       |    1 +
 libavcodec/loongarch/Makefile                 |    2 +
 .../loongarch/h264chroma_init_loongarch.c     |   37 +
 libavcodec/loongarch/h264chroma_lasx.c        | 1280 +++++++++++
 libavcodec/loongarch/h264chroma_lasx.h        |   36 +
 libavutil/loongarch/loongson_intrinsics.h     | 1877 +++++++++++++++++
 7 files changed, 3235 insertions(+)
 create mode 100644 libavcodec/loongarch/Makefile
 create mode 100644 libavcodec/loongarch/h264chroma_init_loongarch.c
 create mode 100644 libavcodec/loongarch/h264chroma_lasx.c
 create mode 100644 libavcodec/loongarch/h264chroma_lasx.h
 create mode 100644 libavutil/loongarch/loongson_intrinsics.h

Comments

Michael Niedermayer Dec. 14, 2021, 4:26 p.m. UTC | #1
On Tue, Dec 14, 2021 at 09:33:11PM +0800, Hao Chen wrote:
> From: Shiyou Yin <yinshiyou-hf@loongson.cn>
> 
> ./ffmpeg -i ../1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
> before:170
> after :183
> 
> Change-Id: I42ff23cc2dc7c32bd1b7e4274da9d9ec87065f20

There's something odd with the encoding of this mail

  I     1 <no description> [text/plain, 8bit, y, 159K]
  I     2 <no description> [text/plain, 7bit, us-ascii, 0,2K]

and git seems to dislike it
  
Applying: libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
Press any key to continue...
error: cannot convert from y to UTF-8
fatal: could not parse patch


[...]
Shiyou Yin Dec. 15, 2021, 11:12 a.m. UTC | #2
> On Dec. 15, 2021, at 12:26 AM, Michael Niedermayer <michael@niedermayer.cc> wrote:
> 
> On Tue, Dec 14, 2021 at 09:33:11PM +0800, Hao Chen wrote:
>> From: Shiyou Yin <yinshiyou-hf@loongson.cn>
>> 
>> ./ffmpeg -i ../1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
>> before:170
>> after :183
>> 
>> Change-Id: I42ff23cc2dc7c32bd1b7e4274da9d9ec87065f20
> 
> There's something odd with the encoding of this mail
> 
>  I     1 <no description> [text/plain, 8bit, y, 159K]
>  I     2 <no description> [text/plain, 7bit, us-ascii, 0,2K]
> 
> and git seems to dislike it
> 
> Applying: libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
> Press any key to continue...
> error: cannot convert from y to UTF-8
> fatal: could not parse patch
> 

Thank you for your reply. My colleague has uploaded a v3.
This patch set optimizes H.264 decoding on LoongArch;
patches for other formats will be uploaded in the next few days.
Hope it's not too late for 5.0.

Patch

diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
index c2f1f30f5a..0ae6c793e1 100644
--- a/libavcodec/h264chroma.c
+++ b/libavcodec/h264chroma.c
@@ -56,4 +56,6 @@  av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
         ff_h264chroma_init_x86(c, bit_depth);
     if (ARCH_MIPS)
         ff_h264chroma_init_mips(c, bit_depth);
+    if (ARCH_LOONGARCH64)
+        ff_h264chroma_init_loongarch(c, bit_depth);
 }
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
index 5c89fd12df..3259b4935f 100644
--- a/libavcodec/h264chroma.h
+++ b/libavcodec/h264chroma.h
@@ -36,5 +36,6 @@  void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
 void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
+void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth);
 
 #endif /* AVCODEC_H264CHROMA_H */
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
new file mode 100644
index 0000000000..f8fb54c925
--- /dev/null
+++ b/libavcodec/loongarch/Makefile
@@ -0,0 +1,2 @@ 
+OBJS-$(CONFIG_H264CHROMA)             += loongarch/h264chroma_init_loongarch.o
+LASX-OBJS-$(CONFIG_H264CHROMA)        += loongarch/h264chroma_lasx.o
diff --git a/libavcodec/loongarch/h264chroma_init_loongarch.c b/libavcodec/loongarch/h264chroma_init_loongarch.c
new file mode 100644
index 0000000000..0ca24ecc47
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma_init_loongarch.c
@@ -0,0 +1,37 @@ 
+/*
+ * Copyright (c) 2020 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_lasx.h"
+#include "libavutil/attributes.h"
+#include "libavutil/loongarch/cpu.h"
+#include "libavcodec/h264chroma.h"
+
+av_cold void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (have_lasx(cpu_flags)) {
+        if (bit_depth <= 8) {
+            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lasx;
+            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lasx;
+            c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lasx;
+        }
+    }
+}
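
For context, this follows FFmpeg's usual runtime-dispatch pattern: ff_h264chroma_init() fills a table of function pointers, and each per-arch init overrides the entries the running CPU supports, so the LASX kernels are only ever reached through that table. A minimal usage sketch (variable names illustrative, not from the patch):

    /* Sketch: how the dispatch table set up above is consumed. */
    H264ChromaContext ctx;
    ff_h264chroma_init(&ctx, 8);   /* bit_depth <= 8 selects the LASX entries
                                      when have_lasx(av_get_cpu_flags()) */
    /* tab[0] serves 8-pixel-wide blocks (mc8), tab[1] 4-pixel-wide (mc4). */
    ctx.put_h264_chroma_pixels_tab[0](dst, src, stride, height, x, y);
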
diff --git a/libavcodec/loongarch/h264chroma_lasx.c b/libavcodec/loongarch/h264chroma_lasx.c
new file mode 100644
index 0000000000..824a78dfc8
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma_lasx.c
@@ -0,0 +1,1280 @@ 
+/*
+ * Loongson LASX optimized h264chroma
+ *
+ * Copyright (c) 2020 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264chroma_lasx.h"
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+static const uint8_t chroma_mask_arr[64] = {
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
+};
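
The mask pairs each pixel with its right neighbour (0,1, 1,2, ..., 7,8; duplicated for the two 128-bit lanes), and the rows at offset 32, used by the 4-wide kernels, additionally pick bytes 16..20 from a second source register of xvshuf_b. Everything below vectorizes the standard H.264 chroma interpolation; a scalar reference, not part of the patch, for orientation:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reference: out = ((8-x)(8-y)A + x(8-y)B + (8-x)yC + xyD + 32) >> 6,
     * where A..D are the four pixels around the fractional position (x, y). */
    static void chroma_mc_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
                              int w, int h, int x, int y)
    {
        for (int j = 0; j < h; j++) {
            for (int i = 0; i < w; i++) {
                int a = src[i],          b = src[i + 1];
                int c = src[i + stride], d = src[i + stride + 1];
                dst[i] = ((8 - x) * (8 - y) * a + x * (8 - y) * b +
                          (8 - x) * y * c + x * y * d + 32) >> 6;
            }
            src += stride;
            dst += stride;
        }
    }
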
+
+static av_always_inline void avc_chroma_hv_8x4_lasx(uint8_t *src, uint8_t *dst,
+                             ptrdiff_t stride, uint32_t coef_hor0,
+                             uint32_t coef_hor1, uint32_t coef_ver0,
+                             uint32_t coef_ver1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride_2x << 1;
+    __m256i src0, src1, src2, src3, src4, out;
+    __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
+    __m256i mask;
+    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
+    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
+    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
+    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
+    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
+              src1, src2, src3, src4);
+    DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
+    src0 = __lasx_xvshuf_b(src0, src0, mask);
+    DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
+    res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
+    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
+    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
+    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
+    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
+    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
+    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
+    out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
+    __lasx_xvstelm_d(out, dst, 0, 0);
+    __lasx_xvstelm_d(out, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
+}
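
A note on the helpers: the DUP2_ARG2/DUP2_ARG3/DUP4_ARG2/DUP4_ARG3 macros come from loongson_intrinsics.h (added by this patch) and simply replicate one intrinsic over several argument/output tuples, roughly:

    /* Approximate shape of the DUPx helpers (see loongson_intrinsics.h): */
    #define DUP2_ARG2(fn, a0, a1, b0, b1, out0, out1) \
        do { out0 = fn(a0, a1); out1 = fn(b0, b1); } while (0)

so e.g. DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0) is just two xvld loads. The hv kernels above then run the horizontal 2-tap pass as one xvdp2_h_bu dot product against the xvilvl_b-interleaved coefficients, and the vertical pass as xvmul_h/xvmadd_h.
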
+
+static av_always_inline void avc_chroma_hv_8x8_lasx(uint8_t *src, uint8_t *dst,
+                             ptrdiff_t stride, uint32_t coef_hor0,
+                             uint32_t coef_hor1, uint32_t coef_ver0,
+                             uint32_t coef_ver1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m256i out0, out1;
+    __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
+    __m256i mask;
+    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
+    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
+    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
+    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
+    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
+              src1, src2, src3, src4);
+    src += stride_4x;
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
+              src5, src6, src7, src8);
+    DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
+              src8, src7, 0x20, src1, src3, src5, src7);
+    src0 = __lasx_xvshuf_b(src0, src0, mask);
+    DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
+              src7, mask, src1, src3, src5, src7);
+    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
+              coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
+    res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
+    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
+    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
+    res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
+    res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
+    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
+    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
+    res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
+    res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
+    DUP4_ARG3(__lasx_xvmadd_h, res_vt0, res_hz0, coeff_vt_vec1, res_vt1, res_hz1, coeff_vt_vec1,
+              res_vt2, res_hz2, coeff_vt_vec1, res_vt3, res_hz3, coeff_vt_vec1,
+              res_vt0, res_vt1, res_vt2, res_vt3);
+    DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6, out0, out1);
+    __lasx_xvstelm_d(out0, dst, 0, 0);
+    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
+    dst += stride_4x;
+    __lasx_xvstelm_d(out1, dst, 0, 0);
+    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void avc_chroma_hz_8x4_lasx(uint8_t *src, uint8_t *dst,
+                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    __m256i src0, src1, src2, src3, out;
+    __m256i res0, res1;
+    __m256i mask;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
+    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src1, src2);
+    src3 = __lasx_xvldx(src, stride_3x);
+    DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
+    DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
+    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
+    __lasx_xvstelm_d(out, dst, 0, 0);
+    __lasx_xvstelm_d(out, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
+}
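
The __lasx_xvslli_b(coeff_vec, 3) seen in the 1-D (horizontal- or vertical-only) kernels is a rounding trick: the shared store path uses __lasx_xvssrarni_bu_h(..., 6), a saturating (v + 32) >> 6, while a 1-D chroma filter should round as (v + 4) >> 3. Pre-scaling the coefficients by 8 makes the two identical; a small check of the identity (not part of the patch):

    /* With c0 + c1 == 8 and v = c0*a + c1*b (1-D filter), the identity
     * (v + 4) >> 3 == (8*v + 32) >> 6 holds for all v >= 0, so shifting the
     * coefficients left by 3 lets the 1-D kernels share the >>6 store path. */
    static int round_equiv(int c0, int c1, int a, int b)
    {
        int v = c0 * a + c1 * b;
        return ((v + 4) >> 3) == ((8 * v + 32) >> 6);   /* always 1 */
    }
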
+
+static av_always_inline void avc_chroma_hz_8x8_lasx(uint8_t *src, uint8_t *dst,
+                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m256i out0, out1;
+    __m256i res0, res1, res2, res3;
+    __m256i mask;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
+              src1, src2, src3, src4);
+    src += stride_4x;
+    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src5, src6);
+    src7 = __lasx_xvldx(src, stride_3x);
+    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
+              src7, src6, 0x20, src0, src2, src4, src6);
+    DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4, mask,
+              src6, src6, mask, src0, src2, src4, src6);
+    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
+              coeff_vec, res0, res1, res2, res3);
+    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
+    __lasx_xvstelm_d(out0, dst, 0, 0);
+    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
+    dst += stride_4x;
+    __lasx_xvstelm_d(out1, dst, 0, 0);
+    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void avc_chroma_hz_nonmult_lasx(uint8_t *src,
+                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
+                             uint32_t coeff1, int32_t height)
+{
+    uint32_t row;
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i src0, src1, src2, src3, out;
+    __m256i res0, res1;
+    __m256i mask;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    mask = __lasx_xvld(chroma_mask_arr, 0);
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+
+    for (row = height >> 2; row--;) {
+        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+                  src0, src1, src2, src3);
+        src += stride_4x;
+        DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
+        DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
+        DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
+        out = __lasx_xvssrarni_bu_h(res1, res0, 6);
+        __lasx_xvstelm_d(out, dst, 0, 0);
+        __lasx_xvstelm_d(out, dst + stride, 0, 2);
+        __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
+        __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
+        dst += stride_4x;
+    }
+
+    if ((height & 3)) {
+        src0 = __lasx_xvld(src, 0);
+        src1 = __lasx_xvldx(src, stride);
+        src1 = __lasx_xvpermi_q(src1, src0, 0x20);
+        src0 = __lasx_xvshuf_b(src1, src1, mask);
+        res0 = __lasx_xvdp2_h_bu(src0, coeff_vec);
+        out  = __lasx_xvssrarni_bu_h(res0, res0, 6);
+        __lasx_xvstelm_d(out, dst, 0, 0);
+        dst += stride;
+        __lasx_xvstelm_d(out, dst, 0, 2);
+    }
+}
+
+static av_always_inline void avc_chroma_vt_8x4_lasx(uint8_t *src, uint8_t *dst,
+                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    __m256i src0, src1, src2, src3, src4, out;
+    __m256i res0, res1;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    src0 = __lasx_xvld(src, 0);
+    src += stride;
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src1, src2, src3, src4);
+    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
+              src4, src3, 0x20, src0, src1, src2, src3);
+    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
+    out  = __lasx_xvssrarni_bu_h(res1, res0, 6);
+    __lasx_xvstelm_d(out, dst, 0, 0);
+    __lasx_xvstelm_d(out, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
+}
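
The vertical-only kernels need no shuffle mask: __lasx_xvilvl_b interleaves row n with row n+1 byte by byte, so the same interleaved-coefficient dot product yields c0*row[n] + c1*row[n+1] per pixel. Scalar equivalent of one output pixel (illustrative, with the <<3/>>6 rounding noted above):

    #include <stddef.h>
    #include <stdint.h>

    /* One output pixel of the vertical 2-tap filter, c0 + c1 == 8. */
    static inline uint8_t vt_px(const uint8_t *src, ptrdiff_t stride,
                                int c0, int c1)
    {
        return (uint8_t)((c0 * src[0] + c1 * src[stride] + 4) >> 3);
    }
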
+
+static av_always_inline void avc_chroma_vt_8x8_lasx(uint8_t *src, uint8_t *dst,
+                             ptrdiff_t stride, uint32_t coeff0, uint32_t coeff1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m256i out0, out1;
+    __m256i res0, res1, res2, res3;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    src0 = __lasx_xvld(src, 0);
+    src += stride;
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src1, src2, src3, src4);
+    src += stride_4x;
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src5, src6, src7, src8);
+    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
+              src4, src3, 0x20, src0, src1, src2, src3);
+    DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
+              src8, src7, 0x20, src4, src5, src6, src7);
+    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
+              src0, src2, src4, src6);
+    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec,
+              src6, coeff_vec, res0, res1, res2, res3);
+    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
+    __lasx_xvstelm_d(out0, dst, 0, 0);
+    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
+    dst += stride_4x;
+    __lasx_xvstelm_d(out1, dst, 0, 0);
+    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void copy_width8x8_lasx(uint8_t *src, uint8_t *dst,
+                             ptrdiff_t stride)
+{
+    uint64_t tmp[8];
+    ptrdiff_t stride_2, stride_3, stride_4;
+    __asm__ volatile (
+        "slli.d   %[stride_2],     %[stride],     1             \n\t"
+        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
+        "slli.d   %[stride_4],     %[stride_2],   1             \n\t"
+        "ld.d     %[tmp0],         %[src],        0x0           \n\t"
+        "ldx.d    %[tmp1],         %[src],        %[stride]     \n\t"
+        "ldx.d    %[tmp2],         %[src],        %[stride_2]   \n\t"
+        "ldx.d    %[tmp3],         %[src],        %[stride_3]   \n\t"
+        "add.d    %[src],          %[src],        %[stride_4]   \n\t"
+        "ld.d     %[tmp4],         %[src],        0x0           \n\t"
+        "ldx.d    %[tmp5],         %[src],        %[stride]     \n\t"
+        "ldx.d    %[tmp6],         %[src],        %[stride_2]   \n\t"
+        "ldx.d    %[tmp7],         %[src],        %[stride_3]   \n\t"
+
+        "st.d     %[tmp0],         %[dst],        0x0           \n\t"
+        "stx.d    %[tmp1],         %[dst],        %[stride]     \n\t"
+        "stx.d    %[tmp2],         %[dst],        %[stride_2]   \n\t"
+        "stx.d    %[tmp3],         %[dst],        %[stride_3]   \n\t"
+        "add.d    %[dst],          %[dst],        %[stride_4]   \n\t"
+        "st.d     %[tmp4],         %[dst],        0x0           \n\t"
+        "stx.d    %[tmp5],         %[dst],        %[stride]     \n\t"
+        "stx.d    %[tmp6],         %[dst],        %[stride_2]   \n\t"
+        "stx.d    %[tmp7],         %[dst],        %[stride_3]   \n\t"
+        : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
+          [tmp4]"=&r"(tmp[4]),        [tmp5]"=&r"(tmp[5]),
+          [tmp6]"=&r"(tmp[6]),        [tmp7]"=&r"(tmp[7]),
+          [dst]"+&r"(dst),            [src]"+&r"(src),
+          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
+          [stride_4]"=&r"(stride_4)
+        : [stride]"r"(stride)
+        : "memory"
+    );
+}
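
Despite the file name, copy_width8x8_lasx (and copy_width8x4_lasx below) uses no LASX at all: it is plain inline asm doing 64-bit general-purpose loads and stores (ld.d/ldx.d, st.d/stx.d) for eight rows of eight bytes. A portable C equivalent, for reference:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Portable equivalent of copy_width8x8_lasx: copy 8 rows of 8 bytes. */
    static void copy_width8x8_c(const uint8_t *src, uint8_t *dst,
                                ptrdiff_t stride)
    {
        for (int i = 0; i < 8; i++) {
            memcpy(dst, src, 8);
            src += stride;
            dst += stride;
        }
    }
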
+
+static av_always_inline void copy_width8x4_lasx(uint8_t *src, uint8_t *dst,
+                             ptrdiff_t stride)
+{
+    uint64_t tmp[4];
+    ptrdiff_t stride_2, stride_3;
+    __asm__ volatile (
+        "slli.d   %[stride_2],     %[stride],     1             \n\t"
+        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
+        "ld.d     %[tmp0],         %[src],        0x0           \n\t"
+        "ldx.d    %[tmp1],         %[src],        %[stride]     \n\t"
+        "ldx.d    %[tmp2],         %[src],        %[stride_2]   \n\t"
+        "ldx.d    %[tmp3],         %[src],        %[stride_3]   \n\t"
+
+        "st.d     %[tmp0],         %[dst],        0x0           \n\t"
+        "stx.d    %[tmp1],         %[dst],        %[stride]     \n\t"
+        "stx.d    %[tmp2],         %[dst],        %[stride_2]   \n\t"
+        "stx.d    %[tmp3],         %[dst],        %[stride_3]   \n\t"
+        : [tmp0]"=&r"(tmp[0]),        [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),        [tmp3]"=&r"(tmp[3]),
+          [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3)
+        : [stride]"r"(stride), [dst]"r"(dst), [src]"r"(src)
+        : "memory"
+    );
+}
+
+static void avc_chroma_hv_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1,
+                                  int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hv_8x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                               coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_8x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                               coef_ver1);
+    }
+}
+
+static void avc_chroma_hv_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                   uint32_t coef_hor0, uint32_t coef_hor1,
+                                   uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    ptrdiff_t stride_2 = stride << 1;
+    __m256i src0, src1, src2;
+    __m256i res_hz, res_vt;
+    __m256i mask;
+    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
+    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
+    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
+    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
+    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
+    __m256i coeff_vt_vec  = __lasx_xvpermi_q(coeff_vt_vec1, coeff_vt_vec0, 0x02);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
+    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
+    DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src0, src1);
+    src0 = __lasx_xvpermi_q(src0, src1, 0x02);
+    res_hz = __lasx_xvdp2_h_bu(src0, coeff_hz_vec);
+    res_vt = __lasx_xvmul_h(res_hz, coeff_vt_vec);
+    res_hz = __lasx_xvpermi_q(res_hz, res_vt, 0x01);
+    res_vt = __lasx_xvadd_h(res_hz, res_vt);
+    res_vt = __lasx_xvssrarni_bu_h(res_vt, res_vt, 6);
+    __lasx_xvstelm_w(res_vt, dst, 0, 0);
+    __lasx_xvstelm_w(res_vt, dst + stride, 0, 1);
+}
+
+static void avc_chroma_hv_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                   uint32_t coef_hor0, uint32_t coef_hor1,
+                                   uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    ptrdiff_t stride_2 = stride << 1;
+    ptrdiff_t stride_3 = stride_2 + stride;
+    ptrdiff_t stride_4 = stride_2 << 1;
+    __m256i src0, src1, src2, src3, src4;
+    __m256i res_hz0, res_hz1, res_vt0, res_vt1;
+    __m256i mask;
+    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
+    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
+    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
+    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
+    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
+              src, stride_4, src1, src2, src3, src4);
+    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
+              src4, src3, mask, src0, src1, src2, src3);
+    DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src0, src1);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
+    DUP2_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
+    res_hz0 = __lasx_xvadd_h(res_vt0, res_vt1);
+    res_hz0 = __lasx_xvssrarni_bu_h(res_hz0, res_hz0, 6);
+    __lasx_xvstelm_w(res_hz0, dst, 0, 0);
+    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
+    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
+    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
+}
+
+static void avc_chroma_hv_4x8_lasx(uint8_t *src, uint8_t * dst, ptrdiff_t stride,
+                                   uint32_t coef_hor0, uint32_t coef_hor1,
+                                   uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    ptrdiff_t stride_2 = stride << 1;
+    ptrdiff_t stride_3 = stride_2 + stride;
+    ptrdiff_t stride_4 = stride_2 << 1;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m256i res_hz0, res_hz1, res_hz2, res_hz3;
+    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
+    __m256i mask;
+    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
+    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
+    __m256i coeff_hz_vec  = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
+    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
+    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
+              src, stride_4, src1, src2, src3, src4);
+    src += stride_4;
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
+              src, stride_4, src5, src6, src7, src8);
+    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src2, src1, mask, src3, src2, mask,
+              src4, src3, mask, src0, src1, src2, src3);
+    DUP4_ARG3(__lasx_xvshuf_b, src5, src4, mask, src6, src5, mask, src7, src6, mask,
+              src8, src7, mask, src4, src5, src6, src7);
+    DUP4_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src1, src3, 0x02, src4, src6, 0x02,
+              src5, src7, 0x02, src0, src1, src4, src5);
+    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src4, coeff_hz_vec,
+              src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
+    DUP4_ARG2(__lasx_xvmul_h, res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+              coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    DUP2_ARG2(__lasx_xvadd_h, res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2);
+    res_hz0 = __lasx_xvssrarni_bu_h(res_vt2, res_vt0, 6);
+    __lasx_xvstelm_w(res_hz0, dst, 0, 0);
+    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 1);
+    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 4);
+    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 5);
+    dst += stride_4;
+    __lasx_xvstelm_w(res_hz0, dst, 0, 2);
+    __lasx_xvstelm_w(res_hz0, dst + stride, 0, 3);
+    __lasx_xvstelm_w(res_hz0, dst + stride_2, 0, 6);
+    __lasx_xvstelm_w(res_hz0, dst + stride_3, 0, 7);
+}
+
+static void avc_chroma_hv_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1,
+                                  int32_t height)
+{
+    if (8 == height) {
+        avc_chroma_hv_4x8_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                               coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_4x4_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                               coef_ver1);
+    } else if (2 == height) {
+        avc_chroma_hv_4x2_lasx(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                               coef_ver1);
+    }
+}
+
+static void avc_chroma_hz_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                   uint32_t coeff0, uint32_t coeff1)
+{
+    __m256i src0, src1;
+    __m256i res, mask;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
+    src1 = __lasx_xvldx(src, stride);
+    src0 = __lasx_xvshuf_b(src1, src0, mask);
+    res = __lasx_xvdp2_h_bu(src0, coeff_vec);
+    res = __lasx_xvslli_h(res, 3);
+    res = __lasx_xvssrarni_bu_h(res, res, 6);
+    __lasx_xvstelm_w(res, dst, 0, 0);
+    __lasx_xvstelm_w(res, dst + stride, 0, 1);
+}
+
+static void avc_chroma_hz_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                   uint32_t coeff0, uint32_t coeff1)
+{
+    ptrdiff_t stride_2 = stride << 1;
+    ptrdiff_t stride_3 = stride_2 + stride;
+    __m256i src0, src1, src2, src3;
+    __m256i res, mask;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
+    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src1, src2);
+    src3 = __lasx_xvldx(src, stride_3);
+    DUP2_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src0, src2);
+    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
+    res = __lasx_xvdp2_h_bu(src0, coeff_vec);
+    res = __lasx_xvslli_h(res, 3);
+    res = __lasx_xvssrarni_bu_h(res, res, 6);
+    __lasx_xvstelm_w(res, dst, 0, 0);
+    __lasx_xvstelm_w(res, dst + stride, 0, 1);
+    __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
+    __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
+}
+
+static void avc_chroma_hz_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                   uint32_t coeff0, uint32_t coeff1)
+{
+    ptrdiff_t stride_2 = stride << 1;
+    ptrdiff_t stride_3 = stride_2 + stride;
+    ptrdiff_t stride_4 = stride_2 << 1;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m256i res0, res1, mask;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 32, src, 0, mask, src0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
+              src, stride_4, src1, src2, src3, src4);
+    src += stride_4;
+    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride_2, src5, src6);
+    src7 = __lasx_xvldx(src, stride_3);
+    DUP4_ARG3(__lasx_xvshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
+              src7, src6, mask, src0, src2, src4, src6);
+    DUP2_ARG3(__lasx_xvpermi_q, src0, src2, 0x02, src4, src6, 0x02, src0, src4);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src4, coeff_vec, res0, res1);
+    res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
+    __lasx_xvstelm_w(res0, dst, 0, 0);
+    __lasx_xvstelm_w(res0, dst + stride, 0, 1);
+    __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
+    __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
+    dst += stride_4;
+    __lasx_xvstelm_w(res0, dst, 0, 2);
+    __lasx_xvstelm_w(res0, dst + stride, 0, 3);
+    __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
+    __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
+}
+
+static void avc_chroma_hz_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                  uint32_t coeff0, uint32_t coeff1,
+                                  int32_t height)
+{
+    if (8 == height) {
+        avc_chroma_hz_4x8_lasx(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_hz_4x4_lasx(src, dst, stride, coeff0, coeff1);
+    } else if (2 == height) {
+        avc_chroma_hz_4x2_lasx(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_hz_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                  uint32_t coeff0, uint32_t coeff1,
+                                  int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hz_8x4_lasx(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_hz_8x8_lasx(src, dst, stride, coeff0, coeff1);
+    } else {
+        avc_chroma_hz_nonmult_lasx(src, dst, stride, coeff0, coeff1, height);
+    }
+}
+
+static void avc_chroma_vt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                   uint32_t coeff0, uint32_t coeff1)
+{
+    __m256i src0, src1, src2;
+    __m256i tmp0, tmp1;
+    __m256i res;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    src0 = __lasx_xvld(src, 0);
+    DUP2_ARG2(__lasx_xvldx, src, stride, src, stride << 1, src1, src2);
+    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, tmp0, tmp1);
+    tmp0 = __lasx_xvilvl_d(tmp1, tmp0);
+    res  = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
+    res  = __lasx_xvslli_h(res, 3);
+    res  = __lasx_xvssrarni_bu_h(res, res, 6);
+    __lasx_xvstelm_w(res, dst, 0, 0);
+    __lasx_xvstelm_w(res, dst + stride, 0, 1);
+}
+
+static void avc_chroma_vt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                   uint32_t coeff0, uint32_t coeff1)
+{
+    ptrdiff_t stride_2 = stride << 1;
+    ptrdiff_t stride_3 = stride_2 + stride;
+    ptrdiff_t stride_4 = stride_2 << 1;
+    __m256i src0, src1, src2, src3, src4;
+    __m256i tmp0, tmp1, tmp2, tmp3;
+    __m256i res;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    src0 = __lasx_xvld(src, 0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
+              src, stride_4, src1, src2, src3, src4);
+    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
+    tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
+    res = __lasx_xvdp2_h_bu(tmp0, coeff_vec);
+    res = __lasx_xvslli_h(res, 3);
+    res = __lasx_xvssrarni_bu_h(res, res, 6);
+    __lasx_xvstelm_w(res, dst, 0, 0);
+    __lasx_xvstelm_w(res, dst + stride, 0, 1);
+    __lasx_xvstelm_w(res, dst + stride_2, 0, 4);
+    __lasx_xvstelm_w(res, dst + stride_3, 0, 5);
+}
+
+static void avc_chroma_vt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                   uint32_t coeff0, uint32_t coeff1)
+{
+    ptrdiff_t stride_2 = stride << 1;
+    ptrdiff_t stride_3 = stride_2 + stride;
+    ptrdiff_t stride_4 = stride_2 << 1;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    __m256i res0, res1;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec  = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    src0 = __lasx_xvld(src, 0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
+              src, stride_4, src1, src2, src3, src4);
+    src += stride_4;
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2, src, stride_3,
+              src, stride_4, src5, src6, src7, src8);
+    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lasx_xvilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
+              tmp0, tmp2, tmp4, tmp6);
+    tmp0 = __lasx_xvpermi_q(tmp0, tmp2, 0x02);
+    tmp4 = __lasx_xvpermi_q(tmp4, tmp6, 0x02);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, coeff_vec, tmp4, coeff_vec, res0, res1);
+    res0 = __lasx_xvssrarni_bu_h(res1, res0, 6);
+    __lasx_xvstelm_w(res0, dst, 0, 0);
+    __lasx_xvstelm_w(res0, dst + stride, 0, 1);
+    __lasx_xvstelm_w(res0, dst + stride_2, 0, 4);
+    __lasx_xvstelm_w(res0, dst + stride_3, 0, 5);
+    dst += stride_4;
+    __lasx_xvstelm_w(res0, dst, 0, 2);
+    __lasx_xvstelm_w(res0, dst + stride, 0, 3);
+    __lasx_xvstelm_w(res0, dst + stride_2, 0, 6);
+    __lasx_xvstelm_w(res0, dst + stride_3, 0, 7);
+}
+
+static void avc_chroma_vt_4w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                  uint32_t coeff0, uint32_t coeff1,
+                                  int32_t height)
+{
+    if (8 == height) {
+        avc_chroma_vt_4x8_lasx(src, dst, stride, coeff0, coeff1);
+    } else if (4 == height) {
+        avc_chroma_vt_4x4_lasx(src, dst, stride, coeff0, coeff1);
+    } else if (2 == height) {
+        avc_chroma_vt_4x2_lasx(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_8w_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                                  uint32_t coeff0, uint32_t coeff1,
+                                  int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_vt_8x4_lasx(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_vt_8x8_lasx(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void copy_width4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                             int32_t height)
+{
+    uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+
+    if (8 == height) {
+        ptrdiff_t stride_2, stride_3, stride_4;
+
+        __asm__ volatile (
+        "slli.d   %[stride_2],     %[stride],     1             \n\t"
+        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
+        "slli.d   %[stride_4],     %[stride_2],   1             \n\t"
+        "ld.wu    %[tp0],          %[src],        0             \n\t"
+        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
+        "ldx.wu   %[tp2],          %[src],        %[stride_2]   \n\t"
+        "ldx.wu   %[tp3],          %[src],        %[stride_3]   \n\t"
+        "add.d    %[src],          %[src],        %[stride_4]   \n\t"
+        "ld.wu    %[tp4],          %[src],        0             \n\t"
+        "ldx.wu   %[tp5],          %[src],        %[stride]     \n\t"
+        "ldx.wu   %[tp6],          %[src],        %[stride_2]   \n\t"
+        "ldx.wu   %[tp7],          %[src],        %[stride_3]   \n\t"
+        "st.w     %[tp0],          %[dst],        0             \n\t"
+        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
+        "stx.w    %[tp2],          %[dst],        %[stride_2]   \n\t"
+        "stx.w    %[tp3],          %[dst],        %[stride_3]   \n\t"
+        "add.d    %[dst],          %[dst],        %[stride_4]   \n\t"
+        "st.w     %[tp4],          %[dst],        0             \n\t"
+        "stx.w    %[tp5],          %[dst],        %[stride]     \n\t"
+        "stx.w    %[tp6],          %[dst],        %[stride_2]   \n\t"
+        "stx.w    %[tp7],          %[dst],        %[stride_3]   \n\t"
+        : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3), [stride_4]"+&r"(stride_4),
+          [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
+          [tp2]"+&r"(tp2), [tp3]"+&r"(tp3), [tp4]"+&r"(tp4), [tp5]"+&r"(tp5),
+          [tp6]"+&r"(tp6), [tp7]"+&r"(tp7)
+        : [stride]"r"(stride)
+        : "memory"
+        );
+    } else if (4 == height) {
+        ptrdiff_t stride_2, stride_3;
+
+        __asm__ volatile (
+        "slli.d   %[stride_2],     %[stride],     1             \n\t"
+        "add.d    %[stride_3],     %[stride_2],   %[stride]     \n\t"
+        "ld.wu    %[tp0],          %[src],        0             \n\t"
+        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
+        "ldx.wu   %[tp2],          %[src],        %[stride_2]   \n\t"
+        "ldx.wu   %[tp3],          %[src],        %[stride_3]   \n\t"
+        "st.w     %[tp0],          %[dst],        0             \n\t"
+        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
+        "stx.w    %[tp2],          %[dst],        %[stride_2]   \n\t"
+        "stx.w    %[tp3],          %[dst],        %[stride_3]   \n\t"
+        : [stride_2]"+&r"(stride_2), [stride_3]"+&r"(stride_3),
+          [src]"+&r"(src), [dst]"+&r"(dst), [tp0]"+&r"(tp0), [tp1]"+&r"(tp1),
+          [tp2]"+&r"(tp2), [tp3]"+&r"(tp3)
+        : [stride]"r"(stride)
+        : "memory"
+        );
+    } else if (2 == height) {
+        __asm__ volatile (
+        "ld.wu    %[tp0],          %[src],        0             \n\t"
+        "ldx.wu   %[tp1],          %[src],        %[stride]     \n\t"
+        "st.w     %[tp0],          %[dst],        0             \n\t"
+        "stx.w    %[tp1],          %[dst],        %[stride]     \n\t"
+        : [tp0]"+&r"(tp0), [tp1]"+&r"(tp1)
+        : [src]"r"(src), [dst]"r"(dst), [stride]"r"(stride)
+        : "memory"
+        );
+    }
+}
+
+static void copy_width8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                             int32_t height)
+{
+    if (8 == height) {
+        copy_width8x8_lasx(src, dst, stride);
+    } else if (4 == height) {
+        copy_width8x4_lasx(src, dst, stride);
+    }
+}
+
+void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if(x && y) {
+        avc_chroma_hv_4w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_4w_lasx(src, dst, stride, x, (8 - x), height);
+    } else if (y) {
+        avc_chroma_vt_4w_lasx(src, dst, stride, y, (8 - y), height);
+    } else {
+        copy_width4_lasx(src, dst, stride, height);
+    }
+}
+
+void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (!(x || y)) {
+        copy_width8_lasx(src, dst, stride, height);
+    } else if (x && y) {
+        avc_chroma_hv_8w_lasx(src, dst, stride, x, (8 - x), y, (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_8w_lasx(src, dst, stride, x, (8 - x), height);
+    } else {
+        avc_chroma_vt_8w_lasx(src, dst, stride, y, (8 - y), height);
+    }
+}
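
These two are the public entry points wired into the dispatch table. x and y are the fractional parts of the chroma motion vector in eighth-pel units, so (0, 0) degenerates to a copy, a single non-zero component selects a 1-D filter, and both non-zero take the 2-D path. A sketch of how a decoder typically derives them (illustrative, not code from this patch; mx/my are chroma MV components in eighth-pel units):

    /* Illustrative caller: split the chroma MV into integer and
     * fractional eighth-pel parts before invoking the table entry. */
    static void mc_chroma_call(H264ChromaContext *c, uint8_t *dst, uint8_t *src,
                               ptrdiff_t stride, int height, int mx, int my)
    {
        src += (my >> 3) * stride + (mx >> 3);      /* integer-pel offset   */
        c->put_h264_chroma_pixels_tab[0](dst, src, stride, height,
                                         mx & 7, my & 7);   /* x, y < 8     */
    }
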
+
+static av_always_inline void avc_chroma_hv_and_aver_dst_8x4_lasx(uint8_t *src,
+                             uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
+                             uint32_t coef_hor1, uint32_t coef_ver0,
+                             uint32_t coef_ver1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i tp0, tp1, tp2, tp3;
+    __m256i src0, src1, src2, src3, src4, out;
+    __m256i res_hz0, res_hz1, res_hz2, res_vt0, res_vt1;
+    __m256i mask;
+    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
+    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
+    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
+    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
+    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
+              src1, src2, src3, src4);
+    DUP2_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src1, src3);
+    src0 = __lasx_xvshuf_b(src0, src0, mask);
+    DUP2_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src1, src3);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, res_hz0, res_hz1);
+    res_hz2 = __lasx_xvdp2_h_bu(src3, coeff_hz_vec);
+    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
+    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
+    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
+    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
+    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
+    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
+    out = __lasx_xvssrarni_bu_h(res_vt1, res_vt0, 6);
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    out = __lasx_xvavgr_bu(out, tp0);
+    __lasx_xvstelm_d(out, dst, 0, 0);
+    __lasx_xvstelm_d(out, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
+}
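
The *_and_aver_dst_* kernels (this one and those below) implement the avg_ flavor used for bi-prediction: the interpolation is identical, but the result is blended with the bytes already in dst via __lasx_xvavgr_bu, an unsigned rounding average. Per byte:

    #include <stdint.h>

    /* What __lasx_xvavgr_bu computes per byte when averaging into dst. */
    static inline uint8_t avg_round(uint8_t pred, uint8_t cur)
    {
        return (uint8_t)((pred + cur + 1) >> 1);
    }
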
+
+static av_always_inline void avc_chroma_hv_and_aver_dst_8x8_lasx(uint8_t *src,
+                             uint8_t *dst, ptrdiff_t stride, uint32_t coef_hor0,
+                             uint32_t coef_hor1, uint32_t coef_ver0,
+                             uint32_t coef_ver1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m256i out0, out1;
+    __m256i res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    __m256i res_vt0, res_vt1, res_vt2, res_vt3;
+    __m256i mask;
+    __m256i coeff_hz_vec0 = __lasx_xvreplgr2vr_b(coef_hor0);
+    __m256i coeff_hz_vec1 = __lasx_xvreplgr2vr_b(coef_hor1);
+    __m256i coeff_vt_vec0 = __lasx_xvreplgr2vr_h(coef_ver0);
+    __m256i coeff_vt_vec1 = __lasx_xvreplgr2vr_h(coef_ver1);
+    __m256i coeff_hz_vec = __lasx_xvilvl_b(coeff_hz_vec0, coeff_hz_vec1);
+
+    DUP2_ARG2(__lasx_xvld, chroma_mask_arr, 0, src, 0, mask, src0);
+    src += stride;
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src1, src2, src3, src4);
+    src += stride_4x;
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src5, src6, src7, src8);
+    DUP4_ARG3(__lasx_xvpermi_q, src2, src1, 0x20, src4, src3, 0x20, src6, src5, 0x20,
+              src8, src7, 0x20, src1, src3, src5, src7);
+    src0 = __lasx_xvshuf_b(src0, src0, mask);
+    DUP4_ARG3(__lasx_xvshuf_b, src1, src1, mask, src3, src3, mask, src5, src5, mask, src7,
+              src7, mask, src1, src3, src5, src7);
+    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_hz_vec, src1, coeff_hz_vec, src3,
+              coeff_hz_vec, src5, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
+    res_hz4 = __lasx_xvdp2_h_bu(src7, coeff_hz_vec);
+    res_vt0 = __lasx_xvmul_h(res_hz1, coeff_vt_vec0);
+    res_vt1 = __lasx_xvmul_h(res_hz2, coeff_vt_vec0);
+    res_vt2 = __lasx_xvmul_h(res_hz3, coeff_vt_vec0);
+    res_vt3 = __lasx_xvmul_h(res_hz4, coeff_vt_vec0);
+    res_hz0 = __lasx_xvpermi_q(res_hz1, res_hz0, 0x20);
+    res_hz1 = __lasx_xvpermi_q(res_hz1, res_hz2, 0x3);
+    res_hz2 = __lasx_xvpermi_q(res_hz2, res_hz3, 0x3);
+    res_hz3 = __lasx_xvpermi_q(res_hz3, res_hz4, 0x3);
+    res_vt0 = __lasx_xvmadd_h(res_vt0, res_hz0, coeff_vt_vec1);
+    res_vt1 = __lasx_xvmadd_h(res_vt1, res_hz1, coeff_vt_vec1);
+    res_vt2 = __lasx_xvmadd_h(res_vt2, res_hz2, coeff_vt_vec1);
+    res_vt3 = __lasx_xvmadd_h(res_vt3, res_hz3, coeff_vt_vec1);
+    DUP2_ARG3(__lasx_xvssrarni_bu_h, res_vt1, res_vt0, 6, res_vt3, res_vt2, 6,
+              out0, out1);
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    dst += stride_4x;
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    dst -= stride_4x;
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    out0 = __lasx_xvavgr_bu(out0, dst0);
+    out1 = __lasx_xvavgr_bu(out1, dst1);
+    __lasx_xvstelm_d(out0, dst, 0, 0);
+    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
+    dst += stride_4x;
+    __lasx_xvstelm_d(out1, dst, 0, 0);
+    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void avc_chroma_hz_and_aver_dst_8x4_lasx(uint8_t *src,
+                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
+                             uint32_t coeff1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    __m256i tp0, tp1, tp2, tp3;
+    __m256i src0, src1, src2, src3, out;
+    __m256i res0, res1;
+    __m256i mask;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    mask = __lasx_xvld(chroma_mask_arr, 0);
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src0, src1, src2, src3);
+    DUP2_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src0, src2);
+    DUP2_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src0, src2);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
+    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    out = __lasx_xvavgr_bu(out, tp0);
+    __lasx_xvstelm_d(out, dst, 0, 0);
+    __lasx_xvstelm_d(out, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void avc_chroma_hz_and_aver_dst_8x8_lasx(uint8_t *src,
+                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
+                             uint32_t coeff1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+    __m256i out0, out1;
+    __m256i res0, res1, res2, res3;
+    __m256i mask;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    mask = __lasx_xvld(chroma_mask_arr, 0);
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src0, src1, src2, src3);
+    src += stride_4x;
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src4, src5, src6, src7);
+    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src3, src2, 0x20, src5, src4, 0x20,
+              src7, src6, 0x20, src0, src2, src4, src6);
+    DUP4_ARG3(__lasx_xvshuf_b, src0, src0, mask, src2, src2, mask, src4, src4,
+              mask, src6, src6, mask, src0, src2, src4, src6);
+    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
+              coeff_vec, res0, res1, res2, res3);
+    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    dst += stride_4x;
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    dst -= stride_4x;
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    out0 = __lasx_xvavgr_bu(out0, dst0);
+    out1 = __lasx_xvavgr_bu(out1, dst1);
+    __lasx_xvstelm_d(out0, dst, 0, 0);
+    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
+    dst += stride_4x;
+    __lasx_xvstelm_d(out1, dst, 0, 0);
+    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void avc_chroma_vt_and_aver_dst_8x4_lasx(uint8_t *src,
+                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
+                             uint32_t coeff1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i tp0, tp1, tp2, tp3;
+    __m256i src0, src1, src2, src3, src4, out;
+    __m256i res0, res1;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    src0 = __lasx_xvld(src, 0);
+    DUP4_ARG2(__lasx_xvldx, src, stride, src, stride_2x, src, stride_3x, src, stride_4x,
+              src1, src2, src3, src4);
+    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
+              src4, src3, 0x20, src0, src1, src2, src3);
+    DUP2_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src0, src2);
+    DUP2_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, res0, res1);
+    out = __lasx_xvssrarni_bu_h(res1, res0, 6);
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    tp0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    out = __lasx_xvavgr_bu(out, tp0);
+    __lasx_xvstelm_d(out, dst, 0, 0);
+    __lasx_xvstelm_d(out, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void avc_chroma_vt_and_aver_dst_8x8_lasx(uint8_t *src,
+                             uint8_t *dst, ptrdiff_t stride, uint32_t coeff0,
+                             uint32_t coeff1)
+{
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+    __m256i tp0, tp1, tp2, tp3, dst0, dst1;
+    __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    __m256i out0, out1;
+    __m256i res0, res1, res2, res3;
+    __m256i coeff_vec0 = __lasx_xvreplgr2vr_b(coeff0);
+    __m256i coeff_vec1 = __lasx_xvreplgr2vr_b(coeff1);
+    __m256i coeff_vec = __lasx_xvilvl_b(coeff_vec0, coeff_vec1);
+
+    coeff_vec = __lasx_xvslli_b(coeff_vec, 3);
+    src0 = __lasx_xvld(src, 0);
+    src += stride;
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src1, src2, src3, src4);
+    src += stride_4x;
+    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x, src, stride_3x,
+              src5, src6, src7, src8);
+    DUP4_ARG3(__lasx_xvpermi_q, src1, src0, 0x20, src2, src1, 0x20, src3, src2, 0x20,
+              src4, src3, 0x20, src0, src1, src2, src3);
+    DUP4_ARG3(__lasx_xvpermi_q, src5, src4, 0x20, src6, src5, 0x20, src7, src6, 0x20,
+              src8, src7, 0x20, src4, src5, src6, src7);
+    DUP4_ARG2(__lasx_xvilvl_b, src1, src0, src3, src2, src5, src4, src7, src6,
+              src0, src2, src4, src6);
+    DUP4_ARG2(__lasx_xvdp2_h_bu, src0, coeff_vec, src2, coeff_vec, src4, coeff_vec, src6,
+              coeff_vec, res0, res1, res2, res3);
+    DUP2_ARG3(__lasx_xvssrarni_bu_h, res1, res0, 6, res3, res2, 6, out0, out1);
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    dst0 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    dst += stride_4x;
+    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x, dst, stride_3x,
+              tp0, tp1, tp2, tp3);
+    dst -= stride_4x;
+    DUP2_ARG2(__lasx_xvilvl_d, tp2, tp0, tp3, tp1, tp0, tp2);
+    dst1 = __lasx_xvpermi_q(tp2, tp0, 0x20);
+    out0 = __lasx_xvavgr_bu(out0, dst0);
+    out1 = __lasx_xvavgr_bu(out1, dst1);
+    __lasx_xvstelm_d(out0, dst, 0, 0);
+    __lasx_xvstelm_d(out0, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out0, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out0, dst + stride_3x, 0, 3);
+    dst += stride_4x;
+    __lasx_xvstelm_d(out1, dst, 0, 0);
+    __lasx_xvstelm_d(out1, dst + stride, 0, 2);
+    __lasx_xvstelm_d(out1, dst + stride_2x, 0, 1);
+    __lasx_xvstelm_d(out1, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void avg_width8x8_lasx(uint8_t *src, uint8_t *dst,
+                                               ptrdiff_t stride)
+{
+    __m256i src0, src1, src2, src3;
+    __m256i dst0, dst1, dst2, dst3;
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+    ptrdiff_t stride_4x = stride << 2;
+
+    src0 = __lasx_xvldrepl_d(src, 0);
+    src1 = __lasx_xvldrepl_d(src + stride, 0);
+    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
+    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
+    dst0 = __lasx_xvldrepl_d(dst, 0);
+    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
+    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
+    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
+    src0 = __lasx_xvpackev_d(src1, src0);
+    src2 = __lasx_xvpackev_d(src3, src2);
+    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
+    dst0 = __lasx_xvpackev_d(dst1, dst0);
+    dst2 = __lasx_xvpackev_d(dst3, dst2);
+    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
+    dst0 = __lasx_xvavgr_bu(src0, dst0);
+    __lasx_xvstelm_d(dst0, dst, 0, 0);
+    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
+    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
+    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
+
+    src += stride_4x;
+    dst += stride_4x;
+    src0 = __lasx_xvldrepl_d(src, 0);
+    src1 = __lasx_xvldrepl_d(src + stride, 0);
+    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
+    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
+    dst0 = __lasx_xvldrepl_d(dst, 0);
+    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
+    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
+    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
+    src0 = __lasx_xvpackev_d(src1, src0);
+    src2 = __lasx_xvpackev_d(src3, src2);
+    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
+    dst0 = __lasx_xvpackev_d(dst1, dst0);
+    dst2 = __lasx_xvpackev_d(dst3, dst2);
+    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
+    dst0 = __lasx_xvavgr_bu(src0, dst0);
+    __lasx_xvstelm_d(dst0, dst, 0, 0);
+    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
+    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
+    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
+}
+
+static av_always_inline void avg_width8x4_lasx(uint8_t *src, uint8_t *dst,
+                                               ptrdiff_t stride)
+{
+    __m256i src0, src1, src2, src3;
+    __m256i dst0, dst1, dst2, dst3;
+    ptrdiff_t stride_2x = stride << 1;
+    ptrdiff_t stride_3x = stride_2x + stride;
+
+    src0 = __lasx_xvldrepl_d(src, 0);
+    src1 = __lasx_xvldrepl_d(src + stride, 0);
+    src2 = __lasx_xvldrepl_d(src + stride_2x, 0);
+    src3 = __lasx_xvldrepl_d(src + stride_3x, 0);
+    dst0 = __lasx_xvldrepl_d(dst, 0);
+    dst1 = __lasx_xvldrepl_d(dst + stride, 0);
+    dst2 = __lasx_xvldrepl_d(dst + stride_2x, 0);
+    dst3 = __lasx_xvldrepl_d(dst + stride_3x, 0);
+    src0 = __lasx_xvpackev_d(src1, src0);
+    src2 = __lasx_xvpackev_d(src3, src2);
+    src0 = __lasx_xvpermi_q(src0, src2, 0x02);
+    dst0 = __lasx_xvpackev_d(dst1, dst0);
+    dst2 = __lasx_xvpackev_d(dst3, dst2);
+    dst0 = __lasx_xvpermi_q(dst0, dst2, 0x02);
+    dst0 = __lasx_xvavgr_bu(src0, dst0);
+    __lasx_xvstelm_d(dst0, dst, 0, 0);
+    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
+    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
+    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
+}
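[Both width-8 average paths above reduce, per row, to a per-byte rounding average of src into dst; that is what __lasx_xvavgr_bu computes once the loads have been packed. A minimal scalar sketch of the same operation, illustrative only and not part of the patch:

    #include <stdint.h>
    #include <stddef.h>

    static void avg_width8_ref(const uint8_t *src, uint8_t *dst,
                               ptrdiff_t stride, int height)
    {
        for (int i = 0; i < height; i++) {
            for (int j = 0; j < 8; j++)
                dst[j] = (uint8_t)((src[j] + dst[j] + 1) >> 1); /* rounding average */
            src += stride;
            dst += stride;
        }
    }
]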
+
+static void avc_chroma_hv_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst,
+                                               ptrdiff_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1,
+                                               int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hv_and_aver_dst_8x4_lasx(src, dst, stride, coef_hor0,
+                                            coef_hor1, coef_ver0, coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_and_aver_dst_8x8_lasx(src, dst, stride, coef_hor0,
+                                            coef_hor1, coef_ver0, coef_ver1);
+    }
+}
+
+static void avc_chroma_hz_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst,
+                                               ptrdiff_t stride, uint32_t coeff0,
+                                               uint32_t coeff1, int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hz_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_hz_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avc_chroma_vt_and_aver_dst_8w_lasx(uint8_t *src, uint8_t *dst,
+                                               ptrdiff_t stride, uint32_t coeff0,
+                                               uint32_t coeff1, int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_vt_and_aver_dst_8x4_lasx(src, dst, stride, coeff0, coeff1);
+    } else if (8 == height) {
+        avc_chroma_vt_and_aver_dst_8x8_lasx(src, dst, stride, coeff0, coeff1);
+    }
+}
+
+static void avg_width8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
+                            int32_t height)
+{
+    if (8 == height) {
+        avg_width8x8_lasx(src, dst, stride);
+    } else if (4 == height) {
+        avg_width8x4_lasx(src, dst, stride);
+    }
+}
+
+void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int height, int x, int y)
+{
+    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
+
+    if (!(x || y)) {
+        avg_width8_lasx(src, dst, stride, height);
+    } else if (x && y) {
+        avc_chroma_hv_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), y,
+                                           (8 - y), height);
+    } else if (x) {
+        avc_chroma_hz_and_aver_dst_8w_lasx(src, dst, stride, x, (8 - x), height);
+    } else {
+        avc_chroma_vt_and_aver_dst_8w_lasx(src, dst, stride, y, (8 - y), height);
+    }
+}
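[For cross-checking the dispatch above: the hz/vt/hv paths all implement the standard H.264 bilinear chroma interpolation, (A*a + B*b + C*c + D*d + 32) >> 6 with A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, followed by a rounding average against the existing dst. A scalar reference sketch, illustrative only and not part of the patch:

    #include <stdint.h>
    #include <stddef.h>

    static void avg_h264_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B = x * (8 - y);
        const int C = (8 - x) * y;
        const int D = x * y;

        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 8; j++) {
                int pel = (A * src[j] + B * src[j + 1] +
                           C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
                dst[j] = (uint8_t)((dst[j] + pel + 1) >> 1); /* average with dst */
            }
            src += stride;
            dst += stride;
        }
    }

When x == y == 0, A is 64 and the formula degenerates to a plain copy, which is why the code takes the avg_width8_lasx shortcut in that case.]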
diff --git a/libavcodec/loongarch/h264chroma_lasx.h b/libavcodec/loongarch/h264chroma_lasx.h
new file mode 100644
index 0000000000..4aac8db8cb
--- /dev/null
+++ b/libavcodec/loongarch/h264chroma_lasx.h
@@ -0,0 +1,36 @@ 
+/*
+ * Copyright (c) 2020 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264CHROMA_LASX_H
+#define AVCODEC_LOONGARCH_H264CHROMA_LASX_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavcodec/h264.h"
+
+void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y);
+void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y);
+void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+        int h, int x, int y);
+
+#endif /* AVCODEC_LOONGARCH_H264CHROMA_LASX_H */
diff --git a/libavutil/loongarch/loongson_intrinsics.h b/libavutil/loongarch/loongson_intrinsics.h
new file mode 100644
index 0000000000..6e0439f829
--- /dev/null
+++ b/libavutil/loongarch/loongson_intrinsics.h
@@ -0,0 +1,1877 @@ 
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * All rights reserved.
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei Gu   <guxiwei-hf@loongson.cn>
+ *                Lu Wang    <wanglu@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
+#define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
+
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * All rights reserved.
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei Gu   <guxiwei-hf@loongson.cn>
+ *                Lu Wang    <wanglu@loongson.cn>
+ *
+ * This file is a header file for the LoongArch builtin extensions.
+ *
+ */
+
+#ifndef LOONGSON_INTRINSICS_H
+#define LOONGSON_INTRINSICS_H
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: New functions added or bugs fixed.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 0
+#define LSOM_VERSION_MICRO 3
+
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+{ \
+    _OUT0 = _INS(_IN0); \
+    _OUT1 = _INS(_IN1); \
+}
+
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+{ \
+    _OUT0 = _INS(_IN0, _IN1); \
+    _OUT1 = _INS(_IN2, _IN3); \
+}
+
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+{ \
+    _OUT0 = _INS(_IN0, _IN1, _IN2); \
+    _OUT1 = _INS(_IN3, _IN4, _IN5); \
+}
+
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+{ \
+    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
+    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
+}
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
+                  _OUT0, _OUT1, _OUT2, _OUT3) \
+{ \
+    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
+    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
+}
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
+                  _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
+{ \
+    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4,  _IN5,  _OUT0, _OUT1); \
+    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
+}
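[These DUP* helpers simply fan a single intrinsic out over several operand/result pairs, which keeps the vector code above compact. As an illustration with hypothetical __m128i variables a0, a1, b0, b1, out0, out1:

    DUP2_ARG2(__lsx_vilvl_b, a1, a0, b1, b0, out0, out1);
    /* expands to: */
    out0 = __lsx_vilvl_b(a1, a0);
    out1 = __lsx_vilvl_b(b1, b0);
]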
+
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and the products of
+ *               adjacent element pairs are added to produce results twice
+ *               the width of the inputs. These results are then added to
+ *               the signed halfword elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
+{
+    __m128i out;
+
+    out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+    out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+    return out;
+}
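[A scalar model of the even/odd multiply-add pair used above may help; the arrays here stand in for the 16 byte lanes and 8 halfword lanes of one 128-bit vector (illustrative sketch only):

    #include <stdint.h>

    static void vdp2add_h_b_ref(int16_t out[8], const int16_t in_c[8],
                                const int8_t in_h[16], const int8_t in_l[16])
    {
        /* Each halfword lane accumulates the products of one even/odd byte pair. */
        for (int i = 0; i < 8; i++)
            out[i] = in_c[i] + in_h[2 * i]     * in_l[2 * i]
                             + in_h[2 * i + 1] * in_l[2 * i + 1];
    }

With the sample values from the comment block, lane 0 gives 1 + 1*8 + 2*7 = 23, matching the documented output.]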
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and the products of
+ *               adjacent element pairs are added to produce results twice
+ *               the width of the inputs. These results are then added to
+ *               the signed halfword elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
+{
+    __m128i out;
+
+    out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+    out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of halfword vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed halfword elements from in_h are multiplied by
+ *               signed halfword elements from in_l, and the products of
+ *               adjacent element pairs are added to produce results twice
+ *               the width of the inputs. These results are then added to
+ *               the signed word elements from in_c.
+ * Example     : out = __lsx_vdp2add_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
+{
+    __m128i out;
+
+    out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+    out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and the products of
+ *               adjacent element pairs are added to produce results twice
+ *               the width of the inputs.
+ * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
+{
+    __m128i out;
+
+    out = __lsx_vmulwev_h_b(in_h, in_l);
+    out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and the products of
+ *               adjacent element pairs are added to produce results twice
+ *               the width of the inputs.
+ * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
+{
+    __m128i out;
+
+    out = __lsx_vmulwev_h_bu(in_h, in_l);
+    out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and the products of
+ *               adjacent element pairs are added to produce results twice
+ *               the width of the inputs.
+ * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
+ *         out : 22,38,38,22, 22,38,38,6
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
+{
+    __m128i out;
+
+    out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+    out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed halfword elements from in_h are multiplied by
+ *               signed halfword elements from in_l, and the products of
+ *               adjacent element pairs are added to produce results twice
+ *               the width of the inputs.
+ * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
+{
+    __m128i out;
+
+    out = __lsx_vmulwev_w_h(in_h, in_l);
+    out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
+ * Arguments   : Inputs  - _in  (input vector)
+ *                       - min  (min threshold)
+ *                       - max  (max threshold)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lsx_vclip_h(_in)
+ *         _in : -8,2,280,249, -8,255,280,249
+ *         min : 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
+{
+    __m128i out;
+
+    out = __lsx_vmax_h(min, _in);
+    out = __lsx_vmin_h(max, out);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed halfword elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_h(_in)
+ *         _in : -8,255,280,249, -8,255,280,249
+ *         out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_h(__m128i _in)
+{
+    __m128i out;
+
+    out = __lsx_vmaxi_h(_in, 0);
+    out = __lsx_vsat_hu(out, 7);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed word elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_w(_in)
+ *         _in : -8,255,280,249
+ *         out : 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_w(__m128i _in)
+{
+    __m128i out;
+
+    out = __lsx_vmaxi_w(_in, 0);
+    out = __lsx_vsat_wu(out, 7);
+    return out;
+}
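[Both clip255 helpers use the same two-step trick: clamp negatives to zero with vmaxi, then saturate with vsat and immediate 7, which keeps the unsigned range 0..255 here. A per-lane scalar model (illustrative only):

    #include <stdint.h>

    static int16_t clip255_ref(int16_t v)
    {
        v = v > 0 ? v : 0;           /* __lsx_vmaxi_h(v, 0) */
        return v > 255 ? 255 : v;    /* __lsx_vsat_hu(v, 7) */
    }
]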
+
+/*
+ * =============================================================================
+ * Description : Swap two variables
+ * Arguments   : Inputs  - _in0, _in1
+ *               Outputs - _in0, _in1 (in-place)
+ * Details     : Swapping of two input variables using xor
+ * Example     : LSX_SWAP(_in0, _in1)
+ *        _in0 : 1,2,3,4
+ *        _in1 : 5,6,7,8
+ *   _in0(out) : 5,6,7,8
+ *   _in1(out) : 1,2,3,4
+ * =============================================================================
+ */
+#define LSX_SWAP(_in0, _in1)                                            \
+{                                                                       \
+    _in0 = __lsx_vxor_v(_in0, _in1);                                    \
+    _in1 = __lsx_vxor_v(_in0, _in1);                                    \
+    _in0 = __lsx_vxor_v(_in0, _in1);                                    \
+}
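[One caveat worth noting with the XOR idiom: if both macro arguments name the same variable, the first XOR zeroes it, so LSX_SWAP(v, v) must be avoided. Minimal usage sketch (assumes the LSX intrinsics are available):

    __m128i a = __lsx_vldi(1);
    __m128i b = __lsx_vldi(2);
    LSX_SWAP(a, b);   /* a now holds the former b, and vice versa */
]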
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4            1, 5, 9,13
+ *               5, 6, 7, 8    to      2, 6,10,14
+ *               9,10,11,12  =====>    3, 7,11,15
+ *              13,14,15,16            4, 8,12,16
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+{                                                                              \
+    __m128i _t0, _t1, _t2, _t3;                                                \
+                                                                               \
+    _t0   = __lsx_vilvl_w(_in1, _in0);                                         \
+    _t1   = __lsx_vilvh_w(_in1, _in0);                                         \
+    _t2   = __lsx_vilvl_w(_in3, _in2);                                         \
+    _t3   = __lsx_vilvh_w(_in3, _in2);                                         \
+    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
+    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
+    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
+    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with byte elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Example     : LSX_TRANSPOSE8x8_B
+ *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
+ *
+ *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
+ *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
+ *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
+ *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                 \
+   __m128i zero = {0};                                                            \
+   __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110};                      \
+   __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                                \
+                                                                                  \
+   _t0 = __lsx_vilvl_b(_in2, _in0);                                               \
+   _t1 = __lsx_vilvl_b(_in3, _in1);                                               \
+   _t2 = __lsx_vilvl_b(_in6, _in4);                                               \
+   _t3 = __lsx_vilvl_b(_in7, _in5);                                               \
+   _t4 = __lsx_vilvl_b(_t1, _t0);                                                 \
+   _t5 = __lsx_vilvh_b(_t1, _t0);                                                 \
+   _t6 = __lsx_vilvl_b(_t3, _t2);                                                 \
+   _t7 = __lsx_vilvh_b(_t3, _t2);                                                 \
+   _out0 = __lsx_vilvl_w(_t6, _t4);                                               \
+   _out2 = __lsx_vilvh_w(_t6, _t4);                                               \
+   _out4 = __lsx_vilvl_w(_t7, _t5);                                               \
+   _out6 = __lsx_vilvh_w(_t7, _t5);                                               \
+   _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                                     \
+   _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                                     \
+   _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                                     \
+   _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                                     \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with half word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
+ *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
+ *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
+ *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
+ *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
+ *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
+ *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
+ *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                 \
+    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                     \
+                                                                                  \
+    _s0 = __lsx_vilvl_h(_in6, _in4);                                              \
+    _s1 = __lsx_vilvl_h(_in7, _in5);                                              \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                                \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                                \
+    _s0 = __lsx_vilvh_h(_in6, _in4);                                              \
+    _s1 = __lsx_vilvh_h(_in7, _in5);                                              \
+    _t2 = __lsx_vilvl_h(_s1, _s0);                                                \
+    _t3 = __lsx_vilvh_h(_s1, _s0);                                                \
+    _s0 = __lsx_vilvl_h(_in2, _in0);                                              \
+    _s1 = __lsx_vilvl_h(_in3, _in1);                                              \
+    _t4 = __lsx_vilvl_h(_s1, _s0);                                                \
+    _t5 = __lsx_vilvh_h(_s1, _s0);                                                \
+    _s0 = __lsx_vilvh_h(_in2, _in0);                                              \
+    _s1 = __lsx_vilvh_h(_in3, _in1);                                              \
+    _t6 = __lsx_vilvl_h(_s1, _s0);                                                \
+    _t7 = __lsx_vilvh_h(_s1, _s0);                                                \
+                                                                                  \
+    _out0 = __lsx_vpickev_d(_t0, _t4);                                            \
+    _out2 = __lsx_vpickev_d(_t1, _t5);                                            \
+    _out4 = __lsx_vpickev_d(_t2, _t6);                                            \
+    _out6 = __lsx_vpickev_d(_t3, _t7);                                            \
+    _out1 = __lsx_vpickod_d(_t0, _t4);                                            \
+    _out3 = __lsx_vpickod_d(_t1, _t5);                                            \
+    _out5 = __lsx_vpickod_d(_t2, _t6);                                            \
+    _out7 = __lsx_vpickod_d(_t3, _t7);                                            \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x4 byte block into 4x8
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *                         (input 8x4 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Example     : LSX_TRANSPOSE8x4_B
+ *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
+ *
+ *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,       \
+                           _out0, _out1, _out2, _out3)                           \
+{                                                                                \
+    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                  \
+                                                                                 \
+    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                       \
+    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                       \
+    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                                   \
+    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                       \
+    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                       \
+                                                                                 \
+    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                                   \
+    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                                   \
+    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                                   \
+                                                                                 \
+    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                                     \
+    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                                     \
+    _out1 = __lsx_vilvh_d(_out2, _out0);                                         \
+    _out3 = __lsx_vilvh_d(_out0, _out2);                                         \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, in8
+ *                         in9, in10, in11, in12, in13, in14, in15
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              000,001,002,003,004,005,006,007
+ *              008,009,010,011,012,013,014,015
+ *              016,017,018,019,020,021,022,023
+ *              024,025,026,027,028,029,030,031
+ *              032,033,034,035,036,037,038,039
+ *              040,041,042,043,044,045,046,047        000,008,...,112,120
+ *              048,049,050,051,052,053,054,055        001,009,...,113,121
+ *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
+ *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
+ *              072,073,074,075,076,077,078,079        004,012,...,116,124
+ *              080,081,082,083,084,085,086,087        005,013,...,117,125
+ *              088,089,090,091,092,093,094,095        006,014,...,118,126
+ *              096,097,098,099,100,101,102,103        007,015,...,119,127
+ *              104,105,106,107,108,109,110,111
+ *              112,113,114,115,116,117,118,119
+ *              120,121,122,123,124,125,126,127
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8,  \
+                            _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
+                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)       \
+{                                                                                  \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;                \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                                \
+    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5,       \
+              _tmp0, _tmp1, _tmp2, _tmp3);                                         \
+    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,        \
+              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                                  \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);                \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);                \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);                \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);                \
+    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);                    \
+    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);                    \
+    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);                    \
+    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);                    \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);            \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);            \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);            \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);            \
+}
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     : Butterfly operation
+ * Example     :
+ *               out0 = in0 + in3;
+ *               out1 = in1 + in2;
+ *               out2 = in1 - in2;
+ *               out3 = in0 - in3;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+{                                                                             \
+    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
+}
+#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+{                                                                             \
+    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
+}
+#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+{                                                                             \
+    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
+}
+#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+{                                                                             \
+    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
+}
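[As a quick lane-wise numeric check of the 4-input butterfly (using the _W variant, illustrative values): with in0..in3 = 1, 2, 3, 4 in a given lane, the outputs are out0 = 1+4 = 5, out1 = 2+3 = 5, out2 = 2-3 = -1, out3 = 1-4 = -3.]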
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in7;
+ *              _out1 = _in1 + _in6;
+ *              _out2 = _in2 + _in5;
+ *              _out3 = _in3 + _in4;
+ *              _out4 = _in3 - _in4;
+ *              _out5 = _in2 - _in5;
+ *              _out6 = _in1 - _in6;
+ *              _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                \
+    _out0 = __lsx_vadd_b(_in0, _in7);                                            \
+    _out1 = __lsx_vadd_b(_in1, _in6);                                            \
+    _out2 = __lsx_vadd_b(_in2, _in5);                                            \
+    _out3 = __lsx_vadd_b(_in3, _in4);                                            \
+    _out4 = __lsx_vsub_b(_in3, _in4);                                            \
+    _out5 = __lsx_vsub_b(_in2, _in5);                                            \
+    _out6 = __lsx_vsub_b(_in1, _in6);                                            \
+    _out7 = __lsx_vsub_b(_in0, _in7);                                            \
+}
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                \
+    _out0 = __lsx_vadd_h(_in0, _in7);                                            \
+    _out1 = __lsx_vadd_h(_in1, _in6);                                            \
+    _out2 = __lsx_vadd_h(_in2, _in5);                                            \
+    _out3 = __lsx_vadd_h(_in3, _in4);                                            \
+    _out4 = __lsx_vsub_h(_in3, _in4);                                            \
+    _out5 = __lsx_vsub_h(_in2, _in5);                                            \
+    _out6 = __lsx_vsub_h(_in1, _in6);                                            \
+    _out7 = __lsx_vsub_h(_in0, _in7);                                            \
+}
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                \
+    _out0 = __lsx_vadd_w(_in0, _in7);                                            \
+    _out1 = __lsx_vadd_w(_in1, _in6);                                            \
+    _out2 = __lsx_vadd_w(_in2, _in5);                                            \
+    _out3 = __lsx_vadd_w(_in3, _in4);                                            \
+    _out4 = __lsx_vsub_w(_in3, _in4);                                            \
+    _out5 = __lsx_vsub_w(_in2, _in5);                                            \
+    _out6 = __lsx_vsub_w(_in1, _in6);                                            \
+    _out7 = __lsx_vsub_w(_in0, _in7);                                            \
+}
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                \
+    _out0 = __lsx_vadd_d(_in0, _in7);                                            \
+    _out1 = __lsx_vadd_d(_in1, _in6);                                            \
+    _out2 = __lsx_vadd_d(_in2, _in5);                                            \
+    _out3 = __lsx_vadd_d(_in3, _in4);                                            \
+    _out4 = __lsx_vsub_d(_in3, _in4);                                            \
+    _out5 = __lsx_vsub_d(_in2, _in5);                                            \
+    _out6 = __lsx_vsub_d(_in1, _in6);                                            \
+    _out7 = __lsx_vsub_d(_in0, _in7);                                            \
+}
+
+#endif /* __loongarch_sx */
+
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l, producing results twice
+ *               the size of the input, i.e. signed halfwords.
+ *               The products of adjacent odd-even element pairs are then
+ *               added and stored to the out vector.
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmulwev_h_bu(in_h, in_l);
+    out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l, producing results twice
+ *               the size of the input, i.e. signed halfwords.
+ *               The products of adjacent odd-even element pairs are then
+ *               added and stored to the out vector.
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmulwev_h_b(in_h, in_l);
+    out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing results twice
+ *               the size of the input, i.e. signed words.
+ *               The products of adjacent odd-even element pairs are then
+ *               added and stored to the out vector.
+ * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmulwev_w_h(in_h, in_l);
+    out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed doubleword
+ * Details     : Signed word elements from in_h are multiplied with
+ *               signed word elements from in_l, producing results twice
+ *               the size of the input, i.e. signed doublewords.
+ *               The products of adjacent odd-even element pairs are then
+ *               added and stored to the out vector.
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmulwev_d_w(in_h, in_l);
+    out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing results twice
+ *               the size of the input, i.e. signed words.
+ *               The products of adjacent odd-even element pairs are then
+ *               added and stored to the out vector.
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+    out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l, producing results twice
+ *               the size of the input, i.e. signed halfwords.
+ *               The products of adjacent odd-even element pairs are then
+ *               added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+    out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing results twice
+ *               the size of the input, i.e. signed words.
+ *               The products of adjacent odd-even element pairs are then
+ *               added to the in_c vector.
+ * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+    out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               unsigned halfword elements from in_l, producing results
+ *               twice the size of the input, i.e. signed words.
+ *               The products of adjacent odd-even element pairs are then
+ *               added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+    out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing results twice
+ *               the size of the input, i.e. signed words.
+ *               The products of adjacent odd-even element pairs are then
+ *               added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+    out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Unsigned Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               The multiplication results of adjacent odd-even element
+ *               pairs are added together, and the sum is subtracted from
+ *               the double-width elements of the in_c vector.
+ * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmulwev_h_bu(in_h, in_l);
+    out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+    out = __lasx_xvsub_h(in_c, out);
+    return out;
+}
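+
+/*
+ * Usage sketch (illustrative): subtracting a weighted unsigned-byte dot
+ * product from a running halfword residual; 'res', 'pix' and 'wgt' are
+ * hypothetical vectors already loaded by the caller.
+ *
+ *     res = __lasx_xvdp2sub_h_bu(res, pix, wgt);    // res[i] -= pix[2i]*wgt[2i] + pix[2i+1]*wgt[2i+1]
+ */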
+
+/*
+ * =============================================================================
+ * Description : Vector Signed Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               The multiplication results of adjacent odd-even element
+ *               pairs are added together, and the sum is subtracted from
+ *               the double-width elements of the in_c vector.
+ * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ *        in_c : 0,0,0,0, 0,0,0,0
+ *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *         out : -7,-3,0,0, 0,-1,0,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmulwev_w_h(in_h, in_l);
+    out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+    out = __lasx_xvsub_w(in_c, out);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of four consecutive halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed doubleword
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               four times the size of input i.e. signed doubleword.
+ *               The multiplication results of four adjacent elements
+ *               are added together and stored to the out vector.
+ * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
+ *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
+ *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
+ *         out : -2,0,1,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvmulwev_w_h(in_h, in_l);
+    out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+    out = __lasx_xvhaddw_d_w(out, out);
+    return out;
+}
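+
+/*
+ * Usage sketch (illustrative): a sum-of-squared-differences kernel.
+ * Multiplying a halfword difference vector by itself and summing groups
+ * of four yields one partial SSD per doubleword lane; 'a' and 'b' are
+ * hypothetical halfword vectors.
+ *
+ *     __m256i d   = __lasx_xvsub_h(a, b);
+ *     __m256i ssd = __lasx_xvdp4_d_h(d, d);         // four squared diffs per doubleword
+ */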
+
+/*
+ * =============================================================================
+ * Description : The high-half elements of the vectors are sign-extended
+ *               to double width and added.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The higher halves of in_h and in_l are sign-extended
+ *               (signed byte to signed halfword), added, and the result
+ *               is stored to the out vector.
+ * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvilvh_b(in_h, in_l);
+    out = __lasx_xvhaddw_h_b(out, out);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high-half elements of the vectors are sign-extended
+ *               to double width and added.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The higher halves of in_h and in_l are sign-extended
+ *               (signed halfword to signed word), added, and the result
+ *               is stored to the out vector.
+ * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvilvh_h(in_h, in_l);
+    out = __lasx_xvhaddw_w_h(out, out);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are sign-extended
+ *               to double width and added.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The lower halves of in_h and in_l are sign-extended
+ *               (signed byte to signed halfword), added, and the result
+ *               is stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvilvl_b(in_h, in_l);
+    out = __lasx_xvhaddw_h_b(out, out);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are sign-extended
+ *               to double width and added.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The lower halves of in_h and in_l are sign-extended
+ *               (signed halfword to signed word), added, and the result
+ *               is stored to the out vector.
+ * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvilvl_h(in_h, in_l);
+    out = __lasx_xvhaddw_w_h(out, out);
+    return out;
+}
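+
+/*
+ * Usage sketch (illustrative): widening a full halfword addition to
+ * words by pairing the low-half and high-half variants defined above;
+ * 'a' and 'b' are hypothetical halfword vectors.
+ *
+ *     __m256i sum_lo = __lasx_xvaddwl_w_h(a, b);    // low halfwords of each lane
+ *     __m256i sum_hi = __lasx_xvaddwh_w_h(a, b);    // high halfwords of each lane
+ */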
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of the vectors are zero-extended
+ *               to double width and added.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The lower halves of in_h and in_l are zero-extended
+ *               (unsigned byte to unsigned halfword), added, and the
+ *               result is stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvilvl_b(in_h, in_l);
+    out = __lasx_xvhaddw_hu_bu(out, out);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are zero-extended and
+ *               added to in_h.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The lower half of in_l is zero-extended (unsigned byte
+ *               to unsigned halfword) and added to the in_h vector.
+ * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvsllwil_hu_bu(in_l, 0);
+    out = __lasx_xvadd_h(in_h, out);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are sign-extended and
+ *               added to in_h.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The lower half of in_l is sign-extended (signed halfword
+ *               to signed word) and added to the in_h vector.
+ * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ *        in_h : 0, 1,0,0, -1,0,0,1,
+ *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
+ *         out : 2, 0,1,2, -1,0,1,1,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
+{
+    __m256i out;
+
+    out = __lasx_xvsllwil_w_h(in_l, 0);
+    out = __lasx_xvadd_w(in_h, out);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the lower half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed halfword
+ *               to signed word), and the result is added to the vector in_c,
+ *               then stored to the out vector.
+ * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 5,6,7,8
+ *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
+ *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
+ *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
+ *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
+{
+    __m256i tmp0, tmp1, out;
+
+    tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+    tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+    tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+    out  = __lasx_xvadd_w(tmp0, in_c);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the higher half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the higher half of the two-fold sign extension (signed
+ *               halfword to signed word), and the result is added to
+ *               the vector in_c, then stored to the out vector.
+ * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
+{
+    __m256i tmp0, tmp1, out;
+
+    tmp0 = __lasx_xvilvh_h(in_h, in_h);
+    tmp1 = __lasx_xvilvh_h(in_l, in_l);
+    tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+    out  = __lasx_xvadd_w(tmp0, in_c);
+    return out;
+}
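+
+/*
+ * Usage sketch (illustrative): keeping two word accumulators so that a
+ * whole halfword vector is multiply-accumulated at full precision;
+ * 'acc_lo', 'acc_hi', 's' and 'c' are hypothetical vectors.
+ *
+ *     acc_lo = __lasx_xvmaddwl_w_h(acc_lo, s, c);   // low halfwords of each lane
+ *     acc_hi = __lasx_xvmaddwh_w_h(acc_hi, s, c);   // high halfwords of each lane
+ */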
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 6,1,3,0, 0,0,1,0
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
+{
+    __m256i tmp0, tmp1, out;
+
+    tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+    tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+    out  = __lasx_xvmul_w(tmp0, tmp1);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the higher
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the higher half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 0,0,0,0, 0,0,0,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
+{
+    __m256i tmp0, tmp1, out;
+
+    tmp0 = __lasx_xvilvh_h(in_h, in_h);
+    tmp1 = __lasx_xvilvh_h(in_l, in_l);
+    out  = __lasx_xvmulwev_w_h(tmp0, tmp1);
+    return out;
+}
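+
+/*
+ * Usage sketch (illustrative): a full widening multiply, produced by
+ * combining the low-half and high-half variants defined above; 'a' and
+ * 'b' are hypothetical halfword vectors.
+ *
+ *     __m256i prod_lo = __lasx_xvmulwl_w_h(a, b);
+ *     __m256i prod_hi = __lasx_xvmulwh_w_h(a, b);
+ */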
+
+/*
+ * =============================================================================
+ * Description : The low-half elements of in_l are zero-extended and
+ *               added to in_h with saturation.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The lower half of in_l is zero-extended (unsigned byte
+ *               to unsigned halfword) and added to in_h with unsigned
+ *               saturation; the results are stored to the out vector.
+ * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
+ *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ *         out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
+{
+    __m256i tmp1, out;
+    __m256i zero = {0};
+
+    tmp1 = __lasx_xvilvl_b(zero, in_l);
+    out  = __lasx_xvsadd_hu(in_h, tmp1);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
+ * Arguments   : Inputs  - in    (input vector)
+ *                       - min   (min threshold)
+ *                       - max   (max threshold)
+ *               Outputs - in    (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lasx_xvclip_h(in, min, max)
+ *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
+ *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
+{
+    __m256i out;
+
+    out = __lasx_xvmax_h(min, in);
+    out = __lasx_xvmin_h(max, out);
+    return out;
+}
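+
+/*
+ * Usage sketch (illustrative): clamping filtered halfword pixels back to
+ * the 8-bit range; the splat bounds are assumptions of this sketch.
+ *
+ *     __m256i minv = __lasx_xvreplgr2vr_h(0);
+ *     __m256i maxv = __lasx_xvreplgr2vr_h(255);
+ *     pix = __lasx_xvclip_h(pix, minv, maxv);
+ */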
+
+/*
+ * =============================================================================
+ * Description : Clip all signed halfword elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs  - in   (input vector)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : See out = __lasx_xvclip255_w(in)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_h(__m256i in)
+{
+    __m256i out;
+
+    out = __lasx_xvmaxi_h(in, 0);
+    out = __lasx_xvsat_hu(out, 7);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed word elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs - in   (input vector)
+ *               Output - out  (output vector with clipped elements)
+ *               Return Type - signed word
+ * Example     : out = __lasx_xvclip255_w(in)
+ *          in : -8,255,280,249, -8,255,280,249
+ *         out :  0,255,255,249,  0,255,255,249
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_w(__m256i in)
+{
+    __m256i out;
+
+    out = __lasx_xvmaxi_w(in, 0);
+    out = __lasx_xvsat_wu(out, 7);
+    return out;
+}
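+
+/*
+ * Usage sketch (illustrative): clipping word sums to [0,255] and then
+ * narrowing them; xvpickev_h keeps the low halfword of every word, which
+ * is lossless once the values are clipped. 'sum' is a hypothetical
+ * word vector.
+ *
+ *     sum = __lasx_xvclip255_w(sum);
+ *     sum = __lasx_xvpickev_h(sum, sum);            // words -> halfwords
+ */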
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 0-7
+ * Example     : out = __lasx_xvsplati_l_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
+ *         idx : 0x02
+ *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
+{
+    __m256i out;
+
+    out = __lasx_xvpermi_q(in, in, 0x02);
+    out = __lasx_xvreplve_h(out, idx);
+    return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 8-15
+ * Example     : out = __lasx_xvsplati_h_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
+ *         idx : 0x09
+ *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
+{
+    __m256i out;
+
+    out = __lasx_xvpermi_q(in, in, 0x13);
+    out = __lasx_xvreplve_h(out, idx);
+    return out;
+}
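+
+/*
+ * Usage sketch (illustrative): selecting the splat variant from the
+ * element index, as the descriptions above prescribe; 'v' and 'idx' are
+ * hypothetical.
+ *
+ *     __m256i bcast = (idx < 8) ? __lasx_xvsplati_l_h(v, idx)
+ *                               : __lasx_xvsplati_h_h(v, idx);
+ */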
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with double word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Example     : LASX_TRANSPOSE4x4_D
+ *        _in0 : 1,2,3,4
+ *        _in1 : 1,2,3,4
+ *        _in2 : 1,2,3,4
+ *        _in3 : 1,2,3,4
+ *
+ *       _out0 : 1,1,1,1
+ *       _out1 : 2,2,2,2
+ *       _out2 : 3,3,3,3
+ *       _out3 : 4,4,4,4
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+{                                                                               \
+    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                         \
+    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                        \
+    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                        \
+    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                        \
+    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                        \
+    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                               \
+    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                               \
+    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                               \
+    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                               \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ * Example     : LASX_TRANSPOSE8x8_W
+ *        _in0 : 1,2,3,4,5,6,7,8
+ *        _in1 : 2,2,3,4,5,6,7,8
+ *        _in2 : 3,2,3,4,5,6,7,8
+ *        _in3 : 4,2,3,4,5,6,7,8
+ *        _in4 : 5,2,3,4,5,6,7,8
+ *        _in5 : 6,2,3,4,5,6,7,8
+ *        _in6 : 7,2,3,4,5,6,7,8
+ *        _in7 : 8,2,3,4,5,6,7,8
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8
+ *       _out1 : 2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
+{                                                                                   \
+    __m256i _s0_m, _s1_m;                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                     \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                     \
+                                                                                    \
+    _s0_m   = __lasx_xvilvl_w(_in2, _in0);                                          \
+    _s1_m   = __lasx_xvilvl_w(_in3, _in1);                                          \
+    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                        \
+    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                        \
+    _s0_m   = __lasx_xvilvh_w(_in2, _in0);                                          \
+    _s1_m   = __lasx_xvilvh_w(_in3, _in1);                                          \
+    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                        \
+    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                        \
+    _s0_m   = __lasx_xvilvl_w(_in6, _in4);                                          \
+    _s1_m   = __lasx_xvilvl_w(_in7, _in5);                                          \
+    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                        \
+    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                        \
+    _s0_m   = __lasx_xvilvh_w(_in6, _in4);                                          \
+    _s1_m   = __lasx_xvilvh_w(_in7, _in5);                                          \
+    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                        \
+    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                        \
+    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                               \
+    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                               \
+    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                               \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                               \
+    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                               \
+    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                               \
+    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                               \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                               \
+}
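+
+/*
+ * Usage sketch (illustrative): transposing an 8x8 block of word rows so
+ * columns can be processed as rows; 'r0'..'r7' are hypothetical row
+ * vectors.
+ *
+ *     __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+ *     LASX_TRANSPOSE8x8_W(r0, r1, r2, r3, r4, r5, r6, r7,
+ *                         t0, t1, t2, t3, t4, t5, t6, t7);
+ */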
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ *                         (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Example     : See LASX_TRANSPOSE16x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15,   \
+                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
+{                                                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                      \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                      \
+                                                                                     \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                           \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                           \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                           \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                           \
+    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                          \
+    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                          \
+    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                         \
+    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                         \
+    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                       \
+    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                       \
+    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                       \
+    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                       \
+    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                       \
+    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                       \
+    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                       \
+    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                       \
+    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                         \
+    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                         \
+    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                         \
+    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                         \
+    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                         \
+    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                         \
+    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                         \
+    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                         \
+    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                       \
+    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                       \
+    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                       \
+    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                       \
+    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                       \
+    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                       \
+    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                       \
+    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                       \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ *                         (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Example     : LASX_TRANSPOSE16x8_H
+ *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
+ *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15,   \
+                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
+{                                                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                      \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                      \
+    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                                  \
+                                                                                     \
+    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                           \
+    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                           \
+    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                           \
+    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                           \
+    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                          \
+    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                          \
+    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                         \
+    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                         \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                         \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                         \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                         \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                         \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                         \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                         \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                         \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                         \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                             \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                             \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                             \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                             \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                             \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                             \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                             \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                             \
+    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                                \
+    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                                \
+    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                                \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                                \
+                                                                                     \
+    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                           \
+    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                           \
+    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                           \
+    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                           \
+    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                          \
+    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                          \
+    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                         \
+    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                         \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                         \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                         \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                         \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                         \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                         \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                         \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                         \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                         \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                             \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                             \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                             \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                             \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                             \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                             \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                             \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                             \
+    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                                \
+    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                                \
+    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                                \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                                \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with halfword elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ *               Return Type - signed halfword
+ * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)     \
+{                                                                                   \
+    __m256i _s0_m, _s1_m;                                                           \
+                                                                                    \
+    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                            \
+    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                            \
+    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                                          \
+    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                                          \
+    _out1 = __lasx_xvilvh_d(_out0, _out0);                                          \
+    _out3 = __lasx_xvilvh_d(_out2, _out2);                                          \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *                         (input 8x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
+ *                         (output 8x8 byte block)
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0,  \
+                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)        \
+{                                                                                   \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                     \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                     \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                          \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                          \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                          \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                          \
+    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                    \
+    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                    \
+    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                    \
+    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                    \
+    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                                      \
+    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                                      \
+    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                                      \
+    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                                      \
+    _out1 = __lasx_xvbsrl_v(_out0, 8);                                              \
+    _out3 = __lasx_xvbsrl_v(_out2, 8);                                              \
+    _out5 = __lasx_xvbsrl_v(_out4, 8);                                              \
+    _out7 = __lasx_xvbsrl_v(_out6, 8);                                              \
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with halfword elements in vectors.
+ * Arguments   : Inputs  - _in0, _in1, ~
+ *               Outputs - _out0, _out1, ~
+ * Details     : The rows of the matrix become columns, and the columns become rows.
+ * Example     : LASX_TRANSPOSE8x8_H
+ *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *
+ *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
+ *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
+ *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
+ *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
+ *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
+ *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
+ *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0,  \
+                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)        \
+{                                                                                   \
+    __m256i _s0_m, _s1_m;                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                     \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                                     \
+                                                                                    \
+    _s0_m   = __lasx_xvilvl_h(_in6, _in4);                                          \
+    _s1_m   = __lasx_xvilvl_h(_in7, _in5);                                          \
+    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                        \
+    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                        \
+    _s0_m   = __lasx_xvilvh_h(_in6, _in4);                                          \
+    _s1_m   = __lasx_xvilvh_h(_in7, _in5);                                          \
+    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                        \
+    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                        \
+                                                                                    \
+    _s0_m   = __lasx_xvilvl_h(_in2, _in0);                                          \
+    _s1_m   = __lasx_xvilvl_h(_in3, _in1);                                          \
+    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                        \
+    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                        \
+    _s0_m   = __lasx_xvilvh_h(_in2, _in0);                                          \
+    _s1_m   = __lasx_xvilvh_h(_in3, _in1);                                          \
+    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                        \
+    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                        \
+                                                                                    \
+    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                                    \
+    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                                    \
+    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                                    \
+    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                                    \
+    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                                    \
+    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                                    \
+    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                                    \
+    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                                    \
+}
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_4
+ *               _out0 = _in0 + _in3;
+ *               _out1 = _in1 + _in2;
+ *               _out2 = _in1 - _in2;
+ *               _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
+{                                                                               \
+    _out0 = __lasx_xvadd_b(_in0, _in3);                                         \
+    _out1 = __lasx_xvadd_b(_in1, _in2);                                         \
+    _out2 = __lasx_xvsub_b(_in1, _in2);                                         \
+    _out3 = __lasx_xvsub_b(_in0, _in3);                                         \
+}
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
+{                                                                               \
+    _out0 = __lasx_xvadd_h(_in0, _in3);                                         \
+    _out1 = __lasx_xvadd_h(_in1, _in2);                                         \
+    _out2 = __lasx_xvsub_h(_in1, _in2);                                         \
+    _out3 = __lasx_xvsub_h(_in0, _in3);                                         \
+}
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
+{                                                                               \
+    _out0 = __lasx_xvadd_w(_in0, _in3);                                         \
+    _out1 = __lasx_xvadd_w(_in1, _in2);                                         \
+    _out2 = __lasx_xvsub_w(_in1, _in2);                                         \
+    _out3 = __lasx_xvsub_w(_in0, _in3);                                         \
+}
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
+{                                                                               \
+    _out0 = __lasx_xvadd_d(_in0, _in3);                                         \
+    _out1 = __lasx_xvadd_d(_in1, _in2);                                         \
+    _out2 = __lasx_xvsub_d(_in1, _in2);                                         \
+    _out3 = __lasx_xvsub_d(_in0, _in3);                                         \
+}
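+
+/*
+ * Usage sketch (illustrative): one butterfly stage of a 4-point
+ * transform on halfword rows; 'x0'..'x3' are hypothetical inputs.
+ *
+ *     __m256i s0, s1, d1, d0;
+ *     LASX_BUTTERFLY_4_H(x0, x1, x2, x3, s0, s1, d1, d0);
+ *     // s0 = x0 + x3, s1 = x1 + x2, d1 = x1 - x2, d0 = x0 - x3
+ */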
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_8
+ *               _out0 = _in0 + _in7;
+ *               _out1 = _in1 + _in6;
+ *               _out2 = _in2 + _in5;
+ *               _out3 = _in3 + _in4;
+ *               _out4 = _in3 - _in4;
+ *               _out5 = _in2 - _in5;
+ *               _out6 = _in1 - _in6;
+ *               _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                 \
+    _out0 = __lasx_xvadd_b(_in0, _in7);                                           \
+    _out1 = __lasx_xvadd_b(_in1, _in6);                                           \
+    _out2 = __lasx_xvadd_b(_in2, _in5);                                           \
+    _out3 = __lasx_xvadd_b(_in3, _in4);                                           \
+    _out4 = __lasx_xvsub_b(_in3, _in4);                                           \
+    _out5 = __lasx_xvsub_b(_in2, _in5);                                           \
+    _out6 = __lasx_xvsub_b(_in1, _in6);                                           \
+    _out7 = __lasx_xvsub_b(_in0, _in7);                                           \
+}
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                 \
+    _out0 = __lasx_xvadd_h(_in0, _in7);                                           \
+    _out1 = __lasx_xvadd_h(_in1, _in6);                                           \
+    _out2 = __lasx_xvadd_h(_in2, _in5);                                           \
+    _out3 = __lasx_xvadd_h(_in3, _in4);                                           \
+    _out4 = __lasx_xvsub_h(_in3, _in4);                                           \
+    _out5 = __lasx_xvsub_h(_in2, _in5);                                           \
+    _out6 = __lasx_xvsub_h(_in1, _in6);                                           \
+    _out7 = __lasx_xvsub_h(_in0, _in7);                                           \
+}
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                 \
+    _out0 = __lasx_xvadd_w(_in0, _in7);                                           \
+    _out1 = __lasx_xvadd_w(_in1, _in6);                                           \
+    _out2 = __lasx_xvadd_w(_in2, _in5);                                           \
+    _out3 = __lasx_xvadd_w(_in3, _in4);                                           \
+    _out4 = __lasx_xvsub_w(_in3, _in4);                                           \
+    _out5 = __lasx_xvsub_w(_in2, _in5);                                           \
+    _out6 = __lasx_xvsub_w(_in1, _in6);                                           \
+    _out7 = __lasx_xvsub_w(_in0, _in7);                                           \
+}
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
+{                                                                                 \
+    _out0 = __lasx_xvadd_d(_in0, _in7);                                           \
+    _out1 = __lasx_xvadd_d(_in1, _in6);                                           \
+    _out2 = __lasx_xvadd_d(_in2, _in5);                                           \
+    _out3 = __lasx_xvadd_d(_in3, _in4);                                           \
+    _out4 = __lasx_xvsub_d(_in3, _in4);                                           \
+    _out5 = __lasx_xvsub_d(_in2, _in5);                                           \
+    _out6 = __lasx_xvsub_d(_in1, _in6);                                           \
+    _out7 = __lasx_xvsub_d(_in0, _in7);                                           \
+}
+
+#endif /* LASX */
+
+/*
+ * =============================================================================
+ * Description : Print out elements in vector.
+ * Arguments   : Inputs  - RTYPE, element_num, in0, enter
+ *               Outputs -
+ * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0'; if
+ *               'enter' is TRUE, the prefix "\nVP:" is printed first.
+ * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
+ *               VP:1,2,3,4,
+ * =============================================================================
+ */
+#define VECT_PRINT(RTYPE, element_num, in0, enter)    \
+{                                                     \
+    RTYPE _tmp0 = (RTYPE)in0;                         \
+    int _i = 0;                                       \
+    if (enter)                                        \
+        printf("\nVP:");                              \
+    for (_i = 0; _i < element_num; _i++)              \
+        printf("%d,", _tmp0[_i]);                     \
+}
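+
+/*
+ * Usage sketch (illustrative): VECT_PRINT calls printf(), so the
+ * including file must provide <stdio.h>; 'v8i32' stands in for a signed
+ * 256-bit word-vector typedef and is an assumption of this sketch.
+ *
+ *     __m256i v = __lasx_xvreplgr2vr_w(7);
+ *     VECT_PRINT(v8i32, 8, v, 1);                   // prints "VP:7,7,7,7,7,7,7,7,"
+ */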
+
+#endif /* LOONGSON_INTRINSICS_H */
+#endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */