[FFmpeg-devel,v3,4/7] avcodec/la: Add LSX optimization for h264 qpel.

Message ID 20230520015649.8325-5-chenhao@loongson.cn
State Superseded
Series [FFmpeg-devel,v3,1/7] avcodec/la: add LSX optimization for h264 idct.

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 fail Make fate failed
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

陈昊 (Hao Chen), May 20, 2023, 1:56 a.m. UTC
From: yuanhecai <yuanhecai@loongson.cn>

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 214fps
after:  274fps
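(That is roughly a 274 / 214 ≈ 1.28x speedup, i.e. about 28% more decoded frames per second in this decode-only test.)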
---
 libavcodec/loongarch/Makefile                 |    2 +
 libavcodec/loongarch/h264qpel.S               | 1686 +++++++++++++++++
 .../loongarch/h264qpel_init_loongarch.c       |   74 +-
 libavcodec/loongarch/h264qpel_lasx.c          |  401 +---
 libavcodec/loongarch/h264qpel_lasx.h          |  158 --
 libavcodec/loongarch/h264qpel_loongarch.h     |  312 +++
 libavcodec/loongarch/h264qpel_lsx.c           |  487 +++++
 7 files changed, 2561 insertions(+), 559 deletions(-)
 create mode 100644 libavcodec/loongarch/h264qpel.S
 delete mode 100644 libavcodec/loongarch/h264qpel_lasx.h
 create mode 100644 libavcodec/loongarch/h264qpel_loongarch.h
 create mode 100644 libavcodec/loongarch/h264qpel_lsx.c
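
For orientation, the scalar form of the 6-tap luma half-pel filter that the
new LSX code vectorizes is sketched below. This is only an illustrative
sketch; the function name and clipping helper are not FFmpeg identifiers,
the actual reference is FFmpeg's generic C h264qpel code.

    /* 6-tap H.264 luma half-pel filter, horizontal case:
     * taps (1, -5, 20, 20, -5, 1), rounded with +16 and >> 5. */
    static inline unsigned char luma_halfpel_h(const unsigned char *s)
    {
        int v = (s[-2] + s[3]) - 5 * (s[-1] + s[2]) + 20 * (s[0] + s[1]);
        v = (v + 16) >> 5;                       /* round and normalize */
        return v < 0 ? 0 : (v > 255 ? 255 : v);  /* saturate to 8 bits  */
    }

The 2-D (hv) positions keep the unrounded 16-bit horizontal intermediates
and round once after the vertical pass with (v + 512) >> 10, matching the
w_20/w_5/w_512 constants used by the hv path in h264qpel.S.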

Patch

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index a563055161..06cfab5c20 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -31,5 +31,7 @@  LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_loongarch.o \
                                          loongarch/h264dsp.o
+LSX-OBJS-$(CONFIG_H264QPEL)           += loongarch/h264qpel.o \
+                                         loongarch/h264qpel_lsx.o
 LSX-OBJS-$(CONFIG_H264CHROMA)         += loongarch/h264chroma.o
 LSX-OBJS-$(CONFIG_H264PRED)           += loongarch/h264intrapred.o
diff --git a/libavcodec/loongarch/h264qpel.S b/libavcodec/loongarch/h264qpel.S
new file mode 100644
index 0000000000..3f885b6ce2
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel.S
@@ -0,0 +1,1686 @@ 
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
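+/*
+ * The filter macros below implement the H.264 6-tap luma interpolation
+ * filter with taps (1, -5, 20, 20, -5, 1).  For the one-dimensional
+ * paths the callers preload vr20 = 20, vr21 = 5 and vr22 = 16 (rounding
+ * bias before the >> 5 narrowing); the 2-D hv path instead keeps 16-bit
+ * horizontal intermediates and rounds with 512 and >> 10 in the vertical
+ * pass (see the *_HV_* macros further down).  In the ff_*_qpel16_*
+ * functions a0 = dst, a1 = src and a2 = stride.
+ */
+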
+.macro VLD_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
+    vld           vr0,    \in4,   0
+    vldx          vr1,    \in4,   a2
+    QPEL8_H_LSX   \in0,   \in1
+    vssrani.bu.h  \in0,   \in2,   5
+    vssrani.bu.h  \in1,   \in3,   5
+.endm
+
+.macro VLDX_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
+    vldx          vr0,    \in4,   t1
+    vldx          vr1,    \in4,   t2
+    QPEL8_H_LSX   \in0,   \in1
+    vssrani.bu.h  \in0,   \in2,   5
+    vssrani.bu.h  \in1,   \in3,   5
+.endm
+
+.macro VLD_DOUBLE_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+    vld           vr0,    \in8,   0
+    vldx          vr1,    \in8,   a2
+    QPEL8_H_LSX   \in0,   \in1
+    vssrani.bu.h  \in0,   \in4,   5
+    vssrani.bu.h  \in1,   \in5,   5
+    vldx          vr0,    \in8,   t1
+    vldx          vr1,    \in8,   t2
+    QPEL8_H_LSX   \in2,   \in3
+    vssrani.bu.h  \in2,   \in6,   5
+    vssrani.bu.h  \in3,   \in7,   5
+.endm
+
+function ff_put_h264_qpel16_mc00_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    slli.d        t2,     t0,     1
+.rept 4
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    vstx          vr2,    a0,     t0
+    vstx          vr3,    a0,     t1
+    add.d         a0,     a0,     t2
+.endr
+endfunc
+
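+/*
+ * QPEL8_H_LSX: horizontal 6-tap lowpass of the two rows held in
+ * vr0/vr1 (each loaded from column -2).  Leaves the 16-bit values
+ * 20*(s0+s1) - 5*(s-1+s2) + (s-2+s3) + 16 in \out0/\out1; the callers
+ * narrow and shift them with vssrani.bu.h ..., 5.
+ */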
+.macro QPEL8_H_LSX out0, out1
+    vbsrl.v       vr2,    vr0,    1
+    vbsrl.v       vr3,    vr1,    1
+    vbsrl.v       vr4,    vr0,    2
+    vbsrl.v       vr5,    vr1,    2
+    vbsrl.v       vr6,    vr0,    3
+    vbsrl.v       vr7,    vr1,    3
+    vbsrl.v       vr8,    vr0,    4
+    vbsrl.v       vr9,    vr1,    4
+    vbsrl.v       vr10,   vr0,    5
+    vbsrl.v       vr11,   vr1,    5
+
+    vilvl.b       vr6,    vr4,    vr6
+    vilvl.b       vr7,    vr5,    vr7
+    vilvl.b       vr8,    vr2,    vr8
+    vilvl.b       vr9,    vr3,    vr9
+    vilvl.b       vr10,   vr0,    vr10
+    vilvl.b       vr11,   vr1,    vr11
+    vhaddw.hu.bu  vr6,    vr6,    vr6
+    vhaddw.hu.bu  vr7,    vr7,    vr7
+    vhaddw.hu.bu  vr8,    vr8,    vr8
+    vhaddw.hu.bu  vr9,    vr9,    vr9
+    vhaddw.hu.bu  vr10,   vr10,   vr10
+    vhaddw.hu.bu  vr11,   vr11,   vr11
+    vmul.h        vr2,    vr6,    vr20
+    vmul.h        vr3,    vr7,    vr20
+    vmul.h        vr4,    vr8,    vr21
+    vmul.h        vr5,    vr9,    vr21
+    vssub.h       vr2,    vr2,    vr4
+    vssub.h       vr3,    vr3,    vr5
+    vsadd.h       vr2,    vr2,    vr10
+    vsadd.h       vr3,    vr3,    vr11
+    vsadd.h       \out0,  vr2,    vr22
+    vsadd.h       \out1,  vr3,    vr22
+.endm
+
+.macro VLD_DOUBLE_QPEL8_H_LSX in0, in1, in2, in3, in4
+    vld           vr0,    \in4,   0
+    vldx          vr1,    \in4,   a2
+    QPEL8_H_LSX   \in0,   \in1
+    vldx          vr0,    \in4,   t1
+    vldx          vr1,    \in4,   t2
+    QPEL8_H_LSX   \in2,   \in3
+.endm
+
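+/*
+ * put_h264_qpel16_mc10/mc30: horizontal half-pel result averaged with
+ * the source column at x (mc10) or x + 1 (mc30).
+ */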
+.macro put_h264_qpel16 in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+.ifc \in0, 10
+    addi.d        t8,     a1,     0
+.else
+    addi.d        t8,     a1,     1
+.endif
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        a1,     t0,     8    // a1 = t0 + 8
+.rept 4
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+    VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+    vld           vr10,   t8,     0
+    vldx          vr11,   t8,     a2
+    vavgr.bu      vr0,    vr2,    vr10
+    vavgr.bu      vr1,    vr3,    vr11
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1
+    vldx          vr12,   t8,     t1
+    vldx          vr13,   t8,     t2
+    vavgr.bu      vr2,    vr4,    vr12
+    vavgr.bu      vr3,    vr5,    vr13
+    vstx          vr2,    a0,     t1
+    vstx          vr3,    a0,     t2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t8,     a2,     t8,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+.endr
+endfunc
+.endm
+
+put_h264_qpel16 10
+put_h264_qpel16 30
+
+function ff_put_h264_qpel16_mc20_lsx
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        a1,     t0,     8    // a1 = t0 + 8
+.rept 4
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+    VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+    vst           vr2,    a0,     0
+    vstx          vr3,    a0,     a2
+    VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1
+    vstx          vr4,    a0,     t1
+    vstx          vr5,    a0,     t2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+.endr
+endfunc
+
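+/*
+ * QPEL8_V_LSX: vertical 6-tap lowpass over the seven source rows
+ * \in0 .. \in6, producing the two filtered rows that correspond to
+ * \in2 and \in3, already rounded, shifted by 5 and saturated to bytes
+ * in vr13/vr14.
+ */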
+.macro QPEL8_V_LSX in0, in1, in2, in3, in4, in5, in6
+    vilvl.b       vr7,    \in3,   \in2
+    vilvl.b       vr8,    \in4,   \in3
+    vilvl.b       vr9,    \in4,   \in1
+    vilvl.b       vr10,   \in5,   \in2
+    vilvl.b       vr11,   \in5,   \in0
+    vilvl.b       vr12,   \in6,   \in1
+    vhaddw.hu.bu  vr7,    vr7,    vr7
+    vhaddw.hu.bu  vr8,    vr8,    vr8
+    vhaddw.hu.bu  vr9,    vr9,    vr9
+    vhaddw.hu.bu  vr10,   vr10,   vr10
+    vhaddw.hu.bu  vr11,   vr11,   vr11
+    vhaddw.hu.bu  vr12,   vr12,   vr12
+    vmul.h        vr7,    vr7,    vr20
+    vmul.h        vr8,    vr8,    vr20
+    vmul.h        vr9,    vr9,    vr21
+    vmul.h        vr10,   vr10,   vr21
+    vssub.h       vr7,    vr7,    vr9
+    vssub.h       vr8,    vr8,    vr10
+    vsadd.h       vr7,    vr7,    vr11
+    vsadd.h       vr8,    vr8,    vr12
+    vsadd.h       vr7,    vr7,    vr22
+    vsadd.h       vr8,    vr8,    vr22
+
+    vilvh.b       vr13,   \in3,   \in2
+    vilvh.b       vr14,   \in4,   \in3
+    vilvh.b       vr15,   \in4,   \in1
+    vilvh.b       vr16,   \in5,   \in2
+    vilvh.b       vr17,   \in5,   \in0
+    vilvh.b       vr18,   \in6,   \in1
+    vhaddw.hu.bu  vr13,   vr13,   vr13
+    vhaddw.hu.bu  vr14,   vr14,   vr14
+    vhaddw.hu.bu  vr15,   vr15,   vr15
+    vhaddw.hu.bu  vr16,   vr16,   vr16
+    vhaddw.hu.bu  vr17,   vr17,   vr17
+    vhaddw.hu.bu  vr18,   vr18,   vr18
+    vmul.h        vr13,   vr13,   vr20
+    vmul.h        vr14,   vr14,   vr20
+    vmul.h        vr15,   vr15,   vr21
+    vmul.h        vr16,   vr16,   vr21
+    vssub.h       vr13,   vr13,   vr15
+    vssub.h       vr14,   vr14,   vr16
+    vsadd.h       vr13,   vr13,   vr17
+    vsadd.h       vr14,   vr14,   vr18
+    vsadd.h       vr13,   vr13,   vr22
+    vsadd.h       vr14,   vr14,   vr22
+    vssrani.bu.h  vr13,   vr7,    5
+    vssrani.bu.h  vr14,   vr8,    5
+.endm
+
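+/*
+ * put_h264_qpel16_mc01/mc03: vertical half-pel result averaged with
+ * the source row at y (mc01) or y + 1 (mc03).
+ */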
+.macro put_h264_qpel16_mc1 in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    sub.d         t2,     a1,     t0  // t2 = src - 2 * stride
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    vld           vr0,    t2,     0
+    vldx          vr1,    t2,     a2
+    vldx          vr2,    t2,     t0
+    vldx          vr3,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr4,    t2,     0
+    vldx          vr5,    t2,     a2
+    vldx          vr6,    t2,     t0
+    QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \in0, 01
+    vavgr.bu      vr13,   vr2,    vr13
+    vavgr.bu      vr14,   vr3,    vr14
+.else
+    vavgr.bu      vr13,   vr3,    vr13
+    vavgr.bu      vr14,   vr4,    vr14
+.endif
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr0,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr1,    t2,     0
+    QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+.ifc \in0, 01
+    vavgr.bu      vr13,   vr4,    vr13
+    vavgr.bu      vr14,   vr5,    vr14
+.else
+    vavgr.bu      vr13,   vr5,    vr13
+    vavgr.bu      vr14,   vr6,    vr14
+.endif
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr2,    t2,     a2
+    vldx          vr3,    t2,     t0
+    QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+.ifc \in0, 01
+    vavgr.bu      vr13,   vr6,    vr13
+    vavgr.bu      vr14,   vr0,    vr14
+.else
+    vavgr.bu      vr13,   vr0,    vr13
+    vavgr.bu      vr14,   vr1,    vr14
+.endif
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr4,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr5,    t2,     0
+    QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+.ifc \in0, 01
+    vavgr.bu      vr13,   vr1,    vr13
+    vavgr.bu      vr14,   vr2,    vr14
+.else
+    vavgr.bu      vr13,   vr2,    vr13
+    vavgr.bu      vr14,   vr3,    vr14
+.endif
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr6,    t2,     a2
+    vldx          vr0,    t2,     t0
+    QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
+.ifc \in0, 01
+    vavgr.bu      vr13,   vr3,    vr13
+    vavgr.bu      vr14,   vr4,    vr14
+.else
+    vavgr.bu      vr13,   vr4,    vr13
+    vavgr.bu      vr14,   vr5,    vr14
+.endif
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr1,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr2,    t2,     0
+    QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
+.ifc \in0, 01
+    vavgr.bu      vr13,   vr5,    vr13
+    vavgr.bu      vr14,   vr6,    vr14
+.else
+    vavgr.bu      vr13,   vr6,    vr13
+    vavgr.bu      vr14,   vr0,    vr14
+.endif
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr3,    t2,     a2
+    vldx          vr4,    t2,     t0
+    QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
+.ifc \in0, 01
+    vavgr.bu      vr13,   vr0,    vr13
+    vavgr.bu      vr14,   vr1,    vr14
+.else
+    vavgr.bu      vr13,   vr1,    vr13
+    vavgr.bu      vr14,   vr2,    vr14
+.endif
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+
+    vldx          vr5,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr6,    t2,     0
+    QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \in0, 01
+    vavgr.bu      vr13,   vr2,    vr13
+    vavgr.bu      vr14,   vr3,    vr14
+.else
+    vavgr.bu      vr13,   vr3,    vr13
+    vavgr.bu      vr14,   vr4,    vr14
+.endif
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+endfunc
+.endm
+
+put_h264_qpel16_mc1 01
+put_h264_qpel16_mc1 03
+
+.macro VST_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+    QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
+    vavgr.bu      vr13,   \in7,   vr13
+    vavgr.bu      vr14,   \in8,   vr14
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+.endm
+
+.macro VSTX_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+    QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
+    vavgr.bu      vr13,   \in7,   vr13
+    vavgr.bu      vr14,   \in8,   vr14
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+.endm
+
+function ff_put_h264_qpel16_mc11_lsx
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    slli.d        t6,     t1,     1
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        a1,     t0,     8    // a1 = t0 + 8
+.rept 2
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+    alsl.d        t0,     a2,     t0,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t0
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+                                  vr14, vr15, a1
+    alsl.d        a1,     a2,     a1,    2
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+                                  vr18, vr19, a1
+
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2  // t4 = src + 2 * stride
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2  // t4 = src + 6 * stride
+    vld           vr1,    t4,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2  // t4 = src + 10 * stride
+    vld           vr5,    t4,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a1,     a2,     a1,    2   // a1 = src + 8 * stride
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 8 * stride
+    sub.d         t4,     t4,     t6
+.endr
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+endfunc
+
+function ff_avg_h264_qpel16_mc00_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    slli.d        t2,     t0,     1
+    addi.d        t3,     a0,     0
+.rept 4
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr8,    t3,     0
+    vldx          vr9,    t3,     a2
+    vldx          vr10,   t3,     t0
+    vldx          vr11,   t3,     t1
+    add.d         t3,     t3,     t2
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    vstx          vr2,    a0,     t0
+    vstx          vr3,    a0,     t1
+    add.d         a0,     a0,     t2
+.endr
+endfunc
+
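+/*
+ * put_h264_qpel16_mc33/mc31: average of a horizontal half-pel row
+ * (taken at y + 1 for mc33, at y for mc31) and the vertical half-pel
+ * column at x + 1.
+ */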
+.macro put_h264_qpel16_mc in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+
+.ifc \in0, 33
+    add.d         t0,     t0,     a2
+.endif
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t4,     t4,     1
+
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+    alsl.d        a1,     a2,     t0,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+    addi.d        a1,     t0,     8
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+                                  vr14, vr15, a1
+    alsl.d        a1,     a2,     a1,    2
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+                                  vr18, vr19, a1
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride + 1
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr1,    t4,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+    add.d         t6,     t4,     zero     // t6 = src + 6 * stride + 1
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr5,    t4,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+    alsl.d        a1,     a2,     t0,    3  // a1 = src + 8 * stride
+    addi.d        t5,     a1,     8         // t5 = src + 8 * stride + 8
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+    alsl.d        a1,     a2,     a1,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+                                  vr14, vr15, t5
+    alsl.d        t5,     a2,     t5,    2
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+                                  vr18, vr19, t5
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 4 * stride
+
+    // t6 = src + 6 * stride + 1
+    vld           vr0,    t6,     0
+    vldx          vr1,    t6,     a2
+    vldx          vr2,    t6,     t1
+    vldx          vr3,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr4,    t6,     0
+    vldx          vr5,    t6,     a2
+    vldx          vr6,    t6,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+    vldx          vr0,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr1,    t6,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5 ,vr6, vr0, vr1, vr25, vr26
+    alsl.d        a0,     a2,     a0,    2    // dst = dst + 4 * stride
+    vldx          vr2,    t6,     a2
+    vldx          vr3,    t6,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+    vldx          vr4,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr5,    t6,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+endfunc
+.endm
+
+put_h264_qpel16_mc 33
+put_h264_qpel16_mc 31
+
+function ff_put_h264_qpel16_mc13_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t0,     t0,     a2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+    alsl.d        a1,     a2,     t0,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+    addi.d        a1,     t0,     8
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+                                  vr14, vr15, a1
+    alsl.d        a1,     a2,     a1,    2
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+                                  vr18, vr19, a1
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride + 1
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr1,    t4,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+    add.d         t6,     t4,     zero
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr5,    t4,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+    alsl.d        a1,     a2,     t0,    3  // a1 = src + 8 * stride
+    addi.d        t5,     a1,     8         // t5 = src + 8 * stride + 8
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+    alsl.d        a1,     a2,     a1,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+                                  vr14, vr15, t5
+    alsl.d        t5,     a2,     t5,    2
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+                                  vr18, vr19, t5
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 4 * stride
+
+    vld           vr0,    t6,     0          // t6 = src + 6 * stride + 1
+    vldx          vr1,    t6,     a2
+    vldx          vr2,    t6,     t1
+    vldx          vr3,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr4,    t6,     0
+    vldx          vr5,    t6,     a2
+    vldx          vr6,    t6,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+    vldx          vr0,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr1,    t6,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+    alsl.d        a0,     a2,     a0,    2    // dst = dst + 4 * stride
+    vldx          vr2,    t6,     a2
+    vldx          vr3,    t6,     t1
+    VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+    vldx          vr4,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr5,    t6,     0
+    VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+endfunc
+
+function ff_avg_h264_qpel16_mc10_lsx
+    addi.d        t0,     a0,     0   // t0 = dst
+    addi.d        t4,     a1,     -2  // t4 = src - 2
+    addi.d        t5,     t4,     8
+    slli.d        t1,     a2,     1
+    add.d         t2,     a2,     t1
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+.rept 2
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4
+    alsl.d        t4,     a2,     t4,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4
+    VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    vldx          vr12,   t0,     t1
+    vldx          vr13,   t0,     t2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+    alsl.d        t5,     a2,     t5,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+    VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    vldx          vr12,   t0,     t1
+    vldx          vr13,   t0,     t2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+    alsl.d        t5,     a2,     t5,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t4,     a2,     t4,    2   // t4 = src + 8 * stride - 2
+.endr
+endfunc
+
+function ff_avg_h264_qpel16_mc30_lsx
+    addi.d        t0,     a0,     0   // t0 = dst
+    addi.d        t4,     a1,     -2  // t4 = src - 2
+    addi.d        t5,     t4,     8
+    addi.d        a1,     a1,     1   // a1 = a1 + 1
+    slli.d        t1,     a2,     1
+    add.d         t2,     a2,     t1
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+.rept 2
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4
+    alsl.d        t4,     a2,     t4,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4
+    VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    vldx          vr12,   t0,     t1
+    vldx          vr13,   t0,     t2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+    alsl.d        t5,     a2,     t5,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+    VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vld           vr12,   t0,     0
+    vldx          vr13,   t0,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5
+    vldx          vr0,    a1,     t1
+    vldx          vr1,    a1,     t2
+    vldx          vr12,   t0,     t1
+    vldx          vr13,   t0,     t2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vavgr.bu      vr0,    vr0,    vr12
+    vavgr.bu      vr1,    vr1,    vr13
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+    alsl.d        t5,     a2,     t5,    2
+    alsl.d        a1,     a2,     a1,    2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        t4,     a2,     t4,    2   // t4 = src + 8 * stride - 2
+.endr
+endfunc
+
+function ff_put_h264_qpel16_mc02_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    sub.d         t2,     a1,     t0  // t2 = src - 2 * stride
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    vld           vr0,    t2,     0
+    vldx          vr1,    t2,     a2
+    vldx          vr2,    t2,     t0
+    vldx          vr3,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr4,    t2,     0
+    vldx          vr5,    t2,     a2
+    vldx          vr6,    t2,     t0
+    QPEL8_V_LSX   vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+    vldx          vr0,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr1,    t2,     0
+    QPEL8_V_LSX   vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+    vldx          vr2,    t2,     a2
+    vldx          vr3,    t2,     t0
+    QPEL8_V_LSX   vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+    vldx          vr4,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr5,    t2,     0
+    QPEL8_V_LSX   vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+
+    vldx          vr6,    t2,     a2
+    vldx          vr0,    t2,     t0
+    QPEL8_V_LSX   vr1, vr2, vr3, vr4, vr5, vr6, vr0
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+    vldx          vr1,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2  // t2 = t2 + 4 * stride
+    vld           vr2,    t2,     0
+    QPEL8_V_LSX   vr3, vr4, vr5, vr6, vr0, vr1, vr2
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+    vldx          vr3,    t2,     a2
+    vldx          vr4,    t2,     t0
+    QPEL8_V_LSX   vr5, vr6, vr0, vr1, vr2, vr3, vr4
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+    vldx          vr5,    t2,     t1
+    alsl.d        t2,     a2,     t2,    2 // t2 = t2 + 4 * stride
+    vld           vr6,    t2,     0
+    QPEL8_V_LSX   vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vstx          vr13,   a0,     t0
+    vstx          vr14,   a0,     t1
+endfunc
+
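+/*
+ * avc_luma_hv_qrt_and_aver_dst_16x16_lsx: shared body of the
+ * avg_..._mc11/mc13/mc31/mc33 functions below.  Averages the
+ * horizontal half-pel rows (starting at t0) with the vertical
+ * half-pel columns (starting at t4) and then with the existing
+ * destination block (t8); the callers set up t0/t4/t8.
+ */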
+.macro avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+    addi.d        sp,     sp,     -64
+    fst.d         f24,    sp,     0
+    fst.d         f25,    sp,     8
+    fst.d         f26,    sp,     16
+    fst.d         f27,    sp,     24
+    fst.d         f28,    sp,     32
+    fst.d         f29,    sp,     40
+    fst.d         f30,    sp,     48
+    fst.d         f31,    sp,     56
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+    alsl.d        a1,     a2,     t0,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+    addi.d        a1,     t0,     8
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+                                  vr14, vr15, a1
+    alsl.d        a1,     a2,     a1,    2
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+                                  vr18, vr19, a1
+    vld           vr0,    t4,     0      // t4 = src - 2 * stride + 1
+    vldx          vr1,    t4,     a2
+    vldx          vr2,    t4,     t1
+    vldx          vr3,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr4,    t4,     0
+    vldx          vr5,    t4,     a2
+    vldx          vr6,    t4,     t1
+    QPEL8_V_LSX   vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vld           vr0,    t8,     0
+    vldx          vr1,    t8,     a2
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vavgr.bu      vr13,   vr13,   vr0
+    vavgr.bu      vr14,   vr14,   vr1
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+    vldx          vr0,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr1,    t4,     0
+    QPEL8_V_LSX   vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vldx          vr2,    t8,     t1
+    vldx          vr3,    t8,     t2
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vavgr.bu      vr13,   vr13,   vr2
+    vavgr.bu      vr14,   vr14,   vr3
+    add.d         t6,     t4,     zero     // t6 = src + 6 * stride
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+    alsl.d        a0,     a2,     a0,    2  // dst = dst + 4 * stride
+    alsl.d        t8,     a2,     t8,    2
+    vldx          vr2,    t4,     a2
+    vldx          vr3,    t4,     t1
+    QPEL8_V_LSX   vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vld           vr4,    t8,     0
+    vldx          vr5,    t8,     a2
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vavgr.bu      vr13,   vr13,   vr4
+    vavgr.bu      vr14,   vr14,   vr5
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+    vldx          vr4,    t4,     t2
+    alsl.d        t4,     a2,     t4,    2
+    vld           vr5,    t4,     0
+    QPEL8_V_LSX   vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vldx          vr6,    t8,     t1
+    vldx          vr0,    t8,     t2
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vavgr.bu      vr13,   vr13,   vr6
+    vavgr.bu      vr14,   vr14,   vr0
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+    alsl.d        a1,     a2,     t0,    3  // a1 = src + 8 * stride
+    addi.d        t5,     a1,     8         // t5 = src + 8 * stride + 8
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+    alsl.d        a1,     a2,     a1,    2
+    VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+                                  vr14, vr15, t5
+    alsl.d        t5,     a2,     t5,    2
+    VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+                                  vr18, vr19, t5
+    alsl.d        a0,     a2,     a0,    2   // dst = dst + 4 * stride
+    alsl.d        t8,     a2,     t8,    2
+    // t6 = src + 6 * stride + 1
+    vld           vr0,    t6,     0
+    vldx          vr1,    t6,     a2
+    vldx          vr2,    t6,     t1
+    vldx          vr3,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr4,    t6,     0
+    vldx          vr5,    t6,     a2
+    vldx          vr6,    t6,     t1
+    QPEL8_V_LSX   vr0, vr1, vr2, vr3, vr4, vr5, vr6
+    vld           vr0,    t8,     0
+    vldx          vr1,    t8,     a2
+    vavgr.bu      vr13,   vr23,   vr13
+    vavgr.bu      vr14,   vr24,   vr14
+    vavgr.bu      vr13,   vr13,   vr0
+    vavgr.bu      vr14,   vr14,   vr1
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+    vldx          vr0,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr1,    t6,     0
+    QPEL8_V_LSX   vr2, vr3, vr4, vr5, vr6, vr0, vr1
+    vldx          vr2,    t8,     t1
+    vldx          vr3,    t8,     t2
+    vavgr.bu      vr13,   vr25,   vr13
+    vavgr.bu      vr14,   vr26,   vr14
+    vavgr.bu      vr13,   vr13,   vr2
+    vavgr.bu      vr14,   vr14,   vr3
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+    alsl.d        a0,     a2,     a0,    2    // dst = dst + 4 * stride
+    alsl.d        t8,     a2,     t8,    2
+    vldx          vr2,    t6,     a2
+    vldx          vr3,    t6,     t1
+    QPEL8_V_LSX   vr4, vr5, vr6, vr0, vr1, vr2, vr3
+    vld           vr4,    t8,     0
+    vldx          vr5,    t8,     a2
+    vavgr.bu      vr13,   vr27,   vr13
+    vavgr.bu      vr14,   vr28,   vr14
+    vavgr.bu      vr13,   vr13,   vr4
+    vavgr.bu      vr14,   vr14,   vr5
+    vst           vr13,   a0,     0
+    vstx          vr14,   a0,     a2
+    vldx          vr4,    t6,     t2
+    alsl.d        t6,     a2,     t6,    2
+    vld           vr5,    t6,     0
+    QPEL8_V_LSX   vr6, vr0, vr1, vr2, vr3, vr4, vr5
+    vldx          vr6,    t8,     t1
+    vldx          vr0,    t8,     t2
+    vavgr.bu      vr13,   vr29,   vr13
+    vavgr.bu      vr14,   vr30,   vr14
+    vavgr.bu      vr13,   vr13,   vr6
+    vavgr.bu      vr14,   vr14,   vr0
+    vstx          vr13,   a0,     t1
+    vstx          vr14,   a0,     t2
+    fld.d         f24,    sp,     0
+    fld.d         f25,    sp,     8
+    fld.d         f26,    sp,     16
+    fld.d         f27,    sp,     24
+    fld.d         f28,    sp,     32
+    fld.d         f29,    sp,     40
+    fld.d         f30,    sp,     48
+    fld.d         f31,    sp,     56
+    addi.d        sp,     sp,     64
+.endm
+
+function ff_avg_h264_qpel16_mc33_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t0,     t0,     a2   // t0 = src + stride - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t4,     t4,     1
+    addi.d        t8,     a0,     0
+    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc11_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t8,     a0,     0
+    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc31_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t4,     t4,     1
+    addi.d        t8,     a0,     0
+    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc13_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t0,     t0,     a2
+    add.d         t3,     a1,     zero // t3 = src
+    sub.d         t4,     a1,     t1   // t4 = src - 2 * stride
+    addi.d        t8,     a0,     0
+    avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc20_lsx
+    slli.d        t1,     a2,     1
+    add.d         t2,     t1,     a2
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    addi.d        t5,     a0,     0
+    addi.d        a1,     t0,     8
+.rept 4
+    VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+    VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+    vld           vr0,    t5,     0
+    vldx          vr1,    t5,     a2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a2
+    add.d         a1,     a1,     t1
+    VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, a1
+    vldx          vr0,    t5,     t1
+    vldx          vr1,    t5,     t2
+    vavgr.bu      vr0,    vr0,    vr2
+    vavgr.bu      vr1,    vr1,    vr3
+    vstx          vr0,    a0,     t1
+    vstx          vr1,    a0,     t2
+    alsl.d        t0,     a2,     t0,    2
+    alsl.d        t5,     a2,     t5,    2
+    alsl.d        a0,     a2,     a0,    2
+    alsl.d        a1,     a2,     a1,    1
+.endr
+endfunc
+
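+/*
+ * QPEL8_HV_H_LSX: horizontal pass of the 2-D (hv) filter.  Same 6-tap
+ * computation as QPEL8_H_LSX but without the +16 rounding; the 16-bit
+ * intermediates are consumed by QPEL8_HV_V_LSX.
+ */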
+.macro QPEL8_HV_H_LSX out0, out1
+    vbsrl.v       vr2,    vr0,    1
+    vbsrl.v       vr3,    vr1,    1
+    vbsrl.v       vr4,    vr0,    2
+    vbsrl.v       vr5,    vr1,    2
+    vbsrl.v       vr6,    vr0,    3
+    vbsrl.v       vr7,    vr1,    3
+    vbsrl.v       vr8,    vr0,    4
+    vbsrl.v       vr9,    vr1,    4
+    vbsrl.v       vr10,   vr0,    5
+    vbsrl.v       vr11,   vr1,    5
+    vilvl.b       vr6,    vr4,    vr6
+    vilvl.b       vr7,    vr5,    vr7
+    vilvl.b       vr8,    vr2,    vr8
+    vilvl.b       vr9,    vr3,    vr9
+    vilvl.b       vr10,   vr0,    vr10
+    vilvl.b       vr11,   vr1,    vr11
+    vhaddw.hu.bu  vr6,    vr6,    vr6
+    vhaddw.hu.bu  vr7,    vr7,    vr7
+    vhaddw.hu.bu  vr8,    vr8,    vr8
+    vhaddw.hu.bu  vr9,    vr9,    vr9
+    vhaddw.hu.bu  vr10,   vr10,   vr10
+    vhaddw.hu.bu  vr11,   vr11,   vr11
+    vmul.h        vr2,    vr6,    vr20
+    vmul.h        vr3,    vr7,    vr20
+    vmul.h        vr4,    vr8,    vr21
+    vmul.h        vr5,    vr9,    vr21
+    vssub.h       vr2,    vr2,    vr4
+    vssub.h       vr3,    vr3,    vr5
+    vsadd.h       \out0,  vr2,    vr10
+    vsadd.h       \out1,  vr3,    vr11
+.endm
+
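+/*
+ * QPEL8_HV_V_LSX: vertical pass of the 2-D filter over seven rows of
+ * 16-bit horizontal intermediates \in0 .. \in6, computed in 32 bits.
+ * Adds the rounding constant 512 (vr24), narrows with >> 10 and packs
+ * the two resulting 8-pixel rows into \out3.
+ */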
+.macro QPEL8_HV_V_LSX in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3
+    vilvl.h       vr0,    \in2,   \in3
+    vilvl.h       vr1,    \in3,   \in4  // tmp0
+    vilvl.h       vr2,    \in1,   \in4
+    vilvl.h       vr3,    \in2,   \in5  // tmp2
+    vilvl.h       vr4,    \in0,   \in5
+    vilvl.h       vr5,    \in1,   \in6  // tmp4
+    vhaddw.w.h    vr0,    vr0,    vr0
+    vhaddw.w.h    vr1,    vr1,    vr1
+    vhaddw.w.h    vr2,    vr2,    vr2
+    vhaddw.w.h    vr3,    vr3,    vr3
+    vhaddw.w.h    vr4,    vr4,    vr4
+    vhaddw.w.h    vr5,    vr5,    vr5
+    vmul.w        vr0,    vr0,    vr22
+    vmul.w        vr1,    vr1,    vr22
+    vmul.w        vr2,    vr2,    vr23
+    vmul.w        vr3,    vr3,    vr23
+    vssub.w       vr0,    vr0,    vr2
+    vssub.w       vr1,    vr1,    vr3
+    vsadd.w       vr0,    vr0,    vr4
+    vsadd.w       vr1,    vr1,    vr5
+    vsadd.w       \out0,  vr0,    vr24
+    vsadd.w       \out1,  vr1,    vr24
+    vilvh.h       vr0,    \in2,   \in3
+    vilvh.h       vr1,    \in3,   \in4  // tmp0
+    vilvh.h       vr2,    \in1,   \in4
+    vilvh.h       vr3,    \in2,   \in5  // tmp2
+    vilvh.h       vr4,    \in0,   \in5
+    vilvh.h       vr5,    \in1,   \in6  // tmp4
+    vhaddw.w.h    vr0,    vr0,    vr0
+    vhaddw.w.h    vr1,    vr1,    vr1
+    vhaddw.w.h    vr2,    vr2,    vr2
+    vhaddw.w.h    vr3,    vr3,    vr3
+    vhaddw.w.h    vr4,    vr4,    vr4
+    vhaddw.w.h    vr5,    vr5,    vr5
+    vmul.w        vr0,    vr0,    vr22
+    vmul.w        vr1,    vr1,    vr22
+    vmul.w        vr2,    vr2,    vr23
+    vmul.w        vr3,    vr3,    vr23
+    vssub.w       vr0,    vr0,    vr2
+    vssub.w       vr1,    vr1,    vr3
+    vsadd.w       vr0,    vr0,    vr4
+    vsadd.w       vr1,    vr1,    vr5
+    vsadd.w       \out2,  vr0,    vr24
+    vsadd.w       \out3,  vr1,    vr24
+    vssrani.hu.w  \out2,  \out0,  10
+    vssrani.hu.w  \out3,  \out1,  10
+    vssrani.bu.h  \out3,  \out2,  0
+.endm
+
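+/*
+ * h264_qpel8_hv_lowpass_core_lsx: 8x8 block of the 2-D lowpass.
+ * \in0 points at src - 2 - 2 * srcStride (row stride a3), the result
+ * is stored to \in1 with stride a2; the "avg" variant additionally
+ * averages with the existing destination at t3.
+ */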
+.macro h264_qpel8_hv_lowpass_core_lsx in0, in1, type
+    vld           vr0,    \in0,  0
+    vldx          vr1,    \in0,  a3
+    QPEL8_HV_H_LSX vr12, vr13 // a b
+    vldx          vr0,    \in0,  t1
+    vldx          vr1,    \in0,  t2
+    QPEL8_HV_H_LSX vr14, vr15 // c d
+
+    alsl.d        \in0,   a3,    \in0,   2
+
+    vld           vr0,    \in0,  0
+    vldx          vr1,    \in0,  a3
+    QPEL8_HV_H_LSX vr16, vr17 // e f
+    vldx          vr0,    \in0,  t1
+    vldx          vr1,    \in0,  t2
+    QPEL8_HV_H_LSX vr18, vr19 // g h
+    QPEL8_HV_V_LSX vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr6, vr7, vr0, vr1
+.ifc \type, avg
+    fld.d         f2,     t3,      0
+    fldx.d        f3,     t3,      a2
+    vilvl.d       vr2,    vr3,     vr2
+    vavgr.bu      vr1,    vr2,     vr1
+.endif
+    vstelm.d      vr1,    \in1,    0,     0
+    add.d         \in1,   \in1,    a2
+    vstelm.d      vr1,    \in1,    0,     1
+
+    alsl.d        \in0,    a3,    \in0,   2
+
+    // tmp8
+    vld           vr0,    \in0,   0
+    vldx          vr1,    \in0,   a3
+    QPEL8_HV_H_LSX vr12, vr13
+    QPEL8_HV_V_LSX vr14, vr15, vr16, vr17, vr18, vr19, vr12, vr6, vr7, vr0, vr1
+.ifc \type, avg
+    fldx.d        f2,     t3,      t5
+    fldx.d        f3,     t3,      t6
+    vilvl.d       vr2,    vr3,     vr2
+    vavgr.bu      vr1,    vr2,     vr1
+.endif
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+
+    // tmp10
+    vldx          vr0,    \in0,   t1
+    vldx          vr1,    \in0,   t2
+    QPEL8_HV_H_LSX vr14, vr15
+    QPEL8_HV_V_LSX vr16, vr17, vr18, vr19, vr12, vr13, vr14, vr6, vr7, vr0, vr1
+.ifc \type, avg
+    alsl.d        t3,     a2,      t3,   2
+    fld.d         f2,     t3,      0
+    fldx.d        f3,     t3,      a2
+    vilvl.d       vr2,    vr3,     vr2
+    vavgr.bu      vr1,    vr2,     vr1
+.endif
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+
+    // tmp12
+    alsl.d        \in0,   a3,     \in0,  2
+
+    vld           vr0,    \in0,   0
+    vldx          vr1,    \in0,   a3
+    QPEL8_HV_H_LSX vr16, vr17
+    QPEL8_HV_V_LSX vr18, vr19, vr12, vr13, vr14, vr15, vr16, vr6, vr7, vr0, vr1
+.ifc \type, avg
+    fldx.d        f2,     t3,     t5
+    fldx.d        f3,     t3,     t6
+    vilvl.d       vr2,    vr3,    vr2
+    vavgr.bu      vr1,    vr2,    vr1
+.endif
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     0
+    add.d         \in1,   \in1,   a2
+    vstelm.d      vr1,    \in1,   0,     1
+.endm
+
+function put_h264_qpel8_hv_lowpass_lsx
+    slli.d        t1,     a3,     1
+    add.d         t2,     t1,     a3
+    addi.d        sp,     sp,     -8
+    fst.d         f24,    sp,     0
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    sub.d         t0,     t0,     t1   // t0 = t0 - 2 * stride
+    vldi          vr20,   0x414   // h_20
+    vldi          vr21,   0x405   // h_5
+    vldi          vr22,   0x814   // w_20
+    vldi          vr23,   0x805   // w_5
+    addi.d        t4,     zero,   512
+    vreplgr2vr.w  vr24,   t4      // w_512
+    h264_qpel8_hv_lowpass_core_lsx t0, a0, put
+    fld.d         f24,    sp,     0
+    addi.d        sp,     sp,     8
+endfunc
+
+function put_h264_qpel8_h_lowpass_lsx
+    slli.d        t1,     a3,     1
+    add.d         t2,     t1,     a3
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+.rept 2
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a3
+    QPEL8_H_LSX   vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+    vldx          vr0,    t0,     t1
+    vldx          vr1,    t0,     t2
+    QPEL8_H_LSX   vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+    alsl.d        t0,     a3,     t0,    2
+.endr
+endfunc
+
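+/*
+ * put_pixels16_l2_8_lsx: stores the rounded average of two 16x16
+ * blocks: a1 (stride a4) and a contiguous 16-byte-stride buffer at
+ * a2, written to a0 with stride a3.  The second operand is typically
+ * a temporary filled by one of the lowpass helpers.
+ */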
+function put_pixels16_l2_8_lsx
+    slli.d        t0,     a4,     1
+    add.d         t1,     t0,     a4
+    slli.d        t2,     t0,     1
+    slli.d        t3,     a3,     1
+    add.d         t4,     t3,     a3
+    slli.d        t5,     t3,     1
+.rept 4
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr8,    a2,     0x00
+    vld           vr9,    a2,     0x10
+    vld           vr10,   a2,     0x20
+    vld           vr11,   a2,     0x30
+    addi.d        a2,     a2,     0x40
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a3
+    vstx          vr2,    a0,     t3
+    vstx          vr3,    a0,     t4
+    add.d         a0,     a0,     t5
+.endr
+endfunc
+
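+/*
+ * QPEL8_V1_LSX: 8-pixel-wide variant of QPEL8_V_LSX.  Filters the
+ * seven 8-byte rows \in0 .. \in6 vertically and packs the two
+ * resulting rows, rounded, shifted by 5 and saturated, into vr8.
+ */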
+.macro QPEL8_V1_LSX in0, in1, in2, in3, in4, in5, in6
+    vilvl.b       vr7,    \in3,   \in2
+    vilvl.b       vr8,    \in4,   \in3
+    vilvl.b       vr9,    \in4,   \in1
+    vilvl.b       vr10,   \in5,   \in2
+    vilvl.b       vr11,   \in5,   \in0
+    vilvl.b       vr12,   \in6,   \in1
+    vhaddw.hu.bu  vr7,    vr7,    vr7
+    vhaddw.hu.bu  vr8,    vr8,    vr8
+    vhaddw.hu.bu  vr9,    vr9,    vr9
+    vhaddw.hu.bu  vr10,   vr10,   vr10
+    vhaddw.hu.bu  vr11,   vr11,   vr11
+    vhaddw.hu.bu  vr12,   vr12,   vr12
+    vmul.h        vr7,    vr7,    vr20
+    vmul.h        vr8,    vr8,    vr20
+    vmul.h        vr9,    vr9,    vr21
+    vmul.h        vr10,   vr10,   vr21
+    vssub.h       vr7,    vr7,    vr9
+    vssub.h       vr8,    vr8,    vr10
+    vsadd.h       vr7,    vr7,    vr11
+    vsadd.h       vr8,    vr8,    vr12
+    vsadd.h       vr7,    vr7,    vr22
+    vsadd.h       vr8,    vr8,    vr22
+    vssrani.bu.h  vr8,    vr7,    5
+.endm
+
+.macro h264_qpel8_v_lowpass_lsx type
+function \type\()_h264_qpel8_v_lowpass_lsx
+    slli.d        t0,     a3,     1
+    add.d         t1,     t0,     a3
+    sub.d         t2,     a1,     t0  // t2 = src - 2 * stride
+.ifc \type, avg
+    addi.d        t3,     a0,     0
+    slli.d        t4,     a2,     1
+    add.d         t5,     t4,     a2
+.endif
+    vldi          vr20,   0x414
+    vldi          vr21,   0x405
+    vldi          vr22,   0x410
+
+    fld.d         f0,     t2,     0
+    fldx.d        f1,     t2,     a3
+    fldx.d        f2,     t2,     t0
+    fldx.d        f3,     t2,     t1
+    alsl.d        t2,     a3,     t2,    2  // t2 = t2 + 4 * stride
+    fld.d         f4,     t2,     0
+    fldx.d        f5,     t2,     a3
+    fldx.d        f6,     t2,     t0
+    QPEL8_V1_LSX  vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \type, avg
+    fld.d         f0,     t3,     0
+    fldx.d        f1,     t3,     a2
+    vilvl.d       vr0,    vr1,    vr0
+    vavgr.bu      vr8,    vr8,    vr0
+.endif
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    fldx.d        f0,     t2,     t1
+    alsl.d        t2,     a3,     t2,   2  // t2 = t2 + 4 * stride
+    fld.d         f1,     t2,     0
+    QPEL8_V1_LSX  vr2, vr3, vr4, vr5, vr6, vr0, vr1
+.ifc \type, avg
+    fldx.d        f2,     t3,     t4
+    fldx.d        f3,     t3,     t5
+    vilvl.d       vr2,    vr3,    vr2
+    vavgr.bu      vr8,    vr8,    vr2
+.endif
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    alsl.d        t3,     a2,     t3,   2
+
+    fldx.d        f2,     t2,     a3
+    fldx.d        f3,     t2,     t0
+    QPEL8_V1_LSX  vr4, vr5, vr6, vr0, vr1, vr2, vr3
+.ifc \type, avg
+    fld.d         f4,     t3,     0
+    fldx.d        f5,     t3,     a2
+    vilvl.d       vr4,    vr5,    vr4
+    vavgr.bu      vr8,    vr8,    vr4
+.endif
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+    add.d         a0,     a0,     a2
+
+    fldx.d        f4,     t2,     t1
+    alsl.d        t2,     a3,     t2,   2 // t2 = t2 + 4 * stride
+    fld.d         f5,     t2,     0
+    QPEL8_V1_LSX  vr6, vr0, vr1, vr2, vr3, vr4, vr5
+.ifc \type, avg
+    fldx.d        f6,     t3,     t4
+    fldx.d        f0,     t3,     t5
+    vilvl.d       vr6,    vr0,    vr6
+    vavgr.bu      vr8,    vr8,    vr6
+.endif
+    vstelm.d      vr8,    a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr8,    a0,     0,    1
+endfunc
+.endm
+
+h264_qpel8_v_lowpass_lsx put
+h264_qpel8_v_lowpass_lsx avg
+
+function avg_pixels16_l2_8_lsx
+    slli.d        t0,     a4,     1
+    add.d         t1,     t0,     a4
+    slli.d        t2,     t0,     1
+    slli.d        t3,     a3,     1
+    add.d         t4,     t3,     a3
+    slli.d        t5,     t3,     1
+    addi.d        t6,     a0,     0
+.rept 4
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vld           vr8,    a2,     0x00
+    vld           vr9,    a2,     0x10
+    vld           vr10,   a2,     0x20
+    vld           vr11,   a2,     0x30
+    addi.d        a2,     a2,     0x40
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vld           vr8,    t6,     0
+    vldx          vr9,    t6,     a3
+    vldx          vr10,   t6,     t3
+    vldx          vr11,   t6,     t4
+    add.d         t6,     t6,     t5
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr1,    vr9,    vr1
+    vavgr.bu      vr2,    vr10,   vr2
+    vavgr.bu      vr3,    vr11,   vr3
+    vst           vr0,    a0,     0
+    vstx          vr1,    a0,     a3
+    vstx          vr2,    a0,     t3
+    vstx          vr3,    a0,     t4
+    add.d         a0,     a0,     t5
+.endr
+endfunc
+
+function avg_h264_qpel8_hv_lowpass_lsx
+    slli.d        t1,     a3,     1
+    add.d         t2,     t1,     a3
+    slli.d        t5,     a2,     1
+    add.d         t6,     a2,     t5
+    addi.d        sp,     sp,     -8
+    fst.d         f24,    sp,     0
+    vldi          vr20,   0x414   // h_20
+    vldi          vr21,   0x405   // h_5
+    vldi          vr22,   0x814   // w_20
+    vldi          vr23,   0x805   // w_5
+    addi.d        t4,     zero,   512
+    vreplgr2vr.w  vr24,   t4      // w_512
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    sub.d         t0,     t0,     t1   // t0 = t0 - 2 * stride
+    addi.d        t3,     a0,     0    // t3 = dst
+    h264_qpel8_hv_lowpass_core_lsx t0, a0, avg
+    fld.d         f24,    sp,     0
+    addi.d        sp,     sp,     8
+endfunc
+
+function put_pixels8_l2_8_lsx
+    slli.d        t0,     a4,     1
+    add.d         t1,     t0,     a4
+    slli.d        t2,     t0,     1
+.rept 2
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vilvl.d       vr0,    vr1,    vr0
+    vilvl.d       vr2,    vr3,    vr2
+    vld           vr8,    a2,     0x00
+    vld           vr9,    a2,     0x08
+    vld           vr10,   a2,     0x10
+    vld           vr11,   a2,     0x18
+    vilvl.d       vr8,    vr9,    vr8
+    vilvl.d       vr10,   vr11,   vr10
+    addi.d        a2,     a2,     32
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr2,    vr10,   vr2
+    vstelm.d      vr0,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr0,    a0,     0,     1
+    add.d         a0,     a0,     a3
+    vstelm.d      vr2,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr2,    a0,     0,     1
+    add.d         a0,     a0,     a3
+.endr
+endfunc
+
+function ff_put_h264_qpel8_mc00_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    slli.d        t2,     t0,     1
+    ld.d          t3,     a1,     0x0
+    ldx.d         t4,     a1,     a2
+    ldx.d         t5,     a1,     t0
+    ldx.d         t6,     a1,     t1
+    st.d          t3,     a0,     0x0
+    stx.d         t4,     a0,     a2
+    stx.d         t5,     a0,     t0
+    stx.d         t6,     a0,     t1
+    add.d         a1,     a1,     t2
+    add.d         a0,     a0,     t2
+    ld.d          t3,     a1,     0x0
+    ldx.d         t4,     a1,     a2
+    ldx.d         t5,     a1,     t0
+    ldx.d         t6,     a1,     t1
+    st.d          t3,     a0,     0x0
+    stx.d         t4,     a0,     a2
+    stx.d         t5,     a0,     t0
+    stx.d         t6,     a0,     t1
+endfunc
+
+function ff_avg_h264_qpel8_mc00_lsx
+    slli.d        t0,     a2,     1
+    add.d         t1,     t0,     a2
+    slli.d        t2,     t0,     1
+    addi.d        t3,     a0,     0
+.rept 2
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a2
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vilvl.d       vr0,    vr1,    vr0
+    vilvl.d       vr2,    vr3,    vr2
+    vld           vr8,    t3,     0
+    vldx          vr9,    t3,     a2
+    vldx          vr10,   t3,     t0
+    vldx          vr11,   t3,     t1
+    add.d         t3,     t3,     t2
+    vilvl.d       vr8,    vr9,    vr8
+    vilvl.d       vr10,   vr11,   vr10
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr2,    vr10,   vr2
+    vstelm.d      vr0,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr0,    a0,     0,     1
+    add.d         a0,     a0,     a2
+    vstelm.d      vr2,    a0,     0,     0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr2,    a0,     0,     1
+    add.d         a0,     a0,     a2
+.endr
+endfunc
+
+function avg_pixels8_l2_8_lsx
+    slli.d        t0,     a4,     1
+    add.d         t1,     t0,     a4
+    slli.d        t2,     t0,     1
+    addi.d        t3,     a0,     0
+    slli.d        t4,     a3,     1
+    add.d         t5,     t4,     a3
+    slli.d        t6,     t4,     1
+.rept 2
+    vld           vr0,    a1,     0
+    vldx          vr1,    a1,     a4
+    vldx          vr2,    a1,     t0
+    vldx          vr3,    a1,     t1
+    add.d         a1,     a1,     t2
+    vilvl.d       vr0,    vr1,    vr0
+    vilvl.d       vr2,    vr3,    vr2
+    vld           vr8,    a2,     0x00
+    vld           vr9,    a2,     0x08
+    vld           vr10,   a2,     0x10
+    vld           vr11,   a2,     0x18
+    addi.d        a2,     a2,     0x20
+    vilvl.d       vr8,    vr9,    vr8
+    vilvl.d       vr10,   vr11,   vr10
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr2,    vr10,   vr2
+    vld           vr8,    t3,     0
+    vldx          vr9,    t3,     a3
+    vldx          vr10,   t3,     t4
+    vldx          vr11,   t3,     t5
+    add.d         t3,     t3,     t6
+    vilvl.d       vr8,    vr9,    vr8
+    vilvl.d       vr10,   vr11,   vr10
+    vavgr.bu      vr0,    vr8,    vr0
+    vavgr.bu      vr2,    vr10,   vr2
+    vstelm.d      vr0,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr0,    a0,     0,     1
+    add.d         a0,     a0,     a3
+    vstelm.d      vr2,    a0,     0,     0
+    add.d         a0,     a0,     a3
+    vstelm.d      vr2,    a0,     0,     1
+    add.d         a0,     a0,     a3
+.endr
+endfunc
+
+function avg_h264_qpel8_h_lowpass_lsx
+    slli.d        t1,     a3,     1
+    add.d         t2,     t1,     a3
+    slli.d        t5,     a2,     1
+    add.d         t6,     t5,     a2
+    vldi          vr20,   0x414   // h_20
+    vldi          vr21,   0x405   // h_5
+    vldi          vr22,   0x410   // h_16
+    addi.d        t0,     a1,     -2   // t0 = src - 2
+    add.d         t3,     a1,     zero // t3 = src
+    addi.d        t4,     a0,     0    // t4 = dst
+.rept 4
+    vld           vr0,    t0,     0
+    vldx          vr1,    t0,     a3
+    QPEL8_H_LSX   vr12, vr13
+    vssrani.bu.h  vr13,   vr12,   5
+    fld.d         f0,     t4,     0
+    fldx.d        f1,     t4,     a2
+    vilvl.d       vr0,    vr1,    vr0
+    vavgr.bu      vr13,   vr13,   vr0
+    vstelm.d      vr13,   a0,     0,    0
+    add.d         a0,     a0,     a2
+    vstelm.d      vr13,   a0,     0,    1
+    add.d         a0,     a0,     a2
+    add.d         t0,     t0,     t1
+    add.d         t4,     t4,     t1
+.endr
+endfunc
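
For reference, QPEL8_V1_LSX and the h264_qpel8_v_lowpass_lsx functions above compute the standard H.264 six-tap luma lowpass (1, -5, 20, 20, -5, 1) with +16 rounding and a shift by 5, clipped to 8 bits by vssrani.bu.h; QPEL8_H_LSX applies the same taps horizontally. A minimal scalar C sketch of one 8-wide output row (not part of the patch), with src pointing two rows above the output row just as t2 = src - 2*stride does in the assembly:

#include <stdint.h>
#include <stddef.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* out[x] = clip((A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5) with A..F the
 * six vertical neighbours; the constants mirror vr20 (20), vr21 (5) and
 * vr22 (16) loaded with vldi above. */
static void qpel8_v_lowpass_row_sketch(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride)
{
    for (int x = 0; x < 8; x++) {
        const uint8_t *s = src + x;
        int sum = 20 * (s[2 * stride] + s[3 * stride])
                -  5 * (s[1 * stride] + s[4 * stride])
                +       s[0]          + s[5 * stride]
                + 16;
        dst[x] = clip_u8(sum >> 5);
    }
}
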
diff --git a/libavcodec/loongarch/h264qpel_init_loongarch.c b/libavcodec/loongarch/h264qpel_init_loongarch.c
index 969c9c376c..9d3a5cb164 100644
--- a/libavcodec/loongarch/h264qpel_init_loongarch.c
+++ b/libavcodec/loongarch/h264qpel_init_loongarch.c
@@ -19,7 +19,7 @@ 
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "h264qpel_lasx.h"
+#include "h264qpel_loongarch.h"
 #include "libavutil/attributes.h"
 #include "libavutil/loongarch/cpu.h"
 #include "libavcodec/h264qpel.h"
@@ -27,6 +27,77 @@ 
 av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) {
+        if (8 == bit_depth) {
+            c->put_h264_qpel_pixels_tab[0][0]  = ff_put_h264_qpel16_mc00_lsx;
+            c->put_h264_qpel_pixels_tab[0][1]  = ff_put_h264_qpel16_mc10_lsx;
+            c->put_h264_qpel_pixels_tab[0][2]  = ff_put_h264_qpel16_mc20_lsx;
+            c->put_h264_qpel_pixels_tab[0][3]  = ff_put_h264_qpel16_mc30_lsx;
+            c->put_h264_qpel_pixels_tab[0][4]  = ff_put_h264_qpel16_mc01_lsx;
+            c->put_h264_qpel_pixels_tab[0][5]  = ff_put_h264_qpel16_mc11_lsx;
+            c->put_h264_qpel_pixels_tab[0][6]  = ff_put_h264_qpel16_mc21_lsx;
+            c->put_h264_qpel_pixels_tab[0][7]  = ff_put_h264_qpel16_mc31_lsx;
+            c->put_h264_qpel_pixels_tab[0][8]  = ff_put_h264_qpel16_mc02_lsx;
+            c->put_h264_qpel_pixels_tab[0][9]  = ff_put_h264_qpel16_mc12_lsx;
+            c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lsx;
+            c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_lsx;
+            c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_lsx;
+            c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_lsx;
+            c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_lsx;
+            c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_lsx;
+
+            c->avg_h264_qpel_pixels_tab[0][0]  = ff_avg_h264_qpel16_mc00_lsx;
+            c->avg_h264_qpel_pixels_tab[0][1]  = ff_avg_h264_qpel16_mc10_lsx;
+            c->avg_h264_qpel_pixels_tab[0][2]  = ff_avg_h264_qpel16_mc20_lsx;
+            c->avg_h264_qpel_pixels_tab[0][3]  = ff_avg_h264_qpel16_mc30_lsx;
+            c->avg_h264_qpel_pixels_tab[0][4]  = ff_avg_h264_qpel16_mc01_lsx;
+            c->avg_h264_qpel_pixels_tab[0][5]  = ff_avg_h264_qpel16_mc11_lsx;
+            c->avg_h264_qpel_pixels_tab[0][6]  = ff_avg_h264_qpel16_mc21_lsx;
+            c->avg_h264_qpel_pixels_tab[0][7]  = ff_avg_h264_qpel16_mc31_lsx;
+            c->avg_h264_qpel_pixels_tab[0][8]  = ff_avg_h264_qpel16_mc02_lsx;
+            c->avg_h264_qpel_pixels_tab[0][9]  = ff_avg_h264_qpel16_mc12_lsx;
+            c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_lsx;
+            c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_lsx;
+            c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_lsx;
+            c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_lsx;
+            c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_lsx;
+            c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_lsx;
+
+            c->put_h264_qpel_pixels_tab[1][0]  = ff_put_h264_qpel8_mc00_lsx;
+            c->put_h264_qpel_pixels_tab[1][1]  = ff_put_h264_qpel8_mc10_lsx;
+            c->put_h264_qpel_pixels_tab[1][2]  = ff_put_h264_qpel8_mc20_lsx;
+            c->put_h264_qpel_pixels_tab[1][3]  = ff_put_h264_qpel8_mc30_lsx;
+            c->put_h264_qpel_pixels_tab[1][4]  = ff_put_h264_qpel8_mc01_lsx;
+            c->put_h264_qpel_pixels_tab[1][5]  = ff_put_h264_qpel8_mc11_lsx;
+            c->put_h264_qpel_pixels_tab[1][6]  = ff_put_h264_qpel8_mc21_lsx;
+            c->put_h264_qpel_pixels_tab[1][7]  = ff_put_h264_qpel8_mc31_lsx;
+            c->put_h264_qpel_pixels_tab[1][8]  = ff_put_h264_qpel8_mc02_lsx;
+            c->put_h264_qpel_pixels_tab[1][9]  = ff_put_h264_qpel8_mc12_lsx;
+            c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_lsx;
+            c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_lsx;
+            c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_lsx;
+            c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_lsx;
+            c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_lsx;
+            c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_lsx;
+
+            c->avg_h264_qpel_pixels_tab[1][0]  = ff_avg_h264_qpel8_mc00_lsx;
+            c->avg_h264_qpel_pixels_tab[1][1]  = ff_avg_h264_qpel8_mc10_lsx;
+            c->avg_h264_qpel_pixels_tab[1][2]  = ff_avg_h264_qpel8_mc20_lsx;
+            c->avg_h264_qpel_pixels_tab[1][3]  = ff_avg_h264_qpel8_mc30_lsx;
+            c->avg_h264_qpel_pixels_tab[1][5]  = ff_avg_h264_qpel8_mc11_lsx;
+            c->avg_h264_qpel_pixels_tab[1][6]  = ff_avg_h264_qpel8_mc21_lsx;
+            c->avg_h264_qpel_pixels_tab[1][7]  = ff_avg_h264_qpel8_mc31_lsx;
+            c->avg_h264_qpel_pixels_tab[1][8]  = ff_avg_h264_qpel8_mc02_lsx;
+            c->avg_h264_qpel_pixels_tab[1][9]  = ff_avg_h264_qpel8_mc12_lsx;
+            c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_lsx;
+            c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_lsx;
+            c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_lsx;
+            c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_lsx;
+            c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lsx;
+        }
+    }
+#if HAVE_LASX
     if (have_lasx(cpu_flags)) {
         if (8 == bit_depth) {
             c->put_h264_qpel_pixels_tab[0][0]  = ff_put_h264_qpel16_mc00_lasx;
@@ -95,4 +166,5 @@  av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
             c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lasx;
         }
     }
+#endif
 }
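
For context, the tables filled in above are indexed as {put,avg}_h264_qpel_pixels_tab[size_idx][x + 4*y]: size_idx 0 selects the 16x16 functions and 1 the 8x8 ones, while the mcXY suffix encodes the horizontal (X) and vertical (Y) quarter-pel phase, so mc21 lands at index 2 + 4*1 = 6. An illustrative caller sketch follows; the helper name and the mx/my variables are not from this patch:

#include <stddef.h>
#include <stdint.h>
#include "libavcodec/h264qpel.h"

/* Hypothetical dispatch helper: picks the motion-compensation function for a
 * quarter-pel phase (mx, my), each in 0..3, from the table initialised by
 * ff_h264qpel_init_loongarch(). */
static void call_qpel_put(const H264QpelContext *c, uint8_t *dst,
                          const uint8_t *src, ptrdiff_t stride,
                          int size_idx, int mx, int my)
{
    c->put_h264_qpel_pixels_tab[size_idx][(my << 2) + mx](dst, src, stride);
}

Note that the 8x8 avg table leaves indices 4 and 12 (mc01/mc03) unset here, matching the set of ff_avg_h264_qpel8_*_lsx functions declared in h264qpel_loongarch.h below, so those positions keep the existing C or LASX implementations.
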
diff --git a/libavcodec/loongarch/h264qpel_lasx.c b/libavcodec/loongarch/h264qpel_lasx.c
index 1c142e510e..519bb03fe6 100644
--- a/libavcodec/loongarch/h264qpel_lasx.c
+++ b/libavcodec/loongarch/h264qpel_lasx.c
@@ -21,7 +21,7 @@ 
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "h264qpel_lasx.h"
+#include "h264qpel_loongarch.h"
 #include "libavutil/loongarch/loongson_intrinsics.h"
 #include "libavutil/attributes.h"
 
@@ -418,157 +418,6 @@  avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
     );
 }
 
-/* avg_pixels8_8_lsx   : dst = avg(src, dst)
- * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
-                     ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
-    ptrdiff_t stride_2, stride_3, stride_4;
-    __asm__ volatile (
-    /* h0~h7 */
-    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-
-    "vld        $vr8,            %[half],        0x00         \n\t"
-    "vld        $vr9,            %[half],        0x08         \n\t"
-    "vld        $vr10,           %[half],        0x10         \n\t"
-    "vld        $vr11,           %[half],        0x18         \n\t"
-    "vld        $vr12,           %[half],        0x20         \n\t"
-    "vld        $vr13,           %[half],        0x28         \n\t"
-    "vld        $vr14,           %[half],        0x30         \n\t"
-    "vld        $vr15,           %[half],        0x38         \n\t"
-
-    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
-    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
-    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
-    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
-    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
-    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
-    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
-    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
-
-    "vstelm.d   $vr0,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr1,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr2,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr3,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr4,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr5,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr6,            %[dst],         0,  0        \n\t"
-    "add.d      %[dst],          %[dst],         %[dstStride] \n\t"
-    "vstelm.d   $vr7,            %[dst],         0,  0        \n\t"
-    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
-      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
-      [stride_4]"=&r"(stride_4)
-    : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
-    : "memory"
-    );
-}
-
-/* avg_pixels8_8_lsx   : dst = avg(src, dst)
- * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
-                     ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
-    uint8_t *tmp = dst;
-    ptrdiff_t stride_2, stride_3, stride_4;
-    __asm__ volatile (
-    /* h0~h7 */
-    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-
-    "vld        $vr8,            %[half],        0x00         \n\t"
-    "vld        $vr9,            %[half],        0x08         \n\t"
-    "vld        $vr10,           %[half],        0x10         \n\t"
-    "vld        $vr11,           %[half],        0x18         \n\t"
-    "vld        $vr12,           %[half],        0x20         \n\t"
-    "vld        $vr13,           %[half],        0x28         \n\t"
-    "vld        $vr14,           %[half],        0x30         \n\t"
-    "vld        $vr15,           %[half],        0x38         \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "slli.d     %[stride_2],     %[dstStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[dstStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "vld        $vr8,            %[tmp],         0            \n\t"
-    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr10,           %[tmp],         %[stride_2]  \n\t"
-    "vldx       $vr11,           %[tmp],         %[stride_3]  \n\t"
-    "add.d      %[tmp],          %[tmp],         %[stride_4]  \n\t"
-    "vld        $vr12,           %[tmp],         0            \n\t"
-    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr14,           %[tmp],         %[stride_2]  \n\t"
-    "vldx       $vr15,           %[tmp],         %[stride_3]  \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "vstelm.d    $vr0,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr1,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr2,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr3,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr4,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr5,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr6,           %[dst],         0,  0        \n\t"
-    "add.d       %[dst],         %[dst],         %[dstStride] \n\t"
-    "vstelm.d    $vr7,           %[dst],         0,  0        \n\t"
-    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
-      [src]"+&r"(src), [stride_2]"=&r"(stride_2),
-      [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
-    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
-    : "memory"
-    );
-}
-
 /* put_pixels16_8_lsx: dst = src */
 static av_always_inline void
 put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
@@ -729,254 +578,6 @@  avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
     );
 }
 
-/* avg_pixels16_8_lsx   : dst = avg(src, dst)
- * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
-                      ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
-    ptrdiff_t stride_2, stride_3, stride_4;
-    ptrdiff_t dstride_2, dstride_3, dstride_4;
-    __asm__ volatile (
-    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
-    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
-    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
-    /* h0~h7 */
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-
-    "vld        $vr8,            %[half],        0x00         \n\t"
-    "vld        $vr9,            %[half],        0x10         \n\t"
-    "vld        $vr10,           %[half],        0x20         \n\t"
-    "vld        $vr11,           %[half],        0x30         \n\t"
-    "vld        $vr12,           %[half],        0x40         \n\t"
-    "vld        $vr13,           %[half],        0x50         \n\t"
-    "vld        $vr14,           %[half],        0x60         \n\t"
-    "vld        $vr15,           %[half],        0x70         \n\t"
-
-    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
-    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
-    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
-    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
-    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
-    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
-    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
-    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
-
-    "vst        $vr0,            %[dst],         0            \n\t"
-    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-    "vst        $vr4,            %[dst],         0            \n\t"
-    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-
-    /* h8~h15 */
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-
-    "vld        $vr8,            %[half],        0x80         \n\t"
-    "vld        $vr9,            %[half],        0x90         \n\t"
-    "vld        $vr10,           %[half],        0xa0         \n\t"
-    "vld        $vr11,           %[half],        0xb0         \n\t"
-    "vld        $vr12,           %[half],        0xc0         \n\t"
-    "vld        $vr13,           %[half],        0xd0         \n\t"
-    "vld        $vr14,           %[half],        0xe0         \n\t"
-    "vld        $vr15,           %[half],        0xf0         \n\t"
-
-    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
-    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
-    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
-    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
-    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
-    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
-    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
-    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
-
-    "vst        $vr0,            %[dst],         0            \n\t"
-    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-    "vst        $vr4,            %[dst],         0            \n\t"
-    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
-    : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
-      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
-      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
-      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
-    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
-    : "memory"
-    );
-}
-
-/* avg_pixels16_8_lsx    : dst = avg(src, dst)
- * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
-                      ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
-    uint8_t *tmp = dst;
-    ptrdiff_t stride_2, stride_3, stride_4;
-    ptrdiff_t dstride_2, dstride_3, dstride_4;
-    __asm__ volatile (
-    "slli.d     %[stride_2],     %[srcStride],   1            \n\t"
-    "add.d      %[stride_3],     %[stride_2],    %[srcStride] \n\t"
-    "slli.d     %[stride_4],     %[stride_2],    1            \n\t"
-    "slli.d     %[dstride_2],    %[dstStride],   1            \n\t"
-    "add.d      %[dstride_3],    %[dstride_2],   %[dstStride] \n\t"
-    "slli.d     %[dstride_4],    %[dstride_2],   1            \n\t"
-    /* h0~h7 */
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-
-    "vld        $vr8,            %[half],        0x00         \n\t"
-    "vld        $vr9,            %[half],        0x10         \n\t"
-    "vld        $vr10,           %[half],        0x20         \n\t"
-    "vld        $vr11,           %[half],        0x30         \n\t"
-    "vld        $vr12,           %[half],        0x40         \n\t"
-    "vld        $vr13,           %[half],        0x50         \n\t"
-    "vld        $vr14,           %[half],        0x60         \n\t"
-    "vld        $vr15,           %[half],        0x70         \n\t"
-
-    "vavgr.bu   $vr0,            $vr8,           $vr0         \n\t"
-    "vavgr.bu   $vr1,            $vr9,           $vr1         \n\t"
-    "vavgr.bu   $vr2,            $vr10,          $vr2         \n\t"
-    "vavgr.bu   $vr3,            $vr11,          $vr3         \n\t"
-    "vavgr.bu   $vr4,            $vr12,          $vr4         \n\t"
-    "vavgr.bu   $vr5,            $vr13,          $vr5         \n\t"
-    "vavgr.bu   $vr6,            $vr14,          $vr6         \n\t"
-    "vavgr.bu   $vr7,            $vr15,          $vr7         \n\t"
-
-    "vld        $vr8,            %[tmp],         0            \n\t"
-    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
-    "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
-    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
-    "vld        $vr12,           %[tmp],         0            \n\t"
-    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
-    "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
-    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "vst        $vr0,            %[dst],         0            \n\t"
-    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-    "vst        $vr4,            %[dst],         0            \n\t"
-    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-
-    /* h8~h15    */
-    "vld        $vr0,            %[src],         0            \n\t"
-    "vldx       $vr1,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr2,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr3,            %[src],         %[stride_3]  \n\t"
-    "add.d      %[src],          %[src],         %[stride_4]  \n\t"
-    "vld        $vr4,            %[src],         0            \n\t"
-    "vldx       $vr5,            %[src],         %[srcStride] \n\t"
-    "vldx       $vr6,            %[src],         %[stride_2]  \n\t"
-    "vldx       $vr7,            %[src],         %[stride_3]  \n\t"
-
-    "vld        $vr8,            %[half],        0x80         \n\t"
-    "vld        $vr9,            %[half],        0x90         \n\t"
-    "vld        $vr10,           %[half],        0xa0         \n\t"
-    "vld        $vr11,           %[half],        0xb0         \n\t"
-    "vld        $vr12,           %[half],        0xc0         \n\t"
-    "vld        $vr13,           %[half],        0xd0         \n\t"
-    "vld        $vr14,           %[half],        0xe0         \n\t"
-    "vld        $vr15,           %[half],        0xf0         \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "vld        $vr8,            %[tmp],         0            \n\t"
-    "vldx       $vr9,            %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr10,           %[tmp],         %[dstride_2] \n\t"
-    "vldx       $vr11,           %[tmp],         %[dstride_3] \n\t"
-    "add.d      %[tmp],          %[tmp],         %[dstride_4] \n\t"
-    "vld        $vr12,           %[tmp],         0            \n\t"
-    "vldx       $vr13,           %[tmp],         %[dstStride] \n\t"
-    "vldx       $vr14,           %[tmp],         %[dstride_2] \n\t"
-    "vldx       $vr15,           %[tmp],         %[dstride_3] \n\t"
-
-    "vavgr.bu    $vr0,           $vr8,           $vr0         \n\t"
-    "vavgr.bu    $vr1,           $vr9,           $vr1         \n\t"
-    "vavgr.bu    $vr2,           $vr10,          $vr2         \n\t"
-    "vavgr.bu    $vr3,           $vr11,          $vr3         \n\t"
-    "vavgr.bu    $vr4,           $vr12,          $vr4         \n\t"
-    "vavgr.bu    $vr5,           $vr13,          $vr5         \n\t"
-    "vavgr.bu    $vr6,           $vr14,          $vr6         \n\t"
-    "vavgr.bu    $vr7,           $vr15,          $vr7         \n\t"
-
-    "vst        $vr0,            %[dst],         0            \n\t"
-    "vstx       $vr1,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr2,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr3,            %[dst],         %[dstride_3] \n\t"
-    "add.d      %[dst],          %[dst],         %[dstride_4] \n\t"
-    "vst        $vr4,            %[dst],         0            \n\t"
-    "vstx       $vr5,            %[dst],         %[dstStride] \n\t"
-    "vstx       $vr6,            %[dst],         %[dstride_2] \n\t"
-    "vstx       $vr7,            %[dst],         %[dstride_3] \n\t"
-    : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
-      [stride_2]"=&r"(stride_2),  [stride_3]"=&r"(stride_3),
-      [stride_4]"=&r"(stride_4),  [dstride_2]"=&r"(dstride_2),
-      [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
-    : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
-    : "memory"
-    );
-}
-
 #define QPEL8_H_LOWPASS(out_v)                                               \
     src00 = __lasx_xvld(src, - 2);                                           \
     src += srcStride;                                                        \
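
The put/avg_pixels{8,16}_l2_8_lsx helpers removed from this file are re-implemented as standalone assembly in h264qpel.S above. In scalar terms they produce a rounded average of the source rows and the intermediate "half" buffer, and the avg variants average that result once more with the existing destination. A minimal C sketch of the 8-wide case, assuming the fixed half stride of 8 noted in the deleted comments (vavgr.bu corresponds to the (a + b + 1) >> 1 rounding below):

#include <stdint.h>
#include <stddef.h>

#define RND_AVG(a, b) (((a) + (b) + 1) >> 1)

/* put_pixels8_l2: dst = avg(src, half), half stride is 8. */
static void put_pixels8_l2_sketch(uint8_t *dst, const uint8_t *src,
                                  const uint8_t *half,
                                  ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = RND_AVG(src[x], half[x]);
        dst  += dstStride;
        src  += srcStride;
        half += 8;
    }
}

/* avg_pixels8_l2: dst = avg(avg(src, half), dst). */
static void avg_pixels8_l2_sketch(uint8_t *dst, const uint8_t *src,
                                  const uint8_t *half,
                                  ptrdiff_t dstStride, ptrdiff_t srcStride)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = RND_AVG(dst[x], RND_AVG(src[x], half[x]));
        dst  += dstStride;
        src  += srcStride;
        half += 8;
    }
}
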
diff --git a/libavcodec/loongarch/h264qpel_lasx.h b/libavcodec/loongarch/h264qpel_lasx.h
deleted file mode 100644
index 32b6b50917..0000000000
--- a/libavcodec/loongarch/h264qpel_lasx.h
+++ /dev/null
@@ -1,158 +0,0 @@ 
-/*
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H
-#define AVCODEC_LOONGARCH_H264QPEL_LASX_H
-
-#include <stdint.h>
-#include <stddef.h>
-#include "libavcodec/h264.h"
-
-void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
-                                   int alpha, int beta, int8_t *tc0);
-void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
-                                   int alpha, int beta, int8_t *tc0);
-void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
-                                  ptrdiff_t dst_stride);
-
-void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t stride);
-void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
-                                 ptrdiff_t dst_stride);
-#endif  // #ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H
diff --git a/libavcodec/loongarch/h264qpel_loongarch.h b/libavcodec/loongarch/h264qpel_loongarch.h
new file mode 100644
index 0000000000..68232730da
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel_loongarch.h
@@ -0,0 +1,312 @@ 
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavcodec/h264.h"
+#include "config.h"
+
+void put_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+                           ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+                          ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void avg_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src, int dstStride,
+                                  int srcStride);
+void avg_h264_qpel8_v_lowpass_lsx(uint8_t *dst, uint8_t *src, int dstStride,
+                                  int srcStride);
+void avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+                           ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+                          ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void ff_put_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride);
+
+#if HAVE_LASX
+void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
+                                   int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
+                                   int alpha, int beta, int8_t *tc0);
+void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                  ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t dst_stride);
+#endif
+
+#endif  // #ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
diff --git a/libavcodec/loongarch/h264qpel_lsx.c b/libavcodec/loongarch/h264qpel_lsx.c
new file mode 100644
index 0000000000..12b3bae6d1
--- /dev/null
+++ b/libavcodec/loongarch/h264qpel_lsx.c
@@ -0,0 +1,487 @@ 
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264qpel_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "libavutil/attributes.h"
+
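+/*
+ * The ff_{put,avg}_h264_qpel{16,8}_mcXY_lsx functions below follow the
+ * layout of the generic C versions: X and Y are the horizontal and vertical
+ * quarter-pel offsets. Half-pel planes are produced by the LSX kernels
+ * (put/avg_h264_qpel8_{h,v,hv}_lowpass_lsx) and the quarter-pel positions
+ * are formed by averaging the source and/or two half-pel planes with the
+ * put/avg_pixels{8,16}_l2_8_lsx helpers. The 16x16 wrappers process the
+ * block as four 8x8 quadrants.
+ */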
+static void put_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    put_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+static void put_h264_qpel16_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                          int dstStride, int srcStride)
+{
+    put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+    put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                          int dstStride, int srcStride)
+{
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
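+/* mc21/mc12/mc32/mc23: average of the centre (HV) half-pel plane with the
+ * neighbouring horizontal or vertical half-pel plane. */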
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfV  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfV, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfV, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfV  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfV, src + 1, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfV, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
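+/* The avg_ variants interpolate in the same way as the put_ ones but average
+ * the result with the pixels already present in dst. */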
+static void avg_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                          int dstStride, int srcStride)
+{
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+    src += 8*srcStride;
+    dst += 8*dstStride;
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+    avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avg_h264_qpel16_v_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t half[256];
+
+    put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 256;
+
+    put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t half[256];
+
+    put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfV  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfV, src + 1, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfV, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    uint8_t temp[512];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfV  = temp + 256;
+
+    put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+    put_h264_qpel16_v_lowpass_lsx(halfV, src, 16, stride);
+    avg_pixels16_l2_8_lsx(dst, halfV, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+                                           ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+    src += srcStride << 3;
+    dst += dstStride << 3;
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+    avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                 ptrdiff_t stride)
+{
+    avg_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
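+/* 8x8 block size: the kernels are invoked directly on the full block (no
+ * quadrant splitting); quarter-pel positions blend the kernel output with
+ * the source or with a second half-pel plane. */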
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
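+/* Diagonal positions mc33/mc13/mc31/mc11: average of a horizontal and a
+ * vertical half-pel plane; the src offsets pick the half-pel row/column. */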
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
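+/* mc32/mc21/mc23/mc12: average of the centre (HV) plane with the adjacent
+ * vertical or horizontal half-pel plane, as for the 16x16 versions above. */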
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfV  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfV, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfV  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    put_pixels8_l2_8_lsx(dst, halfV, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    put_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t half[64];
+
+    put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfV  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfV, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    avg_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfHV = temp;
+    uint8_t *const halfV  = temp + 64;
+
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfV, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t temp[128];
+    uint8_t *const halfH  = temp;
+    uint8_t *const halfHV = temp + 64;
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+                                ptrdiff_t stride)
+{
+    uint8_t halfH[64];
+    uint8_t halfV[64];
+
+    put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+    put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+    avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}