@@ -31,5 +31,7 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
loongarch/h264idct_loongarch.o \
loongarch/h264dsp.o
+LSX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel.o \
+ loongarch/h264qpel_lsx.o
LSX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma.o
LSX-OBJS-$(CONFIG_H264PRED) += loongarch/h264intrapred.o
new file mode 100644
@@ -0,0 +1,1686 @@
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro VLD_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
+ vld vr0, \in4, 0
+ vldx vr1, \in4, a2
+ QPEL8_H_LSX \in0, \in1
+ vssrani.bu.h \in0, \in2, 5
+ vssrani.bu.h \in1, \in3, 5
+.endm
+
+.macro VLDX_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4
+ vldx vr0, \in4, t1
+ vldx vr1, \in4, t2
+ QPEL8_H_LSX \in0, \in1
+ vssrani.bu.h \in0, \in2, 5
+ vssrani.bu.h \in1, \in3, 5
+.endm
+
+.macro VLD_DOUBLE_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+ vld vr0, \in8, 0
+ vldx vr1, \in8, a2
+ QPEL8_H_LSX \in0, \in1
+ vssrani.bu.h \in0, \in4, 5
+ vssrani.bu.h \in1, \in5, 5
+ vldx vr0, \in8, t1
+ vldx vr1, \in8, t2
+ QPEL8_H_LSX \in2, \in3
+ vssrani.bu.h \in2, \in6, 5
+ vssrani.bu.h \in3, \in7, 5
+.endm
+
+function ff_put_h264_qpel16_mc00_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ slli.d t2, t0, 1
+.rept 4
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ vstx vr2, a0, t0
+ vstx vr3, a0, t1
+ add.d a0, a0, t2
+.endr
+endfunc
+
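+// QPEL8_H_LSX: H.264 6-tap luma half-pel filter applied horizontally to the
+// two 16-byte source rows held in vr0/vr1.  Each output sample is the
+// saturated 16-bit value 20*(c+d) - 5*(b+e) + (a+f) + 16; the caller packs it
+// with vssrani.bu.h ..., 5 (shift by 5 and clip to u8).  Expects vr20 = 20,
+// vr21 = 5 and vr22 = 16 (rounding) replicated as halfwords.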
+.macro QPEL8_H_LSX out0, out1
+ vbsrl.v vr2, vr0, 1
+ vbsrl.v vr3, vr1, 1
+ vbsrl.v vr4, vr0, 2
+ vbsrl.v vr5, vr1, 2
+ vbsrl.v vr6, vr0, 3
+ vbsrl.v vr7, vr1, 3
+ vbsrl.v vr8, vr0, 4
+ vbsrl.v vr9, vr1, 4
+ vbsrl.v vr10, vr0, 5
+ vbsrl.v vr11, vr1, 5
+
+ vilvl.b vr6, vr4, vr6
+ vilvl.b vr7, vr5, vr7
+ vilvl.b vr8, vr2, vr8
+ vilvl.b vr9, vr3, vr9
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr1, vr11
+ vhaddw.hu.bu vr6, vr6, vr6
+ vhaddw.hu.bu vr7, vr7, vr7
+ vhaddw.hu.bu vr8, vr8, vr8
+ vhaddw.hu.bu vr9, vr9, vr9
+ vhaddw.hu.bu vr10, vr10, vr10
+ vhaddw.hu.bu vr11, vr11, vr11
+ vmul.h vr2, vr6, vr20
+ vmul.h vr3, vr7, vr20
+ vmul.h vr4, vr8, vr21
+ vmul.h vr5, vr9, vr21
+ vssub.h vr2, vr2, vr4
+ vssub.h vr3, vr3, vr5
+ vsadd.h vr2, vr2, vr10
+ vsadd.h vr3, vr3, vr11
+ vsadd.h \out0, vr2, vr22
+ vsadd.h \out1, vr3, vr22
+.endm
+
+.macro VLD_DOUBLE_QPEL8_H_LSX in0, in1, in2, in3, in4
+ vld vr0, \in4, 0
+ vldx vr1, \in4, a2
+ QPEL8_H_LSX \in0, \in1
+ vldx vr0, \in4, t1
+ vldx vr1, \in4, t2
+ QPEL8_H_LSX \in2, \in3
+.endm
+
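+// mc10/mc30: horizontal quarter-pel.  The half-pel samples from QPEL8_H_LSX
+// are averaged with the nearest full-pel column, t8 = src for mc10 and
+// t8 = src + 1 for mc30.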
+.macro put_h264_qpel16 in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+.ifc \in0, 10
+ addi.d t8, a1, 0
+.else
+ addi.d t8, a1, 1
+.endif
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ addi.d a1, t0, 8 // a1 = t0 + 8
+.rept 4
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+ vld vr10, t8, 0
+ vldx vr11, t8, a2
+ vavgr.bu vr0, vr2, vr10
+ vavgr.bu vr1, vr3, vr11
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1
+ vldx vr12, t8, t1
+ vldx vr13, t8, t2
+ vavgr.bu vr2, vr4, vr12
+ vavgr.bu vr3, vr5, vr13
+ vstx vr2, a0, t1
+ vstx vr3, a0, t2
+ alsl.d a0, a2, a0, 2
+ alsl.d t8, a2, t8, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+.endr
+endfunc
+.endm
+
+put_h264_qpel16 10
+put_h264_qpel16 30
+
+function ff_put_h264_qpel16_mc20_lsx
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ addi.d a1, t0, 8 // a1 = t0 + 8
+.rept 4
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+ vst vr2, a0, 0
+ vstx vr3, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1
+ vstx vr4, a0, t1
+ vstx vr5, a0, t2
+ alsl.d a0, a2, a0, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+.endr
+endfunc
+
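+// QPEL8_V_LSX: vertical 6-tap filter over the seven input rows \in0..\in6.
+// The two filtered rows are rounded ((x + 16) >> 5) and packed to unsigned
+// bytes in vr13/vr14.  Uses vr20 = 20, vr21 = 5 and vr22 = 16.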
+.macro QPEL8_V_LSX in0, in1, in2, in3, in4, in5, in6
+ vilvl.b vr7, \in3, \in2
+ vilvl.b vr8, \in4, \in3
+ vilvl.b vr9, \in4, \in1
+ vilvl.b vr10, \in5, \in2
+ vilvl.b vr11, \in5, \in0
+ vilvl.b vr12, \in6, \in1
+ vhaddw.hu.bu vr7, vr7, vr7
+ vhaddw.hu.bu vr8, vr8, vr8
+ vhaddw.hu.bu vr9, vr9, vr9
+ vhaddw.hu.bu vr10, vr10, vr10
+ vhaddw.hu.bu vr11, vr11, vr11
+ vhaddw.hu.bu vr12, vr12, vr12
+ vmul.h vr7, vr7, vr20
+ vmul.h vr8, vr8, vr20
+ vmul.h vr9, vr9, vr21
+ vmul.h vr10, vr10, vr21
+ vssub.h vr7, vr7, vr9
+ vssub.h vr8, vr8, vr10
+ vsadd.h vr7, vr7, vr11
+ vsadd.h vr8, vr8, vr12
+ vsadd.h vr7, vr7, vr22
+ vsadd.h vr8, vr8, vr22
+
+ vilvh.b vr13, \in3, \in2
+ vilvh.b vr14, \in4, \in3
+ vilvh.b vr15, \in4, \in1
+ vilvh.b vr16, \in5, \in2
+ vilvh.b vr17, \in5, \in0
+ vilvh.b vr18, \in6, \in1
+ vhaddw.hu.bu vr13, vr13, vr13
+ vhaddw.hu.bu vr14, vr14, vr14
+ vhaddw.hu.bu vr15, vr15, vr15
+ vhaddw.hu.bu vr16, vr16, vr16
+ vhaddw.hu.bu vr17, vr17, vr17
+ vhaddw.hu.bu vr18, vr18, vr18
+ vmul.h vr13, vr13, vr20
+ vmul.h vr14, vr14, vr20
+ vmul.h vr15, vr15, vr21
+ vmul.h vr16, vr16, vr21
+ vssub.h vr13, vr13, vr15
+ vssub.h vr14, vr14, vr16
+ vsadd.h vr13, vr13, vr17
+ vsadd.h vr14, vr14, vr18
+ vsadd.h vr13, vr13, vr22
+ vsadd.h vr14, vr14, vr22
+ vssrani.bu.h vr13, vr7, 5
+ vssrani.bu.h vr14, vr8, 5
+.endm
+
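+// mc01/mc03: vertical quarter-pel.  The half-pel rows from QPEL8_V_LSX are
+// averaged with the nearest full-pel row, i.e. the current source row for
+// mc01 and the row below it for mc03.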
+.macro put_h264_qpel16_mc1 in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ sub.d t2, a1, t0 // t2 = src - 2 * stride
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+
+ vld vr0, t2, 0
+ vldx vr1, t2, a2
+ vldx vr2, t2, t0
+ vldx vr3, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr4, t2, 0
+ vldx vr5, t2, a2
+ vldx vr6, t2, t0
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \in0, 01
+ vavgr.bu vr13, vr2, vr13
+ vavgr.bu vr14, vr3, vr14
+.else
+ vavgr.bu vr13, vr3, vr13
+ vavgr.bu vr14, vr4, vr14
+.endif
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+
+ vldx vr0, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr1, t2, 0
+ QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+.ifc \in0, 01
+ vavgr.bu vr13, vr4, vr13
+ vavgr.bu vr14, vr5, vr14
+.else
+ vavgr.bu vr13, vr5, vr13
+ vavgr.bu vr14, vr6, vr14
+.endif
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vldx vr2, t2, a2
+ vldx vr3, t2, t0
+ QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+.ifc \in0, 01
+ vavgr.bu vr13, vr6, vr13
+ vavgr.bu vr14, vr0, vr14
+.else
+ vavgr.bu vr13, vr0, vr13
+ vavgr.bu vr14, vr1, vr14
+.endif
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+
+ vldx vr4, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr5, t2, 0
+ QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+.ifc \in0, 01
+ vavgr.bu vr13, vr1, vr13
+ vavgr.bu vr14, vr2, vr14
+.else
+ vavgr.bu vr13, vr2, vr13
+ vavgr.bu vr14, vr3, vr14
+.endif
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vldx vr6, t2, a2
+ vldx vr0, t2, t0
+ QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
+.ifc \in0, 01
+ vavgr.bu vr13, vr3, vr13
+ vavgr.bu vr14, vr4, vr14
+.else
+ vavgr.bu vr13, vr4, vr13
+ vavgr.bu vr14, vr5, vr14
+.endif
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+
+ vldx vr1, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr2, t2, 0
+ QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
+.ifc \in0, 01
+ vavgr.bu vr13, vr5, vr13
+ vavgr.bu vr14, vr6, vr14
+.else
+ vavgr.bu vr13, vr6, vr13
+ vavgr.bu vr14, vr0, vr14
+.endif
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vldx vr3, t2, a2
+ vldx vr4, t2, t0
+ QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
+.ifc \in0, 01
+ vavgr.bu vr13, vr0, vr13
+ vavgr.bu vr14, vr1, vr14
+.else
+ vavgr.bu vr13, vr1, vr13
+ vavgr.bu vr14, vr2, vr14
+.endif
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+
+ vldx vr5, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr6, t2, 0
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \in0, 01
+ vavgr.bu vr13, vr2, vr13
+ vavgr.bu vr14, vr3, vr14
+.else
+ vavgr.bu vr13, vr3, vr13
+ vavgr.bu vr14, vr4, vr14
+.endif
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+endfunc
+.endm
+
+put_h264_qpel16_mc1 01
+put_h264_qpel16_mc1 03
+
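+// Run the vertical filter and average the packed rows with two pre-computed
+// horizontal half-pel rows (\in7/\in8), then store: VST_ writes rows 0/1 of
+// the current block, VSTX_ rows 2/3 (offsets t1/t2).  Used by the diagonal
+// quarter-pel cases (mc11, mc13, mc31, mc33).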
+.macro VST_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+ QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
+ vavgr.bu vr13, \in7, vr13
+ vavgr.bu vr14, \in8, vr14
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+.endm
+
+.macro VSTX_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8
+ QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6
+ vavgr.bu vr13, \in7, vr13
+ vavgr.bu vr14, \in8, vr14
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+.endm
+
+function ff_put_h264_qpel16_mc11_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ slli.d t6, t1, 1
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t0, a1, -2 // t0 = src - 2
+ addi.d a1, t0, 8 // a1 = t0 + 8
+.rept 2
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ alsl.d t0, a2, t0, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t0
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, a1
+
+ vld vr0, t4, 0 // t4 = src - 2 * stride
+ vldx vr1, t4, a2
+ vldx vr2, t4, t1
+ vldx vr3, t4, t2
+ alsl.d t4, a2, t4, 2 // t4 = src + 2 * stride
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t4, t2
+ alsl.d t4, a2, t4, 2 // t4 = src + 6 * stride
+ vld vr1, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t4, a2
+ vldx vr3, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t4, t2
+ alsl.d t4, a2, t4, 2 // t4 = src + 10 * stride
+ vld vr5, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+ alsl.d t0, a2, t0, 2
+ alsl.d a1, a2, a1, 2 // a1 = src + 8 * stride
+ alsl.d a0, a2, a0, 2 // dst = dst + 8 * stride
+ sub.d t4, t4, t6
+.endr
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+function ff_avg_h264_qpel16_mc00_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ slli.d t2, t0, 1
+ addi.d t3, a0, 0
+.rept 4
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vld vr8, t3, 0
+ vldx vr9, t3, a2
+ vldx vr10, t3, t0
+ vldx vr11, t3, t1
+ add.d t3, t3, t2
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr1, vr9, vr1
+ vavgr.bu vr2, vr10, vr2
+ vavgr.bu vr3, vr11, vr3
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ vstx vr2, a0, t0
+ vstx vr3, a0, t1
+ add.d a0, a0, t2
+.endr
+endfunc
+
+.macro put_h264_qpel16_mc in0
+function ff_put_h264_qpel16_mc\in0\()_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d t0, a1, -2 // t0 = src - 2
+
+.ifc \in0, 33
+ add.d t0, t0, a2
+.endif
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t4, t4, 1
+
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ alsl.d a1, a2, t0, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ addi.d a1, t0, 8
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, a1
+ vld vr0, t4, 0 // t4 = src - 2 * stride + 1
+ vldx vr1, t4, a2
+ vldx vr2, t4, t1
+ vldx vr3, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr1, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ add.d t6, t4, zero // t6 = src + 6 * stride + 1
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t4, a2
+ vldx vr3, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr5, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+ alsl.d a1, a2, t0, 3 // a1 = src + 8 * stride
+ addi.d t5, a1, 8 // t5 = src + 8 * stride + 8
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, t5
+ alsl.d t5, a2, t5, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, t5
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ // t6 = src + 6 * stride + 1
+ vld vr0, t6, 0
+ vldx vr1, t6, a2
+ vldx vr2, t6, t1
+ vldx vr3, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr4, t6, 0
+ vldx vr5, t6, a2
+ vldx vr6, t6, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr1, t6, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t6, a2
+ vldx vr3, t6, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr5, t6, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+.endm
+
+put_h264_qpel16_mc 33
+put_h264_qpel16_mc 31
+
+function ff_put_h264_qpel16_mc13_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t0, t0, a2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ alsl.d a1, a2, t0, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ addi.d a1, t0, 8
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, a1
+ vld vr0, t4, 0 // t4 = src - 2 * stride + 1
+ vldx vr1, t4, a2
+ vldx vr2, t4, t1
+ vldx vr3, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr1, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ add.d t6, t4, zero
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t4, a2
+ vldx vr3, t4, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr5, t4, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+ alsl.d a1, a2, t0, 3 // a1 = src + 8 * stride
+ addi.d t5, a1, 8 // t5 = src + 8 * stride + 8
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, t5
+ alsl.d t5, a2, t5, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, t5
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vld vr0, t6, 0 // t6 = src + 6 * stride + 1
+ vldx vr1, t6, a2
+ vldx vr2, t6, t1
+ vldx vr3, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr4, t6, 0
+ vldx vr5, t6, a2
+ vldx vr6, t6, t1
+ VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24
+ vldx vr0, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr1, t6, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t6, a2
+ vldx vr3, t6, t1
+ VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28
+ vldx vr4, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr5, t6, 0
+ VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+endfunc
+
+function ff_avg_h264_qpel16_mc10_lsx
+ addi.d t0, a0, 0 // t0 = dst
+ addi.d t4, a1, -2 // t4 = src - 2
+ addi.d t5, t4, 8
+ slli.d t1, a2, 1
+ add.d t2, a2, t1
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+.rept 2
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4
+ alsl.d t4, a2, t4, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vld vr12, t0, 0
+ vldx vr13, t0, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5
+ vldx vr0, a1, t1
+ vldx vr1, a1, t2
+ vldx vr12, t0, t1
+ vldx vr13, t0, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t5, a2, t5, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+ alsl.d a0, a2, a0, 2
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vld vr12, t0, 0
+ vldx vr13, t0, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5
+ vldx vr0, a1, t1
+ vldx vr1, a1, t2
+ vldx vr12, t0, t1
+ vldx vr13, t0, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t5, a2, t5, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+ alsl.d a0, a2, a0, 2
+ alsl.d t4, a2, t4, 2 // t4 = src + 8 * stride - 2
+.endr
+endfunc
+
+function ff_avg_h264_qpel16_mc30_lsx
+ addi.d t0, a0, 0 // t0 = dst
+ addi.d t4, a1, -2 // t4 = src - 2
+ addi.d t5, t4, 8
+ addi.d a1, a1, 1 // a1 = a1 + 1
+ slli.d t1, a2, 1
+ add.d t2, a2, t1
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+.rept 2
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4
+ alsl.d t4, a2, t4, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vld vr12, t0, 0
+ vldx vr13, t0, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5
+ vldx vr0, a1, t1
+ vldx vr1, a1, t2
+ vldx vr12, t0, t1
+ vldx vr13, t0, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t5, a2, t5, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+ alsl.d a0, a2, a0, 2
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vld vr12, t0, 0
+ vldx vr13, t0, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5
+ vldx vr0, a1, t1
+ vldx vr1, a1, t2
+ vldx vr12, t0, t1
+ vldx vr13, t0, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vavgr.bu vr0, vr0, vr12
+ vavgr.bu vr1, vr1, vr13
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t5, a2, t5, 2
+ alsl.d a1, a2, a1, 2
+ alsl.d t0, a2, t0, 2
+ alsl.d a0, a2, a0, 2
+ alsl.d t4, a2, t4, 2 // t4 = src + 8 * stride - 2
+.endr
+endfunc
+
+function ff_put_h264_qpel16_mc02_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ sub.d t2, a1, t0 // t2 = src - 2 * stride
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+
+ vld vr0, t2, 0
+ vldx vr1, t2, a2
+ vldx vr2, t2, t0
+ vldx vr3, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr4, t2, 0
+ vldx vr5, t2, a2
+ vldx vr6, t2, t0
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr0, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr1, t2, 0
+ QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr2, t2, a2
+ vldx vr3, t2, t0
+ QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr4, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr5, t2, 0
+ QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+
+ vldx vr6, t2, a2
+ vldx vr0, t2, t0
+ QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr1, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr2, t2, 0
+ QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ vldx vr3, t2, a2
+ vldx vr4, t2, t0
+ QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr5, t2, t1
+ alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride
+ vld vr6, t2, 0
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+ vstx vr13, a0, t0
+ vstx vr14, a0, t1
+endfunc
+
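+// Shared body of the ff_avg_h264_qpel16 diagonal cases (mc11/mc13/mc31/mc33):
+// the horizontal half-pel rows kept in vr23..vr30 are averaged with the
+// vertical half-pel result and then with the existing destination pixels
+// loaded through t8.  Callers set up t0/t4 (filter source pointers), t1/t2
+// (stride multiples) and t8 (dst copy) before expanding the macro.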
+.macro avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+ addi.d sp, sp, -64
+ fst.d f24, sp, 0
+ fst.d f25, sp, 8
+ fst.d f26, sp, 16
+ fst.d f27, sp, 24
+ fst.d f28, sp, 32
+ fst.d f29, sp, 40
+ fst.d f30, sp, 48
+ fst.d f31, sp, 56
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ alsl.d a1, a2, t0, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ addi.d a1, t0, 8
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, a1
+ vld vr0, t4, 0 // t4 = src - 2 * stride + 1
+ vldx vr1, t4, a2
+ vldx vr2, t4, t1
+ vldx vr3, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr4, t4, 0
+ vldx vr5, t4, a2
+ vldx vr6, t4, t1
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+ vld vr0, t8, 0
+ vldx vr1, t8, a2
+ vavgr.bu vr13, vr23, vr13
+ vavgr.bu vr14, vr24, vr14
+ vavgr.bu vr13, vr13, vr0
+ vavgr.bu vr14, vr14, vr1
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr0, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr1, t4, 0
+ QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+ vldx vr2, t8, t1
+ vldx vr3, t8, t2
+ vavgr.bu vr13, vr25, vr13
+ vavgr.bu vr14, vr26, vr14
+ vavgr.bu vr13, vr13, vr2
+ vavgr.bu vr14, vr14, vr3
+ add.d t6, t4, zero // t6 = src + 6 * stride
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ alsl.d t8, a2, t8, 2
+ vldx vr2, t4, a2
+ vldx vr3, t4, t1
+ QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+ vld vr4, t8, 0
+ vldx vr5, t8, a2
+ vavgr.bu vr13, vr27, vr13
+ vavgr.bu vr14, vr28, vr14
+ vavgr.bu vr13, vr13, vr4
+ vavgr.bu vr14, vr14, vr5
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr4, t4, t2
+ alsl.d t4, a2, t4, 2
+ vld vr5, t4, 0
+ QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+ vldx vr6, t8, t1
+ vldx vr0, t8, t2
+ vavgr.bu vr13, vr29, vr13
+ vavgr.bu vr14, vr30, vr14
+ vavgr.bu vr13, vr13, vr6
+ vavgr.bu vr14, vr14, vr0
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+ alsl.d a1, a2, t0, 3 // a1 = src + 8 * stride
+ addi.d t5, a1, 8 // t5 = src + 8 * stride + 8
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1
+ alsl.d a1, a2, a1, 2
+ VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \
+ vr14, vr15, t5
+ alsl.d t5, a2, t5, 2
+ VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \
+ vr18, vr19, t5
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ alsl.d t8, a2, t8, 2
+ // t6 = src + 6 * stride + 1
+ vld vr0, t6, 0
+ vldx vr1, t6, a2
+ vldx vr2, t6, t1
+ vldx vr3, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr4, t6, 0
+ vldx vr5, t6, a2
+ vldx vr6, t6, t1
+ QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+ vld vr0, t8, 0
+ vldx vr1, t8, a2
+ vavgr.bu vr13, vr23, vr13
+ vavgr.bu vr14, vr24, vr14
+ vavgr.bu vr13, vr13, vr0
+ vavgr.bu vr14, vr14, vr1
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr0, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr1, t6, 0
+ QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+ vldx vr2, t8, t1
+ vldx vr3, t8, t2
+ vavgr.bu vr13, vr25, vr13
+ vavgr.bu vr14, vr26, vr14
+ vavgr.bu vr13, vr13, vr2
+ vavgr.bu vr14, vr14, vr3
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+ alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride
+ alsl.d t8, a2, t8, 2
+ vldx vr2, t6, a2
+ vldx vr3, t6, t1
+ QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+ vld vr4, t8, 0
+ vldx vr5, t8, a2
+ vavgr.bu vr13, vr27, vr13
+ vavgr.bu vr14, vr28, vr14
+ vavgr.bu vr13, vr13, vr4
+ vavgr.bu vr14, vr14, vr5
+ vst vr13, a0, 0
+ vstx vr14, a0, a2
+ vldx vr4, t6, t2
+ alsl.d t6, a2, t6, 2
+ vld vr5, t6, 0
+ QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+ vldx vr6, t8, t1
+ vldx vr0, t8, t2
+ vavgr.bu vr13, vr29, vr13
+ vavgr.bu vr14, vr30, vr14
+ vavgr.bu vr13, vr13, vr6
+ vavgr.bu vr14, vr14, vr0
+ vstx vr13, a0, t1
+ vstx vr14, a0, t2
+ fld.d f24, sp, 0
+ fld.d f25, sp, 8
+ fld.d f26, sp, 16
+ fld.d f27, sp, 24
+ fld.d f28, sp, 32
+ fld.d f29, sp, 40
+ fld.d f30, sp, 48
+ fld.d f31, sp, 56
+ addi.d sp, sp, 64
+.endm
+
+function ff_avg_h264_qpel16_mc33_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t0, t0, a2 // t0 = src + stride - 2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t4, t4, 1
+ addi.d t8, a0, 0
+ avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc11_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t8, a0, 0
+ avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc31_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t4, t4, 1
+ addi.d t8, a0, 0
+ avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc13_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t0, t0, a2
+ add.d t3, a1, zero // t3 = src
+ sub.d t4, a1, t1 // t4 = src - 2 * stride
+ addi.d t8, a0, 0
+ avc_luma_hv_qrt_and_aver_dst_16x16_lsx
+endfunc
+
+function ff_avg_h264_qpel16_mc20_lsx
+ slli.d t1, a2, 1
+ add.d t2, t1, a2
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d t0, a1, -2 // t0 = src - 2
+ addi.d t5, a0, 0
+ addi.d a1, t0, 8
+.rept 4
+ VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1
+ vld vr0, t5, 0
+ vldx vr1, t5, a2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vst vr0, a0, 0
+ vstx vr1, a0, a2
+ add.d a1, a1, t1
+ VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, a1
+ vldx vr0, t5, t1
+ vldx vr1, t5, t2
+ vavgr.bu vr0, vr0, vr2
+ vavgr.bu vr1, vr1, vr3
+ vstx vr0, a0, t1
+ vstx vr1, a0, t2
+ alsl.d t0, a2, t0, 2
+ alsl.d t5, a2, t5, 2
+ alsl.d a0, a2, a0, 2
+ alsl.d a1, a2, a1, 1
+.endr
+endfunc
+
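+// QPEL8_HV_H_LSX: horizontal pass of the 2-D (hv) filter.  Same computation
+// as QPEL8_H_LSX but without the rounding constant; the 16-bit intermediates
+// are kept for the vertical pass in QPEL8_HV_V_LSX.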
+.macro QPEL8_HV_H_LSX out0, out1
+ vbsrl.v vr2, vr0, 1
+ vbsrl.v vr3, vr1, 1
+ vbsrl.v vr4, vr0, 2
+ vbsrl.v vr5, vr1, 2
+ vbsrl.v vr6, vr0, 3
+ vbsrl.v vr7, vr1, 3
+ vbsrl.v vr8, vr0, 4
+ vbsrl.v vr9, vr1, 4
+ vbsrl.v vr10, vr0, 5
+ vbsrl.v vr11, vr1, 5
+ vilvl.b vr6, vr4, vr6
+ vilvl.b vr7, vr5, vr7
+ vilvl.b vr8, vr2, vr8
+ vilvl.b vr9, vr3, vr9
+ vilvl.b vr10, vr0, vr10
+ vilvl.b vr11, vr1, vr11
+ vhaddw.hu.bu vr6, vr6, vr6
+ vhaddw.hu.bu vr7, vr7, vr7
+ vhaddw.hu.bu vr8, vr8, vr8
+ vhaddw.hu.bu vr9, vr9, vr9
+ vhaddw.hu.bu vr10, vr10, vr10
+ vhaddw.hu.bu vr11, vr11, vr11
+ vmul.h vr2, vr6, vr20
+ vmul.h vr3, vr7, vr20
+ vmul.h vr4, vr8, vr21
+ vmul.h vr5, vr9, vr21
+ vssub.h vr2, vr2, vr4
+ vssub.h vr3, vr3, vr5
+ vsadd.h \out0, vr2, vr10
+ vsadd.h \out1, vr3, vr11
+.endm
+
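+// QPEL8_HV_V_LSX: vertical 6-tap pass over seven rows of 16-bit horizontal
+// intermediates (\in0..\in6).  Sums are widened to 32 bits and filtered with
+// vr22 = 20 / vr23 = 5 (words) plus the rounding value 512 in vr24; the final
+// vssrani pair performs the combined (x + 512) >> 10 and packs two 8-pixel
+// rows of unsigned bytes into \out3.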
+.macro QPEL8_HV_V_LSX in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3
+ vilvl.h vr0, \in2, \in3
+ vilvl.h vr1, \in3, \in4 // tmp0
+ vilvl.h vr2, \in1, \in4
+ vilvl.h vr3, \in2, \in5 // tmp2
+ vilvl.h vr4, \in0, \in5
+ vilvl.h vr5, \in1, \in6 // tmp4
+ vhaddw.w.h vr0, vr0, vr0
+ vhaddw.w.h vr1, vr1, vr1
+ vhaddw.w.h vr2, vr2, vr2
+ vhaddw.w.h vr3, vr3, vr3
+ vhaddw.w.h vr4, vr4, vr4
+ vhaddw.w.h vr5, vr5, vr5
+ vmul.w vr0, vr0, vr22
+ vmul.w vr1, vr1, vr22
+ vmul.w vr2, vr2, vr23
+ vmul.w vr3, vr3, vr23
+ vssub.w vr0, vr0, vr2
+ vssub.w vr1, vr1, vr3
+ vsadd.w vr0, vr0, vr4
+ vsadd.w vr1, vr1, vr5
+ vsadd.w \out0, vr0, vr24
+ vsadd.w \out1, vr1, vr24
+ vilvh.h vr0, \in2, \in3
+ vilvh.h vr1, \in3, \in4 // tmp0
+ vilvh.h vr2, \in1, \in4
+ vilvh.h vr3, \in2, \in5 // tmp2
+ vilvh.h vr4, \in0, \in5
+ vilvh.h vr5, \in1, \in6 // tmp4
+ vhaddw.w.h vr0, vr0, vr0
+ vhaddw.w.h vr1, vr1, vr1
+ vhaddw.w.h vr2, vr2, vr2
+ vhaddw.w.h vr3, vr3, vr3
+ vhaddw.w.h vr4, vr4, vr4
+ vhaddw.w.h vr5, vr5, vr5
+ vmul.w vr0, vr0, vr22
+ vmul.w vr1, vr1, vr22
+ vmul.w vr2, vr2, vr23
+ vmul.w vr3, vr3, vr23
+ vssub.w vr0, vr0, vr2
+ vssub.w vr1, vr1, vr3
+ vsadd.w vr0, vr0, vr4
+ vsadd.w vr1, vr1, vr5
+ vsadd.w \out2, vr0, vr24
+ vsadd.w \out3, vr1, vr24
+ vssrani.hu.w \out2, \out0, 10
+ vssrani.hu.w \out3, \out1, 10
+ vssrani.bu.h \out3, \out2, 0
+.endm
+
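+// 8x8 hv (half-pel in both directions) lowpass core.  \in0 walks the source
+// with stride a3, \in1 the destination with stride a2.  Horizontal
+// intermediates are produced two rows at a time and fed to QPEL8_HV_V_LSX;
+// the avg variant additionally averages each output pair with the current
+// destination contents addressed through t3.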
+.macro h264_qpel8_hv_lowpass_core_lsx in0, in1, type
+ vld vr0, \in0, 0
+ vldx vr1, \in0, a3
+ QPEL8_HV_H_LSX vr12, vr13 // a b
+ vldx vr0, \in0, t1
+ vldx vr1, \in0, t2
+ QPEL8_HV_H_LSX vr14, vr15 // c d
+
+ alsl.d \in0, a3, \in0, 2
+
+ vld vr0, \in0, 0
+ vldx vr1, \in0, a3
+ QPEL8_HV_H_LSX vr16, vr17 // e f
+ vldx vr0, \in0, t1
+ vldx vr1, \in0, t2
+ QPEL8_HV_H_LSX vr18, vr19 // g h
+ QPEL8_HV_V_LSX vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr6, vr7, vr0, vr1
+.ifc \type, avg
+ fld.d f2, t3, 0
+ fldx.d f3, t3, a2
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr1, vr2, vr1
+.endif
+ vstelm.d vr1, \in1, 0, 0
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 1
+
+ alsl.d \in0, a3, \in0, 2
+
+ // tmp8
+ vld vr0, \in0, 0
+ vldx vr1, \in0, a3
+ QPEL8_HV_H_LSX vr12, vr13
+ QPEL8_HV_V_LSX vr14, vr15, vr16, vr17, vr18, vr19, vr12, vr6, vr7, vr0, vr1
+.ifc \type, avg
+ fldx.d f2, t3, t5
+ fldx.d f3, t3, t6
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr1, vr2, vr1
+.endif
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 0
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 1
+
+ // tmp10
+ vldx vr0, \in0, t1
+ vldx vr1, \in0, t2
+ QPEL8_HV_H_LSX vr14, vr15
+ QPEL8_HV_V_LSX vr16, vr17, vr18, vr19, vr12, vr13, vr14, vr6, vr7, vr0, vr1
+.ifc \type, avg
+ alsl.d t3, a2, t3, 2
+ fld.d f2, t3, 0
+ fldx.d f3, t3, a2
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr1, vr2, vr1
+.endif
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 0
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 1
+
+ // tmp12
+ alsl.d \in0, a3, \in0, 2
+
+ vld vr0, \in0, 0
+ vldx vr1, \in0, a3
+ QPEL8_HV_H_LSX vr16, vr17
+ QPEL8_HV_V_LSX vr18, vr19, vr12, vr13, vr14, vr15, vr16, vr6, vr7, vr0, vr1
+.ifc \type, avg
+ fldx.d f2, t3, t5
+ fldx.d f3, t3, t6
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr1, vr2, vr1
+.endif
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 0
+ add.d \in1, \in1, a2
+ vstelm.d vr1, \in1, 0, 1
+.endm
+
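+// put variant of the 8x8 hv lowpass: a0 = dst, a1 = src, a2 = dst stride,
+// a3 = src stride (register usage as consumed by the core macro above).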
+function put_h264_qpel8_hv_lowpass_lsx
+ slli.d t1, a3, 1
+ add.d t2, t1, a3
+ addi.d sp, sp, -8
+ fst.d f24, sp, 0
+ addi.d t0, a1, -2 // t0 = src - 2
+ sub.d t0, t0, t1 // t0 = t0 - 2 * stride
+ vldi vr20, 0x414 // h_20
+ vldi vr21, 0x405 // h_5
+ vldi vr22, 0x814 // w_20
+ vldi vr23, 0x805 // w_5
+ addi.d t4, zero, 512
+ vreplgr2vr.w vr24, t4 // w_512
+ h264_qpel8_hv_lowpass_core_lsx t0, a0, put
+ fld.d f24, sp, 0
+ addi.d sp, sp, 8
+endfunc
+
+function put_h264_qpel8_h_lowpass_lsx
+ slli.d t1, a3, 1
+ add.d t2, t1, a3
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t3, a1, zero // t3 = src
+.rept 2
+ vld vr0, t0, 0
+ vldx vr1, t0, a3
+ QPEL8_H_LSX vr12, vr13
+ vssrani.bu.h vr13, vr12, 5
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr13, a0, 0, 1
+ add.d a0, a0, a2
+ vldx vr0, t0, t1
+ vldx vr1, t0, t2
+ QPEL8_H_LSX vr12, vr13
+ vssrani.bu.h vr13, vr12, 5
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr13, a0, 0, 1
+ add.d a0, a0, a2
+ alsl.d t0, a3, t0, 2
+.endr
+endfunc
+
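+// put_pixels16_l2_8: dst = avg(src, half).  The half-pel buffer in a2 is laid
+// out contiguously with a 16-byte row stride; a3 and a4 are the dst and src
+// strides.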
+function put_pixels16_l2_8_lsx
+ slli.d t0, a4, 1
+ add.d t1, t0, a4
+ slli.d t2, t0, 1
+ slli.d t3, a3, 1
+ add.d t4, t3, a3
+ slli.d t5, t3, 1
+.rept 4
+ vld vr0, a1, 0
+ vldx vr1, a1, a4
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vld vr8, a2, 0x00
+ vld vr9, a2, 0x10
+ vld vr10, a2, 0x20
+ vld vr11, a2, 0x30
+ addi.d a2, a2, 0x40
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr1, vr9, vr1
+ vavgr.bu vr2, vr10, vr2
+ vavgr.bu vr3, vr11, vr3
+ vst vr0, a0, 0
+ vstx vr1, a0, a3
+ vstx vr2, a0, t3
+ vstx vr3, a0, t4
+ add.d a0, a0, t5
+.endr
+endfunc
+
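+// QPEL8_V1_LSX: 8-pixel-wide variant of QPEL8_V_LSX (low halves only); the
+// two filtered rows are rounded, shifted and packed together into vr8.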
+.macro QPEL8_V1_LSX in0, in1, in2, in3, in4, in5, in6
+ vilvl.b vr7, \in3, \in2
+ vilvl.b vr8, \in4, \in3
+ vilvl.b vr9, \in4, \in1
+ vilvl.b vr10, \in5, \in2
+ vilvl.b vr11, \in5, \in0
+ vilvl.b vr12, \in6, \in1
+ vhaddw.hu.bu vr7, vr7, vr7
+ vhaddw.hu.bu vr8, vr8, vr8
+ vhaddw.hu.bu vr9, vr9, vr9
+ vhaddw.hu.bu vr10, vr10, vr10
+ vhaddw.hu.bu vr11, vr11, vr11
+ vhaddw.hu.bu vr12, vr12, vr12
+ vmul.h vr7, vr7, vr20
+ vmul.h vr8, vr8, vr20
+ vmul.h vr9, vr9, vr21
+ vmul.h vr10, vr10, vr21
+ vssub.h vr7, vr7, vr9
+ vssub.h vr8, vr8, vr10
+ vsadd.h vr7, vr7, vr11
+ vsadd.h vr8, vr8, vr12
+ vsadd.h vr7, vr7, vr22
+ vsadd.h vr8, vr8, vr22
+ vssrani.bu.h vr8, vr7, 5
+.endm
+
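+// 8x8 vertical half-pel lowpass in put and avg flavours: a1/a3 are the source
+// pointer and stride, a0/a2 the destination; the avg variant also averages
+// with the existing destination rows through t3.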
+.macro h264_qpel8_v_lowpass_lsx type
+function \type\()_h264_qpel8_v_lowpass_lsx
+ slli.d t0, a3, 1
+ add.d t1, t0, a3
+ sub.d t2, a1, t0 // t2 = src - 2 * stride
+.ifc \type, avg
+ addi.d t3, a0, 0
+ slli.d t4, a2, 1
+ add.d t5, t4, a2
+.endif
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+
+ fld.d f0, t2, 0
+ fldx.d f1, t2, a3
+ fldx.d f2, t2, t0
+ fldx.d f3, t2, t1
+ alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride
+ fld.d f4, t2, 0
+ fldx.d f5, t2, a3
+ fldx.d f6, t2, t0
+ QPEL8_V1_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6
+.ifc \type, avg
+ fld.d f0, t3, 0
+ fldx.d f1, t3, a2
+ vilvl.d vr0, vr1, vr0
+ vavgr.bu vr8, vr8, vr0
+.endif
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr8, a0, 0, 1
+ add.d a0, a0, a2
+
+ fldx.d f0, t2, t1
+ alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride
+ fld.d f1, t2, 0
+ QPEL8_V1_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1
+.ifc \type, avg
+ fldx.d f2, t3, t4
+ fldx.d f3, t3, t5
+ vilvl.d vr2, vr3, vr2
+ vavgr.bu vr8, vr8, vr2
+.endif
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr8, a0, 0, 1
+ add.d a0, a0, a2
+
+ alsl.d t3, a2, t3, 2
+
+ fldx.d f2, t2, a3
+ fldx.d f3, t2, t0
+ QPEL8_V1_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3
+.ifc \type, avg
+ fld.d f4, t3, 0
+ fldx.d f5, t3, a2
+ vilvl.d vr4, vr5, vr4
+ vavgr.bu vr8, vr8, vr4
+.endif
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr8, a0, 0, 1
+ add.d a0, a0, a2
+
+ fldx.d f4, t2, t1
+ alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride
+ fld.d f5, t2, 0
+ QPEL8_V1_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5
+.ifc \type, avg
+ fldx.d f6, t3, t4
+ fldx.d f0, t3, t5
+ vilvl.d vr6, vr0, vr6
+ vavgr.bu vr8, vr8, vr6
+.endif
+ vstelm.d vr8, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr8, a0, 0, 1
+endfunc
+.endm
+
+h264_qpel8_v_lowpass_lsx put
+h264_qpel8_v_lowpass_lsx avg
+
+function avg_pixels16_l2_8_lsx
+ slli.d t0, a4, 1
+ add.d t1, t0, a4
+ slli.d t2, t0, 1
+ slli.d t3, a3, 1
+ add.d t4, t3, a3
+ slli.d t5, t3, 1
+ addi.d t6, a0, 0
+.rept 4
+ vld vr0, a1, 0
+ vldx vr1, a1, a4
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vld vr8, a2, 0x00
+ vld vr9, a2, 0x10
+ vld vr10, a2, 0x20
+ vld vr11, a2, 0x30
+ addi.d a2, a2, 0x40
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr1, vr9, vr1
+ vavgr.bu vr2, vr10, vr2
+ vavgr.bu vr3, vr11, vr3
+ vld vr8, t6, 0
+ vldx vr9, t6, a3
+ vldx vr10, t6, t3
+ vldx vr11, t6, t4
+ add.d t6, t6, t5
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr1, vr9, vr1
+ vavgr.bu vr2, vr10, vr2
+ vavgr.bu vr3, vr11, vr3
+ vst vr0, a0, 0
+ vstx vr1, a0, a3
+ vstx vr2, a0, t3
+ vstx vr3, a0, t4
+ add.d a0, a0, t5
+.endr
+endfunc
+
+function avg_h264_qpel8_hv_lowpass_lsx
+ slli.d t1, a3, 1
+ add.d t2, t1, a3
+ slli.d t5, a2, 1
+ add.d t6, a2, t5
+ addi.d sp, sp, -8
+ fst.d f24, sp, 0
+ vldi vr20, 0x414 // h_20
+ vldi vr21, 0x405 // h_5
+ vldi vr22, 0x814 // w_20
+ vldi vr23, 0x805 // w_5
+ addi.d t4, zero, 512
+ vreplgr2vr.w vr24, t4 // w_512
+ addi.d t0, a1, -2 // t0 = src - 2
+ sub.d t0, t0, t1 // t0 = t0 - 2 * stride
+ addi.d t3, a0, 0 // t3 = dst
+ h264_qpel8_hv_lowpass_core_lsx t0, a0, avg
+ fld.d f24, sp, 0
+ addi.d sp, sp, 8
+endfunc
+
+function put_pixels8_l2_8_lsx
+ slli.d t0, a4, 1
+ add.d t1, t0, a4
+ slli.d t2, t0, 1
+.rept 2
+ vld vr0, a1, 0
+ vldx vr1, a1, a4
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vilvl.d vr0, vr1, vr0
+ vilvl.d vr2, vr3, vr2
+ vld vr8, a2, 0x00
+ vld vr9, a2, 0x08
+ vld vr10, a2, 0x10
+ vld vr11, a2, 0x18
+ vilvl.d vr8, vr9, vr8
+ vilvl.d vr10, vr11, vr10
+ addi.d a2, a2, 32
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr2, vr10, vr2
+ vstelm.d vr0, a0, 0, 0
+ add.d a0, a0, a3
+ vstelm.d vr0, a0, 0, 1
+ add.d a0, a0, a3
+ vstelm.d vr2, a0, 0, 0
+ add.d a0, a0, a3
+ vstelm.d vr2, a0, 0, 1
+ add.d a0, a0, a3
+.endr
+endfunc
+
+function ff_put_h264_qpel8_mc00_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ slli.d t2, t0, 1
+ ld.d t3, a1, 0x0
+ ldx.d t4, a1, a2
+ ldx.d t5, a1, t0
+ ldx.d t6, a1, t1
+ st.d t3, a0, 0x0
+ stx.d t4, a0, a2
+ stx.d t5, a0, t0
+ stx.d t6, a0, t1
+ add.d a1, a1, t2
+ add.d a0, a0, t2
+ ld.d t3, a1, 0x0
+ ldx.d t4, a1, a2
+ ldx.d t5, a1, t0
+ ldx.d t6, a1, t1
+ st.d t3, a0, 0x0
+ stx.d t4, a0, a2
+ stx.d t5, a0, t0
+ stx.d t6, a0, t1
+endfunc
+
+function ff_avg_h264_qpel8_mc00_lsx
+ slli.d t0, a2, 1
+ add.d t1, t0, a2
+ slli.d t2, t0, 1
+ addi.d t3, a0, 0
+.rept 2
+ vld vr0, a1, 0
+ vldx vr1, a1, a2
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vilvl.d vr0, vr1, vr0
+ vilvl.d vr2, vr3, vr2
+ vld vr8, t3, 0
+ vldx vr9, t3, a2
+ vldx vr10, t3, t0
+ vldx vr11, t3, t1
+ add.d t3, t3, t2
+ vilvl.d vr8, vr9, vr8
+ vilvl.d vr10, vr11, vr10
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr2, vr10, vr2
+ vstelm.d vr0, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr0, a0, 0, 1
+ add.d a0, a0, a2
+ vstelm.d vr2, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr2, a0, 0, 1
+ add.d a0, a0, a2
+.endr
+endfunc
+
+function avg_pixels8_l2_8_lsx
+ slli.d t0, a4, 1
+ add.d t1, t0, a4
+ slli.d t2, t0, 1
+ addi.d t3, a0, 0
+ slli.d t4, a3, 1
+ add.d t5, t4, a3
+ slli.d t6, t4, 1
+.rept 2
+ vld vr0, a1, 0
+ vldx vr1, a1, a4
+ vldx vr2, a1, t0
+ vldx vr3, a1, t1
+ add.d a1, a1, t2
+ vilvl.d vr0, vr1, vr0
+ vilvl.d vr2, vr3, vr2
+ vld vr8, a2, 0x00
+ vld vr9, a2, 0x08
+ vld vr10, a2, 0x10
+ vld vr11, a2, 0x18
+ addi.d a2, a2, 0x20
+ vilvl.d vr8, vr9, vr8
+ vilvl.d vr10, vr11, vr10
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr2, vr10, vr2
+ vld vr8, t3, 0
+ vldx vr9, t3, a3
+ vldx vr10, t3, t4
+ vldx vr11, t3, t5
+ add.d t3, t3, t6
+ vilvl.d vr8, vr9, vr8
+ vilvl.d vr10, vr11, vr10
+ vavgr.bu vr0, vr8, vr0
+ vavgr.bu vr2, vr10, vr2
+ vstelm.d vr0, a0, 0, 0
+ add.d a0, a0, a3
+ vstelm.d vr0, a0, 0, 1
+ add.d a0, a0, a3
+ vstelm.d vr2, a0, 0, 0
+ add.d a0, a0, a3
+ vstelm.d vr2, a0, 0, 1
+ add.d a0, a0, a3
+.endr
+endfunc
+
+function avg_h264_qpel8_h_lowpass_lsx
+ slli.d t1, a3, 1
+ add.d t2, t1, a3
+ slli.d t5, a2, 1
+ add.d t6, t5, a2
+ vldi vr20, 0x414
+ vldi vr21, 0x405
+ vldi vr22, 0x410
+ addi.d t0, a1, -2 // t0 = src - 2
+ add.d t3, a1, zero // t3 = src
+ addi.d t4, a0, 0 // t4 = dst
+.rept 4
+ vld vr0, t0, 0
+ vldx vr1, t0, a3
+ QPEL8_H_LSX vr12, vr13
+ vssrani.bu.h vr13, vr12, 5
+ fld.d f0, t4, 0
+ fldx.d f1, t4, a2
+ vilvl.d vr0, vr1, vr0
+ vavgr.bu vr13, vr13, vr0
+ vstelm.d vr13, a0, 0, 0
+ add.d a0, a0, a2
+ vstelm.d vr13, a0, 0, 1
+ add.d a0, a0, a2
+ add.d t0, t0, t1
+ add.d t4, t4, t1
+.endr
+endfunc
@@ -19,7 +19,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "h264qpel_lasx.h"
+#include "h264qpel_loongarch.h"
#include "libavutil/attributes.h"
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264qpel.h"
@@ -27,6 +27,77 @@
av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
+
+ if (have_lsx(cpu_flags)) {
+ if (8 == bit_depth) {
+ c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lsx;
+ c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_lsx;
+ c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_lsx;
+ c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_lsx;
+ c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_lsx;
+ c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_lsx;
+ c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_lsx;
+ c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_lsx;
+ c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_lsx;
+ c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_lsx;
+ c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lsx;
+ c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_lsx;
+ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_lsx;
+ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_lsx;
+ c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_lsx;
+ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_lsx;
+
+ c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_lsx;
+ c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_lsx;
+ c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_lsx;
+ c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_lsx;
+ c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_lsx;
+ c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_lsx;
+ c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_lsx;
+ c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_lsx;
+ c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_lsx;
+ c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_lsx;
+ c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_lsx;
+ c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_lsx;
+ c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_lsx;
+ c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_lsx;
+ c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_lsx;
+ c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_lsx;
+
+ c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_lsx;
+ c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_lsx;
+ c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_lsx;
+ c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_lsx;
+ c->put_h264_qpel_pixels_tab[1][4] = ff_put_h264_qpel8_mc01_lsx;
+ c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_lsx;
+ c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_lsx;
+ c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_lsx;
+ c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_lsx;
+ c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_lsx;
+ c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_lsx;
+ c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_lsx;
+ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_lsx;
+ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_lsx;
+ c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_lsx;
+ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_lsx;
+
+ c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_lsx;
+ c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_lsx;
+ c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_lsx;
+ c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_lsx;
+ c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_lsx;
+ c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_lsx;
+ c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_lsx;
+ c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_lsx;
+ c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_lsx;
+ c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_lsx;
+ c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_lsx;
+ c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_lsx;
+ c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_lsx;
+ c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lsx;
+ }
+ }
+#if HAVE_LASX
if (have_lasx(cpu_flags)) {
if (8 == bit_depth) {
c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lasx;
@@ -95,4 +166,5 @@ av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lasx;
}
}
+#endif
}
@@ -21,7 +21,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "h264qpel_lasx.h"
+#include "h264qpel_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "libavutil/attributes.h"
@@ -418,157 +418,6 @@ avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
);
}
-/* avg_pixels8_8_lsx : dst = avg(src, dst)
- * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
- ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
- ptrdiff_t stride_2, stride_3, stride_4;
- __asm__ volatile (
- /* h0~h7 */
- "slli.d %[stride_2], %[srcStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
-
- "vld $vr8, %[half], 0x00 \n\t"
- "vld $vr9, %[half], 0x08 \n\t"
- "vld $vr10, %[half], 0x10 \n\t"
- "vld $vr11, %[half], 0x18 \n\t"
- "vld $vr12, %[half], 0x20 \n\t"
- "vld $vr13, %[half], 0x28 \n\t"
- "vld $vr14, %[half], 0x30 \n\t"
- "vld $vr15, %[half], 0x38 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vstelm.d $vr0, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr1, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr2, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr3, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr4, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr5, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr6, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr7, %[dst], 0, 0 \n\t"
- : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
- [stride_4]"=&r"(stride_4)
- : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride)
- : "memory"
- );
-}
-
-/* avg_pixels8_8_lsx : dst = avg(src, dst)
- * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
- ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
- uint8_t *tmp = dst;
- ptrdiff_t stride_2, stride_3, stride_4;
- __asm__ volatile (
- /* h0~h7 */
- "slli.d %[stride_2], %[srcStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
-
- "vld $vr8, %[half], 0x00 \n\t"
- "vld $vr9, %[half], 0x08 \n\t"
- "vld $vr10, %[half], 0x10 \n\t"
- "vld $vr11, %[half], 0x18 \n\t"
- "vld $vr12, %[half], 0x20 \n\t"
- "vld $vr13, %[half], 0x28 \n\t"
- "vld $vr14, %[half], 0x30 \n\t"
- "vld $vr15, %[half], 0x38 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "slli.d %[stride_2], %[dstStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[dstStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "vld $vr8, %[tmp], 0 \n\t"
- "vldx $vr9, %[tmp], %[dstStride] \n\t"
- "vldx $vr10, %[tmp], %[stride_2] \n\t"
- "vldx $vr11, %[tmp], %[stride_3] \n\t"
- "add.d %[tmp], %[tmp], %[stride_4] \n\t"
- "vld $vr12, %[tmp], 0 \n\t"
- "vldx $vr13, %[tmp], %[dstStride] \n\t"
- "vldx $vr14, %[tmp], %[stride_2] \n\t"
- "vldx $vr15, %[tmp], %[stride_3] \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vstelm.d $vr0, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr1, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr2, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr3, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr4, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr5, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr6, %[dst], 0, 0 \n\t"
- "add.d %[dst], %[dst], %[dstStride] \n\t"
- "vstelm.d $vr7, %[dst], 0, 0 \n\t"
- : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half),
- [src]"+&r"(src), [stride_2]"=&r"(stride_2),
- [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4)
- : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
- : "memory"
- );
-}
-
/* put_pixels16_8_lsx: dst = src */
static av_always_inline void
put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
@@ -729,254 +578,6 @@ avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
);
}
-/* avg_pixels16_8_lsx : dst = avg(src, dst)
- * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
- ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
- ptrdiff_t stride_2, stride_3, stride_4;
- ptrdiff_t dstride_2, dstride_3, dstride_4;
- __asm__ volatile (
- "slli.d %[stride_2], %[srcStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "slli.d %[dstride_2], %[dstStride], 1 \n\t"
- "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
- "slli.d %[dstride_4], %[dstride_2], 1 \n\t"
- /* h0~h7 */
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
-
- "vld $vr8, %[half], 0x00 \n\t"
- "vld $vr9, %[half], 0x10 \n\t"
- "vld $vr10, %[half], 0x20 \n\t"
- "vld $vr11, %[half], 0x30 \n\t"
- "vld $vr12, %[half], 0x40 \n\t"
- "vld $vr13, %[half], 0x50 \n\t"
- "vld $vr14, %[half], 0x60 \n\t"
- "vld $vr15, %[half], 0x70 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vst $vr0, %[dst], 0 \n\t"
- "vstx $vr1, %[dst], %[dstStride] \n\t"
- "vstx $vr2, %[dst], %[dstride_2] \n\t"
- "vstx $vr3, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
- "vst $vr4, %[dst], 0 \n\t"
- "vstx $vr5, %[dst], %[dstStride] \n\t"
- "vstx $vr6, %[dst], %[dstride_2] \n\t"
- "vstx $vr7, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
-
- /* h8~h15 */
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
-
- "vld $vr8, %[half], 0x80 \n\t"
- "vld $vr9, %[half], 0x90 \n\t"
- "vld $vr10, %[half], 0xa0 \n\t"
- "vld $vr11, %[half], 0xb0 \n\t"
- "vld $vr12, %[half], 0xc0 \n\t"
- "vld $vr13, %[half], 0xd0 \n\t"
- "vld $vr14, %[half], 0xe0 \n\t"
- "vld $vr15, %[half], 0xf0 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vst $vr0, %[dst], 0 \n\t"
- "vstx $vr1, %[dst], %[dstStride] \n\t"
- "vstx $vr2, %[dst], %[dstride_2] \n\t"
- "vstx $vr3, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
- "vst $vr4, %[dst], 0 \n\t"
- "vstx $vr5, %[dst], %[dstStride] \n\t"
- "vstx $vr6, %[dst], %[dstride_2] \n\t"
- "vstx $vr7, %[dst], %[dstride_3] \n\t"
- : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
- [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2),
- [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
- : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
- : "memory"
- );
-}
-
-/* avg_pixels16_8_lsx : dst = avg(src, dst)
- * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8.
- * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/
-static av_always_inline void
-avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
- ptrdiff_t dstStride, ptrdiff_t srcStride)
-{
- uint8_t *tmp = dst;
- ptrdiff_t stride_2, stride_3, stride_4;
- ptrdiff_t dstride_2, dstride_3, dstride_4;
- __asm__ volatile (
- "slli.d %[stride_2], %[srcStride], 1 \n\t"
- "add.d %[stride_3], %[stride_2], %[srcStride] \n\t"
- "slli.d %[stride_4], %[stride_2], 1 \n\t"
- "slli.d %[dstride_2], %[dstStride], 1 \n\t"
- "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t"
- "slli.d %[dstride_4], %[dstride_2], 1 \n\t"
- /* h0~h7 */
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
-
- "vld $vr8, %[half], 0x00 \n\t"
- "vld $vr9, %[half], 0x10 \n\t"
- "vld $vr10, %[half], 0x20 \n\t"
- "vld $vr11, %[half], 0x30 \n\t"
- "vld $vr12, %[half], 0x40 \n\t"
- "vld $vr13, %[half], 0x50 \n\t"
- "vld $vr14, %[half], 0x60 \n\t"
- "vld $vr15, %[half], 0x70 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vld $vr8, %[tmp], 0 \n\t"
- "vldx $vr9, %[tmp], %[dstStride] \n\t"
- "vldx $vr10, %[tmp], %[dstride_2] \n\t"
- "vldx $vr11, %[tmp], %[dstride_3] \n\t"
- "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
- "vld $vr12, %[tmp], 0 \n\t"
- "vldx $vr13, %[tmp], %[dstStride] \n\t"
- "vldx $vr14, %[tmp], %[dstride_2] \n\t"
- "vldx $vr15, %[tmp], %[dstride_3] \n\t"
- "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vst $vr0, %[dst], 0 \n\t"
- "vstx $vr1, %[dst], %[dstStride] \n\t"
- "vstx $vr2, %[dst], %[dstride_2] \n\t"
- "vstx $vr3, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
- "vst $vr4, %[dst], 0 \n\t"
- "vstx $vr5, %[dst], %[dstStride] \n\t"
- "vstx $vr6, %[dst], %[dstride_2] \n\t"
- "vstx $vr7, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
-
- /* h8~h15 */
- "vld $vr0, %[src], 0 \n\t"
- "vldx $vr1, %[src], %[srcStride] \n\t"
- "vldx $vr2, %[src], %[stride_2] \n\t"
- "vldx $vr3, %[src], %[stride_3] \n\t"
- "add.d %[src], %[src], %[stride_4] \n\t"
- "vld $vr4, %[src], 0 \n\t"
- "vldx $vr5, %[src], %[srcStride] \n\t"
- "vldx $vr6, %[src], %[stride_2] \n\t"
- "vldx $vr7, %[src], %[stride_3] \n\t"
-
- "vld $vr8, %[half], 0x80 \n\t"
- "vld $vr9, %[half], 0x90 \n\t"
- "vld $vr10, %[half], 0xa0 \n\t"
- "vld $vr11, %[half], 0xb0 \n\t"
- "vld $vr12, %[half], 0xc0 \n\t"
- "vld $vr13, %[half], 0xd0 \n\t"
- "vld $vr14, %[half], 0xe0 \n\t"
- "vld $vr15, %[half], 0xf0 \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vld $vr8, %[tmp], 0 \n\t"
- "vldx $vr9, %[tmp], %[dstStride] \n\t"
- "vldx $vr10, %[tmp], %[dstride_2] \n\t"
- "vldx $vr11, %[tmp], %[dstride_3] \n\t"
- "add.d %[tmp], %[tmp], %[dstride_4] \n\t"
- "vld $vr12, %[tmp], 0 \n\t"
- "vldx $vr13, %[tmp], %[dstStride] \n\t"
- "vldx $vr14, %[tmp], %[dstride_2] \n\t"
- "vldx $vr15, %[tmp], %[dstride_3] \n\t"
-
- "vavgr.bu $vr0, $vr8, $vr0 \n\t"
- "vavgr.bu $vr1, $vr9, $vr1 \n\t"
- "vavgr.bu $vr2, $vr10, $vr2 \n\t"
- "vavgr.bu $vr3, $vr11, $vr3 \n\t"
- "vavgr.bu $vr4, $vr12, $vr4 \n\t"
- "vavgr.bu $vr5, $vr13, $vr5 \n\t"
- "vavgr.bu $vr6, $vr14, $vr6 \n\t"
- "vavgr.bu $vr7, $vr15, $vr7 \n\t"
-
- "vst $vr0, %[dst], 0 \n\t"
- "vstx $vr1, %[dst], %[dstStride] \n\t"
- "vstx $vr2, %[dst], %[dstride_2] \n\t"
- "vstx $vr3, %[dst], %[dstride_3] \n\t"
- "add.d %[dst], %[dst], %[dstride_4] \n\t"
- "vst $vr4, %[dst], 0 \n\t"
- "vstx $vr5, %[dst], %[dstStride] \n\t"
- "vstx $vr6, %[dst], %[dstride_2] \n\t"
- "vstx $vr7, %[dst], %[dstride_3] \n\t"
- : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src),
- [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
- [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2),
- [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4)
- : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride)
- : "memory"
- );
-}
-
#define QPEL8_H_LOWPASS(out_v) \
src00 = __lasx_xvld(src, - 2); \
src += srcStride; \
deleted file mode 100644
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2020 Loongson Technology Corporation Limited
- * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H
-#define AVCODEC_LOONGARCH_H264QPEL_LASX_H
-
-#include <stdint.h>
-#include <stddef.h>
-#include "libavcodec/h264.h"
-
-void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
- int alpha, int beta, int8_t *tc0);
-void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
- int alpha, int beta, int8_t *tc0);
-void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-
-void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride);
-void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
- ptrdiff_t dst_stride);
-#endif // #ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H
new file mode 100644
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+#define AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "libavcodec/h264.h"
+#include "config.h"
+
+void put_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_h264_qpel8_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void avg_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src, int dstStride,
+ int srcStride);
+void avg_h264_qpel8_v_lowpass_lsx(uint8_t *dst, uint8_t *src, int dstStride,
+ int srcStride);
+void avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+void avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half,
+ ptrdiff_t dstStride, ptrdiff_t srcStride);
+
+void ff_put_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+
+#if HAVE_LASX
+void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride,
+ int alpha, int beta, int8_t *tc0);
+void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+
+void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dst_stride);
+#endif
+
+#endif // #ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H
new file mode 100644
@@ -0,0 +1,487 @@
+/*
+ * Loongson LSX optimized h264qpel
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "h264qpel_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "libavutil/attributes.h"
+
+static void put_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+ put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+ put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+ src += srcStride << 3;
+ dst += dstStride << 3;
+ put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+ put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ put_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+static void put_h264_qpel16_h_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ int dstStride, int srcStride)
+{
+ put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+ put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+ src += srcStride << 3;
+ dst += dstStride << 3;
+ put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride);
+ put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride);
+}
+
+static void put_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ int dstStride, int srcStride)
+{
+ put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+ put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+ put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 256;
+
+ put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 256;
+
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride);
+ put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 256;
+
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride);
+ put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 256;
+
+ put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ int dstStride, int srcStride)
+{
+ avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+ avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+ src += 8*srcStride;
+ dst += 8*dstStride;
+ avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride);
+ avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel16_v_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[256];
+
+ put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 256;
+
+ put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride);
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 256;
+
+ put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride);
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[256];
+
+ put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 256;
+
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[512];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 256;
+
+ put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride);
+ put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride);
+ avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
+}
+
+static void avg_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t dstStride, ptrdiff_t srcStride)
+{
+ avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+ avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+ src += srcStride << 3;
+ dst += dstStride << 3;
+ avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride);
+ avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride);
+}
+
+void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 64;
+
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 64;
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 64;
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 64;
+
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride);
+ put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ put_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ put_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t half[64];
+
+ put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 64;
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 64;
+
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ avg_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride);
+}
+
+void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfHV = temp;
+ uint8_t *const halfH = temp + 64;
+
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t temp[128];
+ uint8_t *const halfH = temp;
+ uint8_t *const halfHV = temp + 64;
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
+}
+
+void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src,
+ ptrdiff_t stride)
+{
+ uint8_t halfH[64];
+ uint8_t halfV[64];
+
+ put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride);
+ put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride);
+ avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
+}
From: yuanhecai <yuanhecai@loongson.cn>

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an

before: 214fps
after:  274fps
---
 libavcodec/loongarch/Makefile             |    2 +
 libavcodec/loongarch/h264qpel.S           | 1686 +++++++++++++++++
 .../loongarch/h264qpel_init_loongarch.c   |   74 +-
 libavcodec/loongarch/h264qpel_lasx.c      |  401 +---
 libavcodec/loongarch/h264qpel_lasx.h      |  158 --
 libavcodec/loongarch/h264qpel_loongarch.h |  312 +++
 libavcodec/loongarch/h264qpel_lsx.c       |  487 +++++
 7 files changed, 2561 insertions(+), 559 deletions(-)
 create mode 100644 libavcodec/loongarch/h264qpel.S
 delete mode 100644 libavcodec/loongarch/h264qpel_lasx.h
 create mode 100644 libavcodec/loongarch/h264qpel_loongarch.h
 create mode 100644 libavcodec/loongarch/h264qpel_lsx.c
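
Note on wiring (the relevant hunk is not part of this excerpt): the diffstat lists a
74-line change to libavcodec/loongarch/h264qpel_init_loongarch.c, which is where the
new ff_*_lsx entry points get hooked into the qpel function tables. The snippet below
is only a minimal sketch of FFmpeg's usual runtime-dispatch pattern, assuming the
conventional H264QpelContext tables and the mcXY -> X + 4*Y index mapping; the few
entries shown are illustrative, not the patch's exact assignments.

    #include "libavutil/attributes.h"
    #include "libavutil/cpu.h"
    #include "libavutil/loongarch/cpu.h"
    #include "libavcodec/h264qpel.h"
    #include "h264qpel_loongarch.h"

    av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth)
    {
        int cpu_flags = av_get_cpu_flags();

        /* Register only for 8-bit content; the LSX functions above operate on
         * uint8_t pixels. */
        if (have_lsx(cpu_flags) && bit_depth == 8) {
            /* 16x16 luma blocks: table index is X + 4*Y for position mcXY. */
            c->put_h264_qpel_pixels_tab[0][0]  = ff_put_h264_qpel16_mc00_lsx;
            c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lsx;
            c->avg_h264_qpel_pixels_tab[0][0]  = ff_avg_h264_qpel16_mc00_lsx;
            /* 8x8 luma blocks use the second table. */
            c->put_h264_qpel_pixels_tab[1][2]  = ff_put_h264_qpel8_mc20_lsx;
            c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_lsx;
            /* ... the remaining mcXY positions follow the same pattern ... */
        }
    }

The new paths can be cross-checked and benchmarked against the C reference in the
usual way, e.g. "make checkasm && tests/checkasm/checkasm --test=h264qpel --bench".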