diff mbox series

[FFmpeg-devel,v2,3/7] avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt

Message ID 20231227045019.25078-4-jinbo@loongson.cn
State New
Headers show
Series [FFmpeg-devel,v2,1/7] avcodec/hevc: Add init for sao_edge_filter | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

金波 Dec. 27, 2023, 4:50 a.m. UTC
tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_pel_uni_w_pixels4_8_c:    2.7     1.0
put_hevc_pel_uni_w_pixels6_8_c:    6.2     2.0     1.5
put_hevc_pel_uni_w_pixels8_8_c:    10.7    2.5     1.7
put_hevc_pel_uni_w_pixels12_8_c:   23.0    5.5     5.0
put_hevc_pel_uni_w_pixels16_8_c:   41.0    8.2     5.0
put_hevc_pel_uni_w_pixels24_8_c:   91.0    19.7    13.2
put_hevc_pel_uni_w_pixels32_8_c:   161.7   32.5    16.2
put_hevc_pel_uni_w_pixels48_8_c:   354.5   73.7    43.0
put_hevc_pel_uni_w_pixels64_8_c:   641.5   130.0   64.2

Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with
8 threads is 1fps(47fps-->48fps).
---
 libavcodec/loongarch/Makefile                 |   3 +-
 libavcodec/loongarch/hevc_mc.S                | 471 ++++++++++++++++++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |  43 ++
 libavcodec/loongarch/hevcdsp_lasx.h           |  53 ++
 libavcodec/loongarch/hevcdsp_lsx.h            |  27 +
 5 files changed, 596 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/loongarch/hevc_mc.S
 create mode 100644 libavcodec/loongarch/hevcdsp_lasx.h

Comments

Shiyou Yin Dec. 28, 2023, 2:18 a.m. UTC | #1
> -----原始邮件-----
> 发件人: jinbo <jinbo@loongson.cn>
> 发送时间:2023-12-27 12:50:15 (星期三)
> 收件人: ffmpeg-devel@ffmpeg.org
> 抄送: jinbo <jinbo@loongson.cn>
> 主题: [FFmpeg-devel] [PATCH v2 3/7] avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt
> 

> +
> +.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
> +    vldrepl.d      vr0,    \src0,   0
> +    vsllwil.hu.bu  vr0,    vr0,     0
> +    vexth.wu.hu    vr5,    vr0
> +    vsllwil.wu.hu  vr0,    vr0,     0
> +    vslli.w        vr0,    vr0,     6
> +    vslli.w        vr5,    vr5,     6
> +    vmul.w         vr0,    vr0,     vr1
> +    vmul.w         vr5,    vr5,     vr1
> +    vadd.w         vr0,    vr0,     vr2
> +    vadd.w         vr5,    vr5,     vr2
You can use 'vmadd.w' here.
> +    vsra.w         vr0,    vr0,     vr3
> +    vsra.w         vr5,    vr5,     vr3
> +    vadd.w         vr0,    vr0,     vr4
> +    vadd.w         vr5,    vr5,     vr4
> +    vssrani.h.w    vr5,    vr0,     0
> +    vssrani.bu.h   vr5,    vr5,     0
> +.if \w == 6
> +    fst.s          f5,     \dst0,   0
> +    vstelm.h       vr5,    \dst0,   4,     2
> +.else
> +    fst.d          f5,     \dst0,   0
> +.endif
> +.endm
> +
> +.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
> +    vldrepl.d      vr0,    \src0,   0
> +    add.d          t2,     \src0,   a3
> +    vldrepl.d      vr5,    t2,      0
> +    xvpermi.q      xr0,    xr5,     0x02
> +    xvsllwil.hu.bu xr0,    xr0,     0
> +    xvexth.wu.hu   xr5,    xr0
> +    xvsllwil.wu.hu xr0,    xr0,     0
> +    xvslli.w       xr0,    xr0,     6
> +    xvslli.w       xr5,    xr5,     6
> +    xvmul.w        xr0,    xr0,     xr1
> +    xvmul.w        xr5,    xr5,     xr1
> +    xvadd.w        xr0,    xr0,     xr2
> +    xvadd.w        xr5,    xr5,     xr2

Use 'vmadd.w' will be better.

> +    xvsra.w        xr0,    xr0,     xr3
> +    xvsra.w        xr5,    xr5,     xr3
> +    xvadd.w        xr0,    xr0,     xr4
> +    xvadd.w        xr5,    xr5,     xr4
> +    xvssrani.h.w   xr5,    xr0,     0
> +    xvpermi.q      xr0,    xr5,     0x01
> +    xvssrani.bu.h  xr0,    xr5,     0
> +    add.d          t3,     \dst0,   a1
> +.if \w == 6
> +    vstelm.w       vr0,    \dst0,   0,     0
> +    vstelm.h       vr0,    \dst0,   4,     2
> +    vstelm.w       vr0,    t3,      0,     2
> +    vstelm.h       vr0,    t3,      4,     6
> +.else
> +    vstelm.d       vr0,    \dst0,   0,     0
> +    vstelm.d       vr0,    t3,      0,     1
> +.endif
> +.endm
> +
> +.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
> +    vld            vr0,    \src0,   0
> +    vexth.hu.bu    vr7,    vr0
> +    vexth.wu.hu    vr8,    vr7
> +    vsllwil.wu.hu  vr7,    vr7,     0
> +    vsllwil.hu.bu  vr5,    vr0,     0
> +    vexth.wu.hu    vr6,    vr5
> +    vsllwil.wu.hu  vr5,    vr5,     0
> +    vslli.w        vr5,    vr5,     6
> +    vslli.w        vr6,    vr6,     6
> +    vslli.w        vr7,    vr7,     6
> +    vslli.w        vr8,    vr8,     6
> +    vmul.w         vr5,    vr5,     vr1
> +    vmul.w         vr6,    vr6,     vr1
> +    vmul.w         vr7,    vr7,     vr1
> +    vmul.w         vr8,    vr8,     vr1
> +    vadd.w         vr5,    vr5,     vr2
> +    vadd.w         vr6,    vr6,     vr2
> +    vadd.w         vr7,    vr7,     vr2
> +    vadd.w         vr8,    vr8,     vr2

Use 'vmadd.w', please check it in your left code.


本邮件及其附件含有龙芯中科的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。 
This email and its attachments contain confidential information from Loongson Technology , which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this email in error, please notify the sender by phone or email immediately and delete it.
diff mbox series

Patch

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07ea97f803..ad98cd4054 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,7 +28,8 @@  LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_mc_bi_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
                                          loongarch/hevc_mc_uniw_lsx.o \
-                                         loongarch/hevc_add_res.o
+                                         loongarch/hevc_add_res.o \
+                                         loongarch/hevc_mc.o
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_loongarch.o \
                                          loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
new file mode 100644
index 0000000000..c5d553effe
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -0,0 +1,471 @@ 
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro LOAD_VAR bit
+    addi.w         t1,     a5,      6  //shift
+    addi.w         t3,     zero,    1  //one
+    sub.w          t4,     t1,      t3
+    sll.w          t3,     t3,      t4 //offset
+.if \bit == 128
+    vreplgr2vr.w   vr1,    a6          //wx
+    vreplgr2vr.w   vr2,    t3          //offset
+    vreplgr2vr.w   vr3,    t1          //shift
+    vreplgr2vr.w   vr4,    a7          //ox
+.else
+    xvreplgr2vr.w  xr1,    a6
+    xvreplgr2vr.w  xr2,    t3
+    xvreplgr2vr.w  xr3,    t1
+    xvreplgr2vr.w  xr4,    a7
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
+    vldrepl.d      vr0,    \src0,   0
+    vsllwil.hu.bu  vr0,    vr0,     0
+    vexth.wu.hu    vr5,    vr0
+    vsllwil.wu.hu  vr0,    vr0,     0
+    vslli.w        vr0,    vr0,     6
+    vslli.w        vr5,    vr5,     6
+    vmul.w         vr0,    vr0,     vr1
+    vmul.w         vr5,    vr5,     vr1
+    vadd.w         vr0,    vr0,     vr2
+    vadd.w         vr5,    vr5,     vr2
+    vsra.w         vr0,    vr0,     vr3
+    vsra.w         vr5,    vr5,     vr3
+    vadd.w         vr0,    vr0,     vr4
+    vadd.w         vr5,    vr5,     vr4
+    vssrani.h.w    vr5,    vr0,     0
+    vssrani.bu.h   vr5,    vr5,     0
+.if \w == 6
+    fst.s          f5,     \dst0,   0
+    vstelm.h       vr5,    \dst0,   4,     2
+.else
+    fst.d          f5,     \dst0,   0
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
+    vldrepl.d      vr0,    \src0,   0
+    add.d          t2,     \src0,   a3
+    vldrepl.d      vr5,    t2,      0
+    xvpermi.q      xr0,    xr5,     0x02
+    xvsllwil.hu.bu xr0,    xr0,     0
+    xvexth.wu.hu   xr5,    xr0
+    xvsllwil.wu.hu xr0,    xr0,     0
+    xvslli.w       xr0,    xr0,     6
+    xvslli.w       xr5,    xr5,     6
+    xvmul.w        xr0,    xr0,     xr1
+    xvmul.w        xr5,    xr5,     xr1
+    xvadd.w        xr0,    xr0,     xr2
+    xvadd.w        xr5,    xr5,     xr2
+    xvsra.w        xr0,    xr0,     xr3
+    xvsra.w        xr5,    xr5,     xr3
+    xvadd.w        xr0,    xr0,     xr4
+    xvadd.w        xr5,    xr5,     xr4
+    xvssrani.h.w   xr5,    xr0,     0
+    xvpermi.q      xr0,    xr5,     0x01
+    xvssrani.bu.h  xr0,    xr5,     0
+    add.d          t3,     \dst0,   a1
+.if \w == 6
+    vstelm.w       vr0,    \dst0,   0,     0
+    vstelm.h       vr0,    \dst0,   4,     2
+    vstelm.w       vr0,    t3,      0,     2
+    vstelm.h       vr0,    t3,      4,     6
+.else
+    vstelm.d       vr0,    \dst0,   0,     0
+    vstelm.d       vr0,    t3,      0,     1
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
+    vld            vr0,    \src0,   0
+    vexth.hu.bu    vr7,    vr0
+    vexth.wu.hu    vr8,    vr7
+    vsllwil.wu.hu  vr7,    vr7,     0
+    vsllwil.hu.bu  vr5,    vr0,     0
+    vexth.wu.hu    vr6,    vr5
+    vsllwil.wu.hu  vr5,    vr5,     0
+    vslli.w        vr5,    vr5,     6
+    vslli.w        vr6,    vr6,     6
+    vslli.w        vr7,    vr7,     6
+    vslli.w        vr8,    vr8,     6
+    vmul.w         vr5,    vr5,     vr1
+    vmul.w         vr6,    vr6,     vr1
+    vmul.w         vr7,    vr7,     vr1
+    vmul.w         vr8,    vr8,     vr1
+    vadd.w         vr5,    vr5,     vr2
+    vadd.w         vr6,    vr6,     vr2
+    vadd.w         vr7,    vr7,     vr2
+    vadd.w         vr8,    vr8,     vr2
+    vsra.w         vr5,    vr5,     vr3
+    vsra.w         vr6,    vr6,     vr3
+    vsra.w         vr7,    vr7,     vr3
+    vsra.w         vr8,    vr8,     vr3
+    vadd.w         vr5,    vr5,     vr4
+    vadd.w         vr6,    vr6,     vr4
+    vadd.w         vr7,    vr7,     vr4
+    vadd.w         vr8,    vr8,     vr4
+    vssrani.h.w    vr6,    vr5,     0
+    vssrani.h.w    vr8,    vr7,     0
+    vssrani.bu.h   vr8,    vr6,     0
+    vst            vr8,    \dst0,   0
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0
+    vld            vr0,    \src0,   0
+    xvpermi.d      xr0,    xr0,     0xd8
+    xvsllwil.hu.bu xr0,    xr0,     0
+    xvexth.wu.hu   xr6,    xr0
+    xvsllwil.wu.hu xr5,    xr0,     0
+    xvslli.w       xr5,    xr5,     6
+    xvslli.w       xr6,    xr6,     6
+    xvmul.w        xr5,    xr5,     xr1
+    xvmul.w        xr6,    xr6,     xr1
+    xvadd.w        xr5,    xr5,     xr2
+    xvadd.w        xr6,    xr6,     xr2
+    xvsra.w        xr5,    xr5,     xr3
+    xvsra.w        xr6,    xr6,     xr3
+    xvadd.w        xr5,    xr5,     xr4
+    xvadd.w        xr6,    xr6,     xr4
+    xvssrani.h.w   xr6,    xr5,     0
+    xvpermi.q      xr7,    xr6,     0x01
+    xvssrani.bu.h  xr7,    xr6,     0
+    vst            vr7,    \dst0,   0
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w
+.if \w == 16
+    vld            vr0,    \src0,   0
+    add.d          t2,     \src0,   a3
+    vld            vr5,    t2,      0
+    xvpermi.q      xr0,    xr5,     0x02
+.else //w=24/32
+    xvld           xr0,    \src0,   0
+.endif
+    xvexth.hu.bu   xr7,    xr0
+    xvexth.wu.hu   xr8,    xr7
+    xvsllwil.wu.hu xr7,    xr7,     0
+    xvsllwil.hu.bu xr5,    xr0,     0
+    xvexth.wu.hu   xr6,    xr5
+    xvsllwil.wu.hu xr5,    xr5,     0
+    xvslli.w       xr5,    xr5,     6
+    xvslli.w       xr6,    xr6,     6
+    xvslli.w       xr7,    xr7,     6
+    xvslli.w       xr8,    xr8,     6
+    xvmul.w        xr5,    xr5,     xr1
+    xvmul.w        xr6,    xr6,     xr1
+    xvmul.w        xr7,    xr7,     xr1
+    xvmul.w        xr8,    xr8,     xr1
+    xvadd.w        xr5,    xr5,     xr2
+    xvadd.w        xr6,    xr6,     xr2
+    xvadd.w        xr7,    xr7,     xr2
+    xvadd.w        xr8,    xr8,     xr2
+    xvsra.w        xr5,    xr5,     xr3
+    xvsra.w        xr6,    xr6,     xr3
+    xvsra.w        xr7,    xr7,     xr3
+    xvsra.w        xr8,    xr8,     xr3
+    xvadd.w        xr5,    xr5,     xr4
+    xvadd.w        xr6,    xr6,     xr4
+    xvadd.w        xr7,    xr7,     xr4
+    xvadd.w        xr8,    xr8,     xr4
+    xvssrani.h.w   xr6,    xr5,     0
+    xvssrani.h.w   xr8,    xr7,     0
+    xvssrani.bu.h  xr8,    xr6,     0
+.if \w == 16
+    vst            vr8,    \dst0,   0
+    add.d          t2,     \dst0,   a1
+    xvpermi.q      xr8,    xr8,     0x01
+    vst            vr8,    t2,      0
+.elseif \w == 24
+    vst            vr8,    \dst0,   0
+    xvstelm.d      xr8,    \dst0,   16,    2
+.else
+    xvst           xr8,    \dst0,   0
+.endif
+.endm
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
+    LOAD_VAR 128
+    srli.w         t0,     a4,      1
+.LOOP_PIXELS4:
+    vldrepl.w      vr0,    a2,      0
+    add.d          t1,     a2,      a3
+    vldrepl.w      vr5,    t1,      0
+    vsllwil.hu.bu  vr0,    vr0,     0
+    vsllwil.wu.hu  vr0,    vr0,     0
+    vsllwil.hu.bu  vr5,    vr5,     0
+    vsllwil.wu.hu  vr5,    vr5,     0
+    vslli.w        vr0,    vr0,     6
+    vslli.w        vr5,    vr5,     6
+    vmul.w         vr0,    vr0,     vr1
+    vmul.w         vr5,    vr5,     vr1
+    vadd.w         vr0,    vr0,     vr2
+    vadd.w         vr5,    vr5,     vr2
+    vsra.w         vr0,    vr0,     vr3
+    vsra.w         vr5,    vr5,     vr3
+    vadd.w         vr0,    vr0,     vr4
+    vadd.w         vr5,    vr5,     vr4
+    vssrani.h.w    vr5,    vr0,     0
+    vssrani.bu.h   vr5,    vr5,     0
+    fst.s          f5,     a0,      0
+    add.d          t2,     a0,      a1
+    vstelm.w       vr5,    t2,      0,     1
+    alsl.d         a2,     a3,      a2,    1
+    alsl.d         a0,     a1,      a0,    1
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS4
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS6:
+    HEVC_PEL_UNI_W_PIXELS8_LSX      a2,    a0,    6
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS6
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx
+    LOAD_VAR 256
+    srli.w         t0,     a4,      1
+.LOOP_PIXELS6_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX   a2,    a0,    6
+    alsl.d         a2,     a3,      a2,    1
+    alsl.d         a0,     a1,      a0,    1
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS6_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS8:
+    HEVC_PEL_UNI_W_PIXELS8_LSX      a2,    a0,    8
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS8
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx
+    LOAD_VAR 256
+    srli.w         t0,     a4,      1
+.LOOP_PIXELS8_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX   a2,    a0,    8
+    alsl.d         a2,     a3,      a2,    1
+    alsl.d         a0,     a1,      a0,    1
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS8_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS12:
+    vld            vr0,    a2,      0
+    vexth.hu.bu    vr7,    vr0
+    vsllwil.wu.hu  vr7,    vr7,     0
+    vsllwil.hu.bu  vr5,    vr0,     0
+    vexth.wu.hu    vr6,    vr5
+    vsllwil.wu.hu  vr5,    vr5,     0
+    vslli.w        vr5,    vr5,     6
+    vslli.w        vr6,    vr6,     6
+    vslli.w        vr7,    vr7,     6
+    vmul.w         vr5,    vr5,     vr1
+    vmul.w         vr6,    vr6,     vr1
+    vmul.w         vr7,    vr7,     vr1
+    vadd.w         vr5,    vr5,     vr2
+    vadd.w         vr6,    vr6,     vr2
+    vadd.w         vr7,    vr7,     vr2
+    vsra.w         vr5,    vr5,     vr3
+    vsra.w         vr6,    vr6,     vr3
+    vsra.w         vr7,    vr7,     vr3
+    vadd.w         vr5,    vr5,     vr4
+    vadd.w         vr6,    vr6,     vr4
+    vadd.w         vr7,    vr7,     vr4
+    vssrani.h.w    vr6,    vr5,     0
+    vssrani.h.w    vr7,    vr7,     0
+    vssrani.bu.h   vr7,    vr6,     0
+    fst.d          f7,     a0,      0
+    vstelm.w       vr7,    a0,      8,     2
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS12
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS12_LASX:
+    vld            vr0,    a2,      0
+    xvpermi.d      xr0,    xr0,     0xd8
+    xvsllwil.hu.bu xr0,    xr0,     0
+    xvexth.wu.hu   xr6,    xr0
+    xvsllwil.wu.hu xr5,    xr0,     0
+    xvslli.w       xr5,    xr5,     6
+    xvslli.w       xr6,    xr6,     6
+    xvmul.w        xr5,    xr5,     xr1
+    xvmul.w        xr6,    xr6,     xr1
+    xvadd.w        xr5,    xr5,     xr2
+    xvadd.w        xr6,    xr6,     xr2
+    xvsra.w        xr5,    xr5,     xr3
+    xvsra.w        xr6,    xr6,     xr3
+    xvadd.w        xr5,    xr5,     xr4
+    xvadd.w        xr6,    xr6,     xr4
+    xvssrani.h.w   xr6,    xr5,     0
+    xvpermi.q      xr7,    xr6,     0x01
+    xvssrani.bu.h  xr7,    xr6,     0
+    fst.d          f7,     a0,      0
+    vstelm.w       vr7,    a0,      8,     2
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS12_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS16:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS16
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx
+    LOAD_VAR 256
+    srli.w         t0,     a4,      1
+.LOOP_PIXELS16_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,   16
+    alsl.d         a2,     a3,      a2,    1
+    alsl.d         a0,     a1,      a0,    1
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS16_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS24:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0
+    addi.d         t0,     a2,      16
+    addi.d         t1,     a0,      16
+    HEVC_PEL_UNI_W_PIXELS8_LSX      t0,    t1,   8
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS24
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS24_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,   24
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS24_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS32:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0
+    addi.d         t0,     a2,      16
+    addi.d         t1,     a0,      16
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS32
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS32_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS32_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS48:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0
+    addi.d         t0,     a2,      16
+    addi.d         t1,     a0,      16
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1
+    addi.d         t0,     a2,      32
+    addi.d         t1,     a0,      32
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS48
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS48_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32
+    addi.d         t0,     a2,      32
+    addi.d         t1,     a0,      32
+    HEVC_PEL_UNI_W_PIXELS16_LASX    t0,    t1
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS48_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx
+    LOAD_VAR 128
+.LOOP_PIXELS64:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0
+    addi.d         t0,     a2,      16
+    addi.d         t1,     a0,      16
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1
+    addi.d         t0,     a2,      32
+    addi.d         t1,     a0,      32
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1
+    addi.d         t0,     a2,      48
+    addi.d         t1,     a0,      48
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS64
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx
+    LOAD_VAR 256
+.LOOP_PIXELS64_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32
+    addi.d         t0,     a2,      32
+    addi.d         t1,     a0,      32
+    HEVC_PEL_UNI_W_PIXELS32_LASX    t0,    t1,    32
+    add.d          a2,     a2,      a3
+    add.d          a0,     a0,      a1
+    addi.w         a4,     a4,      -1
+    bnez           a4,     .LOOP_PIXELS64_LASX
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index a8f753dc86..d0ee99d6b5 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -22,6 +22,7 @@ 
 
 #include "libavutil/loongarch/cpu.h"
 #include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"
 
 void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
 {
@@ -160,6 +161,26 @@  void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
             c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
 
+            c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
             c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
             c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
             c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@@ -196,4 +217,26 @@  void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
         }
     }
+
+    if (have_lasx(cpu_flags)) {
+        if (bit_depth == 8) {
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+        }
+    }
 }
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
new file mode 100644
index 0000000000..819c3c3ecf
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -0,0 +1,53 @@ 
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                       \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lasx(uint8_t *dst,  \
+                                                          ptrdiff_t      \
+                                                          dst_stride,    \
+                                                          const uint8_t *src,  \
+                                                          ptrdiff_t      \
+                                                          src_stride,    \
+                                                          int height,    \
+                                                          int denom,     \
+                                                          int wx,        \
+                                                          int ox,        \
+                                                          intptr_t mx,   \
+                                                          intptr_t my,   \
+                                                          int width)
+
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
+#endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index ac509984fd..0d724a90ef 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -232,4 +232,31 @@  void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                      \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lsx(uint8_t *dst,  \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         const uint8_t *src,  \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         int denom,     \
+                                                         int wx,        \
+                                                         int ox,        \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)
+
+PEL_UNI_W(pel, pixels, 4);
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
 #endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H