diff mbox series

[FFmpeg-devel,v3,2/7] avcodec/hevc: Add add_residual_4/8/16/32 asm opt

Message ID 20231228082105.31311-2-jinbo@loongson.cn
State New
Headers show
Series [FFmpeg-devel,v3,1/7] avcodec/hevc: Add init for sao_edge_filter | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

金波 Dec. 28, 2023, 8:21 a.m. UTC
After this patch, the performance of decoding H265 4K 30FPS 30Mbps
on 3A6000 with 8 threads improves by 2 fps (45fps-->47fps).
---
 libavcodec/loongarch/Makefile                 |   3 +-
 libavcodec/loongarch/hevc_add_res.S           | 162 ++++++++++++++++++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   5 +
 libavcodec/loongarch/hevcdsp_lsx.h            |   5 +
 4 files changed, 174 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/loongarch/hevc_add_res.S
diff mbox series

Patch

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 06cfab5c20..07ea97f803 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -27,7 +27,8 @@  LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                          loongarch/hevc_lpf_sao_lsx.o \
                                          loongarch/hevc_mc_bi_lsx.o \
                                          loongarch/hevc_mc_uni_lsx.o \
-                                         loongarch/hevc_mc_uniw_lsx.o
+                                         loongarch/hevc_mc_uniw_lsx.o \
+                                         loongarch/hevc_add_res.o
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                          loongarch/h264idct_loongarch.o \
                                          loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_add_res.S b/libavcodec/loongarch/hevc_add_res.S
new file mode 100644
index 0000000000..dd2d820af8
--- /dev/null
+++ b/libavcodec/loongarch/hevc_add_res.S
@@ -0,0 +1,162 @@ 
+/*
+ * Loongson LSX optimized add_residual functions for HEVC decoding
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_4x4_8                     /* adds residuals to two 4-byte rows: a0 = dst, a1 = res, a2 = stride */
+    vldrepl.w      vr0,    a0,     0         /* 4 dst bytes of the first row, replicated to every word */
+    add.d          t0,     a0,     a2        /* t0 = address of the second row (a0 + stride) */
+    vldrepl.w      vr1,    t0,     0         /* 4 dst bytes of the second row, replicated */
+    vld            vr2,    a1,     0         /* 16 bytes of residuals = 8 int16 (2 rows x 4) */
+
+    vilvl.w        vr1,    vr1,    vr0       /* low 64 bits = first-row word, then second-row word */
+    vsllwil.hu.bu  vr1,    vr1,    0         /* zero-extend the low 8 bytes to 16-bit lanes (shift by 0) */
+    vadd.h         vr1,    vr1,    vr2       /* pixel + residual in 16-bit lanes */
+    vssrani.bu.h   vr1,    vr1,    0         /* narrow back to bytes with unsigned saturation */
+
+    vstelm.w       vr1,    a0,     0,    0   /* word 0 -> first row */
+    vstelm.w       vr1,    t0,     0,    1   /* word 1 -> second row */
+.endm
+
+function ff_hevc_add_residual4x4_8_lsx       /* 4x4 block: two passes of the 2-row macro */
+    ADD_RES_LSX_4x4_8                        /* rows 0-1 */
+    alsl.d         a0,     a2,     a0,   1   /* a0 += 2 * stride */
+    addi.d         a1,     a1,     16        /* skip the 8 int16 residuals just consumed */
+    ADD_RES_LSX_4x4_8                        /* rows 2-3 */
+endfunc
+
+/*
+ * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_8x8_8                     /* adds residuals to four 8-byte rows: a0 = dst, a1 = res, a2 = stride */
+    vldrepl.d      vr0,    a0,     0         /* 8 dst bytes of row 0, replicated to both halves */
+    add.d          t0,     a0,     a2        /* t0 = row 1 address */
+    vldrepl.d      vr1,    t0,     0         /* row 1 */
+    add.d          t1,     t0,     a2        /* t1 = row 2 address */
+    vldrepl.d      vr2,    t1,     0         /* row 2 */
+    add.d          t2,     t1,     a2        /* t2 = row 3 address */
+    vldrepl.d      vr3,    t2,     0         /* row 3 */
+
+    vld            vr4,    a1,     0         /* residuals at byte offset 0 (8 int16, paired with row 0) */
+    addi.d         t3,     zero,   16        /* t3 = 16, index register for vldx */
+    vldx           vr5,    a1,     t3        /* residuals at byte offset 16 (row 1) */
+    addi.d         t4,     a1,     32        /* t4 = a1 + 32 */
+    vld            vr6,    t4,     0         /* residuals at byte offset 32 (row 2) */
+    vldx           vr7,    t4,     t3        /* residuals at byte offset 48 (row 3) */
+
+    vsllwil.hu.bu  vr0,    vr0,    0         /* zero-extend low 8 bytes to 16-bit lanes (shift by 0) */
+    vsllwil.hu.bu  vr1,    vr1,    0
+    vsllwil.hu.bu  vr2,    vr2,    0
+    vsllwil.hu.bu  vr3,    vr3,    0
+    vadd.h         vr0,    vr0,    vr4       /* pixel + residual, one row per register */
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+    vssrani.bu.h   vr1,    vr0,    0         /* saturate to bytes: low half = row 0 (vr0), high half = row 1 (vr1) */
+    vssrani.bu.h   vr3,    vr2,    0         /* low half = row 2 (vr2), high half = row 3 (vr3) */
+
+    vstelm.d       vr1,    a0,     0,     0  /* doubleword 0 -> row 0 */
+    vstelm.d       vr1,    t0,     0,     1  /* doubleword 1 -> row 1 */
+    vstelm.d       vr3,    t1,     0,     0  /* doubleword 0 -> row 2 */
+    vstelm.d       vr3,    t2,     0,     1  /* doubleword 1 -> row 3 */
+.endm
+
+function ff_hevc_add_residual8x8_8_lsx       /* 8x8 block: two passes of the 4-row macro */
+    ADD_RES_LSX_8x8_8                        /* rows 0-3 */
+    alsl.d         a0,     a2,     a0,    2  /* a0 += 4 * stride */
+    addi.d         a1,     a1,     64        /* skip the 32 int16 residuals just consumed */
+    ADD_RES_LSX_8x8_8                        /* rows 4-7 */
+endfunc
+
+/*
+ * void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+function ff_hevc_add_residual16x16_8_lsx     /* a0 = dst, a1 = res, a2 = stride */
+.rept 8                                      /* unrolled 8 times, two 16-byte rows per repetition */
+    vld            vr0,    a0,     0         /* 16 dst bytes of the first row */
+    vldx           vr2,    a0,     a2        /* 16 dst bytes of the second row (a0 + stride) */
+
+    vld            vr4,    a1,     0         /* residuals at byte offset 0 (first row, low 8 pixels) */
+    addi.d         t0,     zero,   16        /* t0 = 16, index register for vldx */
+    vldx           vr5,    a1,     t0        /* byte offset 16 (first row, high 8 pixels) */
+    addi.d         t1,     a1,     32        /* t1 = a1 + 32 */
+    vld            vr6,    t1,     0         /* byte offset 32 (second row, low 8 pixels) */
+    vldx           vr7,    t1,     t0        /* byte offset 48 (second row, high 8 pixels) */
+
+    vexth.hu.bu    vr1,    vr0               /* zero-extend high 8 bytes; must precede the in-place widen of vr0 */
+    vsllwil.hu.bu  vr0,    vr0,    0         /* zero-extend low 8 bytes in place (shift by 0) */
+    vexth.hu.bu    vr3,    vr2               /* same split for the second row */
+    vsllwil.hu.bu  vr2,    vr2,    0
+    vadd.h         vr0,    vr0,    vr4       /* pixel + residual in 16-bit lanes */
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+
+    vssrani.bu.h   vr1,    vr0,    0         /* saturate to bytes: low half from vr0, high half from vr1 */
+    vssrani.bu.h   vr3,    vr2,    0         /* low half from vr2, high half from vr3 */
+
+    vst            vr1,    a0,     0         /* write back the first row */
+    vstx           vr3,    a0,     a2        /* write back the second row */
+
+    alsl.d         a0,     a2,     a0,   1   /* a0 += 2 * stride */
+    addi.d         a1,     a1,     64        /* skip the 32 int16 residuals just consumed */
+.endr
+endfunc
+
+/*
+ * void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
+ */
+function ff_hevc_add_residual32x32_8_lsx     /* a0 = dst, a1 = res, a2 = stride */
+.rept 32                                     /* unrolled 32 times, one 32-byte row per repetition */
+    vld            vr0,    a0,     0         /* dst bytes 0-15 of the row */
+    addi.w         t0,     zero,   16        /* t0 = 16, index register for vldx/vstx */
+    vldx           vr2,    a0,     t0        /* dst bytes 16-31 of the row */
+
+    vld            vr4,    a1,     0         /* residuals for pixels 0-7 */
+    vldx           vr5,    a1,     t0        /* residuals for pixels 8-15 (byte offset 16) */
+    addi.d         t1,     a1,     32        /* t1 = a1 + 32 */
+    vld            vr6,    t1,     0         /* residuals for pixels 16-23 (byte offset 32) */
+    vldx           vr7,    t1,     t0        /* residuals for pixels 24-31 (byte offset 48) */
+
+    vexth.hu.bu    vr1,    vr0               /* zero-extend high 8 bytes; must precede the in-place widen of vr0 */
+    vsllwil.hu.bu  vr0,    vr0,    0         /* zero-extend low 8 bytes in place (shift by 0) */
+    vexth.hu.bu    vr3,    vr2               /* same split for bytes 16-31 */
+    vsllwil.hu.bu  vr2,    vr2,    0
+    vadd.h         vr0,    vr0,    vr4       /* pixel + residual in 16-bit lanes */
+    vadd.h         vr1,    vr1,    vr5
+    vadd.h         vr2,    vr2,    vr6
+    vadd.h         vr3,    vr3,    vr7
+
+    vssrani.bu.h   vr1,    vr0,    0         /* saturate to bytes: low half from vr0, high half from vr1 */
+    vssrani.bu.h   vr3,    vr2,    0         /* low half from vr2, high half from vr3 */
+
+    vst            vr1,    a0,     0         /* write back bytes 0-15 */
+    vstx           vr3,    a0,     t0        /* write back bytes 16-31 */
+
+    add.d          a0,     a0,     a2        /* advance dst by one row */
+    addi.d         a1,     a1,     64        /* skip the 32 int16 residuals just consumed */
+.endr
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index 5a96f3a4c9..a8f753dc86 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -189,6 +189,11 @@  void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
             c->idct[1] = ff_hevc_idct_8x8_lsx;
             c->idct[2] = ff_hevc_idct_16x16_lsx;
             c->idct[3] = ff_hevc_idct_32x32_lsx;
+
+            c->add_residual[0] = ff_hevc_add_residual4x4_8_lsx;
+            c->add_residual[1] = ff_hevc_add_residual8x8_8_lsx;
+            c->add_residual[2] = ff_hevc_add_residual16x16_8_lsx;
+            c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
         }
     }
 }
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index 0d54196caf..ac509984fd 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -227,4 +227,9 @@  void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit);
 
+void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+
 #endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H