Message ID | tencent_9C8BBDE706B1C8C298781854832CBA865209@qq.com |
---|---|
State | New |
Series | [FFmpeg-devel] lavc/vvc_mc: R-V V avg w_avg |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
Added lpad and resolved conflicts with master. <uk7b@foxmail.com> 于2024年8月3日周六 18:31写道: > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908 X60 > avg_8_2x2_c : 1.2 1.0 > avg_8_2x2_rvv_i32 : 0.7 0.7 > avg_8_2x4_c : 2.0 2.2 > avg_8_2x4_rvv_i32 : 1.2 1.2 > avg_8_2x8_c : 3.7 4.0 > avg_8_2x8_rvv_i32 : 1.7 1.5 > avg_8_2x16_c : 7.2 7.7 > avg_8_2x16_rvv_i32 : 3.0 2.7 > avg_8_2x32_c : 14.2 15.2 > avg_8_2x32_rvv_i32 : 5.5 5.0 > avg_8_2x64_c : 51.0 43.7 > avg_8_2x64_rvv_i32 : 39.2 29.7 > avg_8_2x128_c : 100.5 79.2 > avg_8_2x128_rvv_i32 : 79.7 68.2 > avg_8_4x2_c : 1.7 2.0 > avg_8_4x2_rvv_i32 : 1.0 0.7 > avg_8_4x4_c : 3.5 3.7 > avg_8_4x4_rvv_i32 : 1.2 1.2 > avg_8_4x8_c : 6.7 7.0 > avg_8_4x8_rvv_i32 : 1.7 1.5 > avg_8_4x16_c : 13.5 14.0 > avg_8_4x16_rvv_i32 : 3.0 2.7 > avg_8_4x32_c : 26.2 27.7 > avg_8_4x32_rvv_i32 : 5.5 4.7 > avg_8_4x64_c : 73.0 73.7 > avg_8_4x64_rvv_i32 : 39.0 32.5 > avg_8_4x128_c : 143.0 137.2 > avg_8_4x128_rvv_i32 : 72.7 68.0 > avg_8_8x2_c : 3.5 3.5 > avg_8_8x2_rvv_i32 : 1.0 0.7 > avg_8_8x4_c : 6.2 6.5 > avg_8_8x4_rvv_i32 : 1.5 1.0 > avg_8_8x8_c : 12.7 13.2 > avg_8_8x8_rvv_i32 : 2.0 1.5 > avg_8_8x16_c : 25.0 26.5 > avg_8_8x16_rvv_i32 : 3.2 2.7 > avg_8_8x32_c : 50.0 52.7 > avg_8_8x32_rvv_i32 : 6.2 5.0 > avg_8_8x64_c : 118.7 122.5 > avg_8_8x64_rvv_i32 : 40.2 31.5 > avg_8_8x128_c : 236.7 220.2 > avg_8_8x128_rvv_i32 : 85.2 67.7 > avg_8_16x2_c : 6.2 6.7 > avg_8_16x2_rvv_i32 : 1.2 0.7 > avg_8_16x4_c : 12.5 13.0 > avg_8_16x4_rvv_i32 : 1.7 1.0 > avg_8_16x8_c : 24.5 26.0 > avg_8_16x8_rvv_i32 : 3.0 1.7 > avg_8_16x16_c : 49.0 51.5 > avg_8_16x16_rvv_i32 : 5.5 3.0 > avg_8_16x32_c : 97.5 102.5 > avg_8_16x32_rvv_i32 : 10.5 5.5 > avg_8_16x64_c : 213.7 222.0 > avg_8_16x64_rvv_i32 : 48.5 34.2 > avg_8_16x128_c : 434.7 420.0 > avg_8_16x128_rvv_i32 : 97.7 74.0 > avg_8_32x2_c : 12.2 12.7 > avg_8_32x2_rvv_i32 : 1.5 1.0 > avg_8_32x4_c : 24.5 25.5 > avg_8_32x4_rvv_i32 : 3.0 1.7 > avg_8_32x8_c : 48.5 50.7 > avg_8_32x8_rvv_i32 : 5.2 2.7 > avg_8_32x16_c : 96.7 101.2 > avg_8_32x16_rvv_i32 : 10.2 5.0 > avg_8_32x32_c : 192.7 202.2 > avg_8_32x32_rvv_i32 : 19.7 9.5 > avg_8_32x64_c : 427.5 426.5 > avg_8_32x64_rvv_i32 : 64.2 18.2 > avg_8_32x128_c : 816.5 821.0 > avg_8_32x128_rvv_i32 : 135.2 75.5 > avg_8_64x2_c : 24.0 25.2 > avg_8_64x2_rvv_i32 : 2.7 1.5 > avg_8_64x4_c : 48.2 50.5 > avg_8_64x4_rvv_i32 : 5.0 2.7 > avg_8_64x8_c : 96.0 100.7 > avg_8_64x8_rvv_i32 : 9.7 4.5 > avg_8_64x16_c : 207.7 201.2 > avg_8_64x16_rvv_i32 : 19.0 9.0 > avg_8_64x32_c : 383.2 402.0 > avg_8_64x32_rvv_i32 : 37.5 17.5 > avg_8_64x64_c : 837.2 828.7 > avg_8_64x64_rvv_i32 : 84.7 35.5 > avg_8_64x128_c : 1640.7 1640.2 > avg_8_64x128_rvv_i32 : 206.0 153.0 > avg_8_128x2_c : 48.7 51.0 > avg_8_128x2_rvv_i32 : 5.2 2.7 > avg_8_128x4_c : 96.7 101.5 > avg_8_128x4_rvv_i32 : 10.2 5.0 > avg_8_128x8_c : 192.2 202.0 > avg_8_128x8_rvv_i32 : 19.7 9.2 > avg_8_128x16_c : 400.7 403.2 > avg_8_128x16_rvv_i32 : 38.7 18.5 > avg_8_128x32_c : 786.7 805.7 > avg_8_128x32_rvv_i32 : 77.0 36.2 > avg_8_128x64_c : 1615.5 1655.5 > avg_8_128x64_rvv_i32 : 189.7 80.7 > avg_8_128x128_c : 3182.0 3238.0 > avg_8_128x128_rvv_i32 : 397.5 308.5 > w_avg_8_2x2_c : 1.7 1.2 > w_avg_8_2x2_rvv_i32 : 1.2 1.0 > w_avg_8_2x4_c : 2.7 2.7 > w_avg_8_2x4_rvv_i32 : 1.7 1.5 > w_avg_8_2x8_c : 21.7 4.7 > w_avg_8_2x8_rvv_i32 : 2.7 2.5 > w_avg_8_2x16_c : 9.5 9.2 > w_avg_8_2x16_rvv_i32 : 4.7 4.2 > w_avg_8_2x32_c : 19.0 18.7 > w_avg_8_2x32_rvv_i32 : 9.0 8.0 > w_avg_8_2x64_c : 62.0 50.2 > w_avg_8_2x64_rvv_i32 : 47.7 33.5 > w_avg_8_2x128_c : 116.7 87.7 > w_avg_8_2x128_rvv_i32 : 80.0 69.5 > w_avg_8_4x2_c : 2.5 2.5 > 
w_avg_8_4x2_rvv_i32 : 1.2 1.0 > w_avg_8_4x4_c : 4.7 4.5 > w_avg_8_4x4_rvv_i32 : 1.7 1.7 > w_avg_8_4x8_c : 9.0 8.7 > w_avg_8_4x8_rvv_i32 : 2.7 2.5 > w_avg_8_4x16_c : 17.7 17.5 > w_avg_8_4x16_rvv_i32 : 4.7 4.2 > w_avg_8_4x32_c : 35.0 35.0 > w_avg_8_4x32_rvv_i32 : 9.0 8.0 > w_avg_8_4x64_c : 100.5 84.5 > w_avg_8_4x64_rvv_i32 : 42.2 33.7 > w_avg_8_4x128_c : 203.5 151.2 > w_avg_8_4x128_rvv_i32 : 83.0 69.5 > w_avg_8_8x2_c : 4.5 4.5 > w_avg_8_8x2_rvv_i32 : 1.2 1.2 > w_avg_8_8x4_c : 8.7 8.7 > w_avg_8_8x4_rvv_i32 : 2.0 1.7 > w_avg_8_8x8_c : 17.0 17.0 > w_avg_8_8x8_rvv_i32 : 3.2 2.5 > w_avg_8_8x16_c : 34.0 33.5 > w_avg_8_8x16_rvv_i32 : 5.5 4.2 > w_avg_8_8x32_c : 86.0 67.5 > w_avg_8_8x32_rvv_i32 : 10.5 8.0 > w_avg_8_8x64_c : 187.2 149.5 > w_avg_8_8x64_rvv_i32 : 45.0 35.5 > w_avg_8_8x128_c : 342.7 290.0 > w_avg_8_8x128_rvv_i32 : 108.7 70.2 > w_avg_8_16x2_c : 8.5 8.2 > w_avg_8_16x2_rvv_i32 : 2.0 1.2 > w_avg_8_16x4_c : 16.7 16.7 > w_avg_8_16x4_rvv_i32 : 3.0 1.7 > w_avg_8_16x8_c : 33.2 33.5 > w_avg_8_16x8_rvv_i32 : 5.5 3.0 > w_avg_8_16x16_c : 66.2 66.7 > w_avg_8_16x16_rvv_i32 : 10.5 5.0 > w_avg_8_16x32_c : 132.5 131.0 > w_avg_8_16x32_rvv_i32 : 20.0 9.7 > w_avg_8_16x64_c : 340.0 283.5 > w_avg_8_16x64_rvv_i32 : 60.5 37.2 > w_avg_8_16x128_c : 641.2 597.5 > w_avg_8_16x128_rvv_i32 : 118.7 77.7 > w_avg_8_32x2_c : 16.5 16.7 > w_avg_8_32x2_rvv_i32 : 3.2 1.7 > w_avg_8_32x4_c : 33.2 33.2 > w_avg_8_32x4_rvv_i32 : 5.5 2.7 > w_avg_8_32x8_c : 66.0 62.5 > w_avg_8_32x8_rvv_i32 : 10.5 5.0 > w_avg_8_32x16_c : 131.5 132.0 > w_avg_8_32x16_rvv_i32 : 20.2 9.5 > w_avg_8_32x32_c : 261.7 272.0 > w_avg_8_32x32_rvv_i32 : 39.7 18.0 > w_avg_8_32x64_c : 575.2 545.5 > w_avg_8_32x64_rvv_i32 : 105.5 58.7 > w_avg_8_32x128_c : 1154.2 1088.0 > w_avg_8_32x128_rvv_i32 : 207.0 98.0 > w_avg_8_64x2_c : 33.0 33.0 > w_avg_8_64x2_rvv_i32 : 6.2 2.7 > w_avg_8_64x4_c : 65.5 66.0 > w_avg_8_64x4_rvv_i32 : 11.5 5.0 > w_avg_8_64x8_c : 131.2 132.5 > w_avg_8_64x8_rvv_i32 : 22.5 9.5 > w_avg_8_64x16_c : 268.2 262.5 > w_avg_8_64x16_rvv_i32 : 44.2 18.0 > w_avg_8_64x32_c : 561.5 528.7 > w_avg_8_64x32_rvv_i32 : 88.0 35.2 > w_avg_8_64x64_c : 1136.2 1124.0 > w_avg_8_64x64_rvv_i32 : 222.0 82.2 > w_avg_8_64x128_c : 2345.0 2312.7 > w_avg_8_64x128_rvv_i32 : 423.0 190.5 > w_avg_8_128x2_c : 65.7 66.5 > w_avg_8_128x2_rvv_i32 : 11.2 5.5 > w_avg_8_128x4_c : 131.2 132.2 > w_avg_8_128x4_rvv_i32 : 22.0 10.2 > w_avg_8_128x8_c : 263.5 312.0 > w_avg_8_128x8_rvv_i32 : 43.2 19.7 > w_avg_8_128x16_c : 528.7 526.2 > w_avg_8_128x16_rvv_i32 : 85.5 39.5 > w_avg_8_128x32_c : 1067.7 1062.7 > w_avg_8_128x32_rvv_i32 : 171.7 78.2 > w_avg_8_128x64_c : 2234.7 2168.7 > w_avg_8_128x64_rvv_i32 : 400.0 159.0 > w_avg_8_128x128_c : 4752.5 4295.0 > w_avg_8_128x128_rvv_i32 : 757.7 365.5 > --- > libavcodec/riscv/vvc/Makefile | 2 + > libavcodec/riscv/vvc/vvc_mc_rvv.S | 287 +++++++++++++++++++++++++++++ > libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++ > libavcodec/vvc/dsp.c | 2 + > libavcodec/vvc/dsp.h | 1 + > 5 files changed, 364 insertions(+) > create mode 100644 libavcodec/riscv/vvc/Makefile > create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S > create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c > > diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile > new file mode 100644 > index 0000000000..582b051579 > --- /dev/null > +++ b/libavcodec/riscv/vvc/Makefile > @@ -0,0 +1,2 @@ > +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o > diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S > 
b/libavcodec/riscv/vvc/vvc_mc_rvv.S > new file mode 100644 > index 0000000000..10e1bd67ee > --- /dev/null > +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S > @@ -0,0 +1,287 @@ > +/* > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences > (ISCAS). > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "libavutil/riscv/asm.S" > + > +.macro vsetvlstatic8 w, vlen > + .if \w == 2 && \vlen == 128 > + vsetivli zero, \w, e8, mf8, ta, ma > + .elseif \w == 4 && \vlen == 128 > + vsetivli zero, \w, e8, mf4, ta, ma > + .elseif \w == 8 && \vlen == 128 > + vsetivli zero, \w, e8, mf2, ta, ma > + .elseif \w == 16 && \vlen == 128 > + vsetivli zero, \w, e8, m1, ta, ma > + .elseif \w == 32 && \vlen == 128 > + li t0, \w > + vsetvli zero, t0, e8, m2, ta, ma > + .elseif \w <= 4 && \vlen == 256 > + vsetivli zero, \w, e8, mf8, ta, ma > + .elseif \w == 8 && \vlen == 256 > + vsetivli zero, \w, e8, mf4, ta, ma > + .elseif \w == 16 && \vlen == 256 > + vsetivli zero, \w, e8, mf2, ta, ma > + .elseif \w == 32 && \vlen == 256 > + li t0, \w > + vsetvli zero, t0, e8, m1, ta, ma > + .elseif \w == 64 && \vlen == 256 > + li t0, \w > + vsetvli zero, t0, e8, m2, ta, ma > + .else > + li t0, \w > + vsetvli zero, t0, e8, m4, ta, ma > + .endif > +.endm > + > +.macro vsetvlstatic16 w, vlen > + .if \w == 2 && \vlen == 128 > + vsetivli zero, \w, e16, mf4, ta, ma > + .elseif \w == 4 && \vlen == 128 > + vsetivli zero, \w, e16, mf2, ta, ma > + .elseif \w == 8 && \vlen == 128 > + vsetivli zero, \w, e16, m1, ta, ma > + .elseif \w == 16 && \vlen == 128 > + vsetivli zero, \w, e16, m2, ta, ma > + .elseif \w == 32 && \vlen == 128 > + li t0, \w > + vsetvli zero, t0, e16, m4, ta, ma > + .elseif \w <= 4 && \vlen == 256 > + vsetivli zero, \w, e16, mf4, ta, ma > + .elseif \w == 8 && \vlen == 256 > + vsetivli zero, \w, e16, mf2, ta, ma > + .elseif \w == 16 && \vlen == 256 > + vsetivli zero, \w, e16, m1, ta, ma > + .elseif \w == 32 && \vlen == 256 > + li t0, \w > + vsetvli zero, t0, e16, m2, ta, ma > + .elseif \w == 64 && \vlen == 256 > + li t0, \w > + vsetvli zero, t0, e16, m4, ta, ma > + .else > + li t0, \w > + vsetvli zero, t0, e16, m8, ta, ma > + .endif > +.endm > + > +.macro vsetvlstatic32 w, vlen > + .if \w == 2 > + vsetivli zero, \w, e32, mf2, ta, ma > + .elseif \w == 4 && \vlen == 128 > + vsetivli zero, \w, e32, m1, ta, ma > + .elseif \w == 8 && \vlen == 128 > + vsetivli zero, \w, e32, m2, ta, ma > + .elseif \w == 16 && \vlen == 128 > + vsetivli zero, \w, e32, m4, ta, ma > + .elseif \w == 4 && \vlen == 256 > + vsetivli zero, \w, e32, mf2, ta, ma > + .elseif \w == 8 && \vlen == 256 > + vsetivli zero, \w, e32, m1, ta, ma > + .elseif \w == 16 && \vlen == 256 > + vsetivli zero, \w, e32, m2, ta, ma > + .elseif \w == 32 && \vlen == 256 > + li t0, \w > + vsetvli zero, t0, e32, 
m4, ta, ma > + .else > + li t0, \w > + vsetvli zero, t0, e32, m8, ta, ma > + .endif > +.endm > + > +.macro avg w, vlen, id > +\id\w\vlen: > +.if \w < 128 > + vsetvlstatic16 \w, \vlen > + addi t0, a2, 128*2 > + addi t1, a3, 128*2 > + add t2, a0, a1 > + vle16.v v0, (a2) > + vle16.v v8, (a3) > + addi a5, a5, -2 > + vle16.v v16, (t0) > + vle16.v v24, (t1) > + vadd.vv v8, v8, v0 > + vadd.vv v24, v24, v16 > + vmax.vx v8, v8, zero > + vmax.vx v24, v24, zero > + vsetvlstatic8 \w, \vlen > + addi a2, a2, 128*4 > + vnclipu.wi v8, v8, 7 > + vnclipu.wi v24, v24, 7 > + addi a3, a3, 128*4 > + vse8.v v8, (a0) > + vse8.v v24, (t2) > + sh1add a0, a1, a0 > +.else > + addi a5, a5, -1 > + mv t1, a0 > + mv t2, a2 > + mv t3, a3 > + mv t4, a4 > +1: > + vsetvli t0, a4, e16, m8, ta, ma > + sub a4, a4, t0 > + vle16.v v0, (a2) > + vle16.v v8, (a3) > + vadd.vv v8, v8, v0 > + vmax.vx v8, v8, zero > + vsetvli zero, zero, e8, m4, ta, ma > + vnclipu.wi v8, v8, 7 > + vse8.v v8, (a0) > + sh1add a2, t0, a2 > + sh1add a3, t0, a3 > + add a0, a0, t0 > + bnez a4, 1b > + add a0, t1, a1 > + addi a2, t2, 128*2 > + addi a3, t3, 128*2 > + mv a4, t4 > +.endif > + bnez a5, \id\w\vlen\()b > + ret > +.endm > + > + > +.macro AVG_JMP_TABLE id, vlen > +const jmp_table_\id\vlen > + .4byte \id\()2\vlen\()f - jmp_table_\id\vlen > + .4byte \id\()4\vlen\()f - jmp_table_\id\vlen > + .4byte \id\()8\vlen\()f - jmp_table_\id\vlen > + .4byte \id\()16\vlen\()f - jmp_table_\id\vlen > + .4byte \id\()32\vlen\()f - jmp_table_\id\vlen > + .4byte \id\()64\vlen\()f - jmp_table_\id\vlen > + .4byte \id\()128\vlen\()f - jmp_table_\id\vlen > +endconst > +.endm > + > +.macro AVG_J vlen, id > + clz t1, a4 > + neg t1, t1 > + lla t5, jmp_table_\id\vlen > + sh2add t1, t1, t5 > + lw t1, ((__riscv_xlen-2)<<2)(t1) > + add t1, t1, t5 > + jr t1 > +.endm > + > +.macro func_avg vlen > +func ff_vvc_avg_8_rvv_\vlen\(), zve32x > + lpad 0 > + AVG_JMP_TABLE 1, \vlen > + csrwi vxrm, 0 > + AVG_J \vlen, 1 > + .irp w,2,4,8,16,32,64,128 > + avg \w, \vlen, 1 > + .endr > +endfunc > +.endm > + > +func_avg 128 > +func_avg 256 > + > +#if (__riscv_xlen == 64) > +.macro w_avg w, vlen, id > +\id\w\vlen: > +.if \w <= 32 || (\w == 64 && \vlen == 256) > + vsetvlstatic16 \w, \vlen > + addi t0, a2, 128*2 > + addi t1, a3, 128*2 > + vle16.v v0, (a2) > + vle16.v v4, (a3) > + addi a5, a5, -2 > + vle16.v v8, (t0) > + vle16.v v12, (t1) > + vwmul.vx v16, v0, a7 > + vwmul.vx v24, v8, a7 > + vwmacc.vx v16, t3, v4 > + vwmacc.vx v24, t3, v12 > + vsetvlstatic32 \w, \vlen > + add t2, a0, a1 > + vadd.vx v16, v16, t4 > + vadd.vx v24, v24, t4 > + vsetvlstatic16 \w, \vlen > + vnsrl.wx v16, v16, t6 > + vnsrl.wx v24, v24, t6 > + vmax.vx v16, v16, zero > + vmax.vx v24, v24, zero > + vsetvlstatic8 \w, \vlen > + addi a2, a2, 128*4 > + vnclipu.wi v16, v16, 0 > + vnclipu.wi v24, v24, 0 > + vse8.v v16, (a0) > + addi a3, a3, 128*4 > + vse8.v v24, (t2) > + sh1add a0, a1, a0 > +.else > + addi a5, a5, -1 > + mv t1, a0 > + mv t2, a2 > + mv t5, a3 > + mv a6, a4 > +1: > + vsetvli t0, a4, e16, m4, ta, ma > + sub a4, a4, t0 > + vle16.v v0, (a2) > + vle16.v v4, (a3) > + vwmul.vx v16, v0, a7 > + vwmacc.vx v16, t3, v4 > + vsetvli zero, zero, e32, m8, ta, ma > + vadd.vx v16, v16, t4 > + vsetvli zero, zero, e16, m4, ta, ma > + vnsrl.wx v16, v16, t6 > + vmax.vx v16, v16, zero > + vsetvli zero, zero, e8, m2, ta, ma > + vnclipu.wi v16, v16, 0 > + vse8.v v16, (a0) > + sh1add a2, t0, a2 > + sh1add a3, t0, a3 > + add a0, a0, t0 > + bnez a4, 1b > + add a0, t1, a1 > + addi a2, t2, 128*2 > + addi a3, t5, 128*2 > + mv a4, a6 > +.endif > + bnez 
a5, \id\w\vlen\()b > + ret > +.endm > + > +.macro func_w_avg vlen > +func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x > + lpad 0 > + AVG_JMP_TABLE 2, \vlen > + csrwi vxrm, 0 > + addi t6, a6, 7 > + ld t3, (sp) > + ld t4, 8(sp) > + ld t5, 16(sp) > + addi t4, t4, 1 // o0 + o1 + 1 > + add t4, t4, t5 > + addi t5, t6, -1 // shift - 1 > + sll t4, t4, t5 > + AVG_J \vlen, 2 > + .irp w,2,4,8,16,32,64,128 > + w_avg \w, \vlen, 2 > + .endr > +endfunc > +.endm > + > +func_w_avg 128 > +func_w_avg 256 > +#endif > diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c > b/libavcodec/riscv/vvc/vvcdsp_init.c > new file mode 100644 > index 0000000000..9819a7c570 > --- /dev/null > +++ b/libavcodec/riscv/vvc/vvcdsp_init.c > @@ -0,0 +1,72 @@ > +/* > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences > (ISCAS). > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "config.h" > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/riscv/cpu.h" > +#include "libavcodec/vvc/dsp.h" > + > +#define bf(fn, bd, opt) fn##_##bd##_##opt > + > +#define AVG_PROTOTYPES(bd, opt) > \ > +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, > \ > + const int16_t *src0, const int16_t *src1, int width, int height); > \ > +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, > \ > + const int16_t *src0, const int16_t *src1, int width, int height, > \ > + int denom, int w0, int w1, int o0, int o1); > + > +AVG_PROTOTYPES(8, rvv_128) > +AVG_PROTOTYPES(8, rvv_256) > + > +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd) > +{ > +#if HAVE_RVV > + const int flags = av_get_cpu_flags(); > + int vlenb = ff_get_rv_vlenb(); > + > + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) && > + vlenb >= 32) { > + switch (bd) { > + case 8: > + c->inter.avg = ff_vvc_avg_8_rvv_256; > +# if (__riscv_xlen == 64) > + c->inter.w_avg = ff_vvc_w_avg_8_rvv_256; > +# endif > + break; > + default: > + break; > + } > + } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & > AV_CPU_FLAG_RVB_ADDR) && > + vlenb >= 16) { > + switch (bd) { > + case 8: > + c->inter.avg = ff_vvc_avg_8_rvv_128; > +# if (__riscv_xlen == 64) > + c->inter.w_avg = ff_vvc_w_avg_8_rvv_128; > +# endif > + break; > + default: > + break; > + } > + } > +#endif > +} > diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c > index 648d54ebb2..0d2e315395 100644 > --- a/libavcodec/vvc/dsp.c > +++ b/libavcodec/vvc/dsp.c > @@ -123,6 +123,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int > bit_depth) > > #if ARCH_AARCH64 > ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth); > +#elif ARCH_RISCV > + ff_vvc_dsp_init_riscv(vvcdsp, bit_depth); > #elif ARCH_X86 > ff_vvc_dsp_init_x86(vvcdsp, bit_depth); > #endif > diff --git a/libavcodec/vvc/dsp.h 
b/libavcodec/vvc/dsp.h > index 0b49b97021..4933cca891 100644 > --- a/libavcodec/vvc/dsp.h > +++ b/libavcodec/vvc/dsp.h > @@ -181,6 +181,7 @@ typedef struct VVCDSPContext { > void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); > > void ff_vvc_dsp_init_aarch64(VVCDSPContext *hpc, const int bit_depth); > +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth); > void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); > > #endif /* AVCODEC_VVC_DSP_H */ > -- > 2.46.0 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
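For readers following the quoted assembly, here is a rough scalar model of what the 8-bit `avg` kernel computes. It is not part of the patch; it is inferred from the `vadd`/`vmax`/`vnclipu.wi 7` sequence (with vxrm set to round-to-nearest-up) and from the fixed 128-element (MAX_PB_SIZE) stride of the 16-bit intermediate buffers, so treat it as a sketch rather than the reference implementation:

```c
#include <stdint.h>
#include <stddef.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Sketch of the 8-bit bi-prediction average: add the two 16-bit
 * intermediates, round, shift by 7 and clip to the pixel range. */
static void vvc_avg_8_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *src0, const int16_t *src1,
                             int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = clip_uint8((src0[x] + src1[x] + 64) >> 7);
        src0 += 128;            /* MAX_PB_SIZE elements per row */
        src1 += 128;
        dst  += dst_stride;
    }
}
```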
On 3 August 2024, at 13:30:34 GMT+03:00, uk7b@foxmail.com wrote:

> [...]
>+        vadd.vv       v8, v8, v0
>+        vadd.vv       v24, v24, v16
>+        vmax.vx       v8, v8, zero
>+        vmax.vx       v24, v24, zero

With short widths, scaling vertically (rather than horizontally) with strides is likely faster. See also the h.264 weight and biweight functions, which provide a similar algorithm.
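A minimal C sketch of that suggestion, assuming narrow blocks are folded several rows at a time (the batching factor and function name below are illustrative, not from the patch): because the intermediate sources have a fixed 128-element stride, a strided or segmented load can gather several short rows into one vector pass, so the effective vector length becomes rows × width instead of width.

```c
#include <stdint.h>
#include <stddef.h>

#define SRC_STRIDE 128          /* MAX_PB_SIZE: elements between source rows */

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Hypothetical batched variant for narrow widths (w = 2, 4, 8): each pass of
 * the outer loop models one vector operation over rows*width lanes, which in
 * RVV would use strided loads/stores (vlse16.v / vsse8.v) or segment loads
 * instead of issuing one short vector operation per row. */
static void avg_8_narrow_batched(uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *src0, const int16_t *src1,
                                 int width, int height)
{
    const int batch = 64 / width;                 /* rows per pass, illustrative */
    for (int y = 0; y < height; y += batch) {
        const int rows = height - y < batch ? height - y : batch;
        for (int i = 0; i < rows * width; i++) {  /* one conceptual vector pass */
            const int row = i / width, col = i % width;
            const int v = src0[row * SRC_STRIDE + col] + src1[row * SRC_STRIDE + col];
            dst[row * dst_stride + col] = clip_uint8((v + 64) >> 7);
        }
        src0 += (ptrdiff_t)rows * SRC_STRIDE;
        src1 += (ptrdiff_t)rows * SRC_STRIDE;
        dst  += rows * dst_stride;
    }
}
```

Whether this actually wins depends on how the core handles strided memory accesses; the later discussion in this thread compares it against the segment loads used by the H.264 weight functions.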
How can I test the weight and biweight of H.264? I haven't seen the related test code.

tests/checkasm/checkasm --bench --test=h264dsp

Rémi Denis-Courmont <remi@remlab.net> wrote on Thursday, 15 August 2024, at 16:10:

> [...]
I wrote `ff_vvc_w_avg_8_rvv` by mimicking the H.264 weight function. Based on the test results for 49 different block sizes, most were significantly slower; only 2x32 and 2x64 performed about the same, with no noticeable improvement. I'm not sure about the reason. One difference is that `ff_h264_weight_pixels_8_rvv` needs only one `vlsseg2e8.v`, while `ff_vvc_w_avg_8_rvv` needs two `vlsseg2e16.v`. The vector work inside the loop of `ff_h264_weight_pixels_8_rvv` is heavier than in `ff_vvc_w_avg_8_rvv`, but it uses fewer scalar operations.
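For context, here is a scalar sketch of what `ff_vvc_w_avg_8_rvv` has to compute per sample, with the shift and offset taken from the scalar setup in `func_w_avg` (an inference from the assembly, not code from the patch). Note that it reads two separate 16-bit intermediate buffers per output pixel (four bytes loaded per byte stored), whereas the H.264 weight/biweight kernels operate directly on 8-bit pixels, which fits twice as many elements into a vector register at the same LMUL and is consistent with the slowdown described above.

```c
#include <stdint.h>
#include <stddef.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Sketch of the 8-bit weighted average: shift = denom + 7 and
 * offset = (o0 + o1 + 1) << (shift - 1), matching "addi t6, a6, 7" and the
 * t4 computation in func_w_avg. */
static void vvc_w_avg_8_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *src0, const int16_t *src1,
                               int width, int height,
                               int denom, int w0, int w1, int o0, int o1)
{
    const int shift  = denom + 7;
    const int offset = (o0 + o1 + 1) << (shift - 1);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = clip_uint8((src0[x] * w0 + src1[x] * w1 + offset) >> shift);
        src0 += 128;            /* MAX_PB_SIZE elements per row */
        src1 += 128;
        dst  += dst_stride;
    }
}
```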
403.2 > >avg_8_128x16_rvv_i32 : 38.7 18.5 > >avg_8_128x32_c : 786.7 805.7 > >avg_8_128x32_rvv_i32 : 77.0 36.2 > >avg_8_128x64_c : 1615.5 1655.5 > >avg_8_128x64_rvv_i32 : 189.7 80.7 > >avg_8_128x128_c : 3182.0 3238.0 > >avg_8_128x128_rvv_i32 : 397.5 308.5 > >w_avg_8_2x2_c : 1.7 1.2 > >w_avg_8_2x2_rvv_i32 : 1.2 1.0 > >w_avg_8_2x4_c : 2.7 2.7 > >w_avg_8_2x4_rvv_i32 : 1.7 1.5 > >w_avg_8_2x8_c : 21.7 4.7 > >w_avg_8_2x8_rvv_i32 : 2.7 2.5 > >w_avg_8_2x16_c : 9.5 9.2 > >w_avg_8_2x16_rvv_i32 : 4.7 4.2 > >w_avg_8_2x32_c : 19.0 18.7 > >w_avg_8_2x32_rvv_i32 : 9.0 8.0 > >w_avg_8_2x64_c : 62.0 50.2 > >w_avg_8_2x64_rvv_i32 : 47.7 33.5 > >w_avg_8_2x128_c : 116.7 87.7 > >w_avg_8_2x128_rvv_i32 : 80.0 69.5 > >w_avg_8_4x2_c : 2.5 2.5 > >w_avg_8_4x2_rvv_i32 : 1.2 1.0 > >w_avg_8_4x4_c : 4.7 4.5 > >w_avg_8_4x4_rvv_i32 : 1.7 1.7 > >w_avg_8_4x8_c : 9.0 8.7 > >w_avg_8_4x8_rvv_i32 : 2.7 2.5 > >w_avg_8_4x16_c : 17.7 17.5 > >w_avg_8_4x16_rvv_i32 : 4.7 4.2 > >w_avg_8_4x32_c : 35.0 35.0 > >w_avg_8_4x32_rvv_i32 : 9.0 8.0 > >w_avg_8_4x64_c : 100.5 84.5 > >w_avg_8_4x64_rvv_i32 : 42.2 33.7 > >w_avg_8_4x128_c : 203.5 151.2 > >w_avg_8_4x128_rvv_i32 : 83.0 69.5 > >w_avg_8_8x2_c : 4.5 4.5 > >w_avg_8_8x2_rvv_i32 : 1.2 1.2 > >w_avg_8_8x4_c : 8.7 8.7 > >w_avg_8_8x4_rvv_i32 : 2.0 1.7 > >w_avg_8_8x8_c : 17.0 17.0 > >w_avg_8_8x8_rvv_i32 : 3.2 2.5 > >w_avg_8_8x16_c : 34.0 33.5 > >w_avg_8_8x16_rvv_i32 : 5.5 4.2 > >w_avg_8_8x32_c : 86.0 67.5 > >w_avg_8_8x32_rvv_i32 : 10.5 8.0 > >w_avg_8_8x64_c : 187.2 149.5 > >w_avg_8_8x64_rvv_i32 : 45.0 35.5 > >w_avg_8_8x128_c : 342.7 290.0 > >w_avg_8_8x128_rvv_i32 : 108.7 70.2 > >w_avg_8_16x2_c : 8.5 8.2 > >w_avg_8_16x2_rvv_i32 : 2.0 1.2 > >w_avg_8_16x4_c : 16.7 16.7 > >w_avg_8_16x4_rvv_i32 : 3.0 1.7 > >w_avg_8_16x8_c : 33.2 33.5 > >w_avg_8_16x8_rvv_i32 : 5.5 3.0 > >w_avg_8_16x16_c : 66.2 66.7 > >w_avg_8_16x16_rvv_i32 : 10.5 5.0 > >w_avg_8_16x32_c : 132.5 131.0 > >w_avg_8_16x32_rvv_i32 : 20.0 9.7 > >w_avg_8_16x64_c : 340.0 283.5 > >w_avg_8_16x64_rvv_i32 : 60.5 37.2 > >w_avg_8_16x128_c : 641.2 597.5 > >w_avg_8_16x128_rvv_i32 : 118.7 77.7 > >w_avg_8_32x2_c : 16.5 16.7 > >w_avg_8_32x2_rvv_i32 : 3.2 1.7 > >w_avg_8_32x4_c : 33.2 33.2 > >w_avg_8_32x4_rvv_i32 : 5.5 2.7 > >w_avg_8_32x8_c : 66.0 62.5 > >w_avg_8_32x8_rvv_i32 : 10.5 5.0 > >w_avg_8_32x16_c : 131.5 132.0 > >w_avg_8_32x16_rvv_i32 : 20.2 9.5 > >w_avg_8_32x32_c : 261.7 272.0 > >w_avg_8_32x32_rvv_i32 : 39.7 18.0 > >w_avg_8_32x64_c : 575.2 545.5 > >w_avg_8_32x64_rvv_i32 : 105.5 58.7 > >w_avg_8_32x128_c : 1154.2 1088.0 > >w_avg_8_32x128_rvv_i32 : 207.0 98.0 > >w_avg_8_64x2_c : 33.0 33.0 > >w_avg_8_64x2_rvv_i32 : 6.2 2.7 > >w_avg_8_64x4_c : 65.5 66.0 > >w_avg_8_64x4_rvv_i32 : 11.5 5.0 > >w_avg_8_64x8_c : 131.2 132.5 > >w_avg_8_64x8_rvv_i32 : 22.5 9.5 > >w_avg_8_64x16_c : 268.2 262.5 > >w_avg_8_64x16_rvv_i32 : 44.2 18.0 > >w_avg_8_64x32_c : 561.5 528.7 > >w_avg_8_64x32_rvv_i32 : 88.0 35.2 > >w_avg_8_64x64_c : 1136.2 1124.0 > >w_avg_8_64x64_rvv_i32 : 222.0 82.2 > >w_avg_8_64x128_c : 2345.0 2312.7 > >w_avg_8_64x128_rvv_i32 : 423.0 190.5 > >w_avg_8_128x2_c : 65.7 66.5 > >w_avg_8_128x2_rvv_i32 : 11.2 5.5 > >w_avg_8_128x4_c : 131.2 132.2 > >w_avg_8_128x4_rvv_i32 : 22.0 10.2 > >w_avg_8_128x8_c : 263.5 312.0 > >w_avg_8_128x8_rvv_i32 : 43.2 19.7 > >w_avg_8_128x16_c : 528.7 526.2 > >w_avg_8_128x16_rvv_i32 : 85.5 39.5 > >w_avg_8_128x32_c : 1067.7 1062.7 > >w_avg_8_128x32_rvv_i32 : 171.7 78.2 > >w_avg_8_128x64_c : 2234.7 2168.7 > >w_avg_8_128x64_rvv_i32 : 400.0 159.0 > >w_avg_8_128x128_c : 4752.5 4295.0 > >w_avg_8_128x128_rvv_i32 : 757.7 365.5 > >--- > > 
libavcodec/riscv/vvc/Makefile | 2 + > > libavcodec/riscv/vvc/vvc_mc_rvv.S | 287 +++++++++++++++++++++++++++++ > > libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++ > > libavcodec/vvc/dsp.c | 2 + > > libavcodec/vvc/dsp.h | 1 + > > 5 files changed, 364 insertions(+) > > create mode 100644 libavcodec/riscv/vvc/Makefile > > create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S > > create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c > > > >diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile > >new file mode 100644 > >index 0000000000..582b051579 > >--- /dev/null > >+++ b/libavcodec/riscv/vvc/Makefile > >@@ -0,0 +1,2 @@ > >+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o > >+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o > >diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S > b/libavcodec/riscv/vvc/vvc_mc_rvv.S > >new file mode 100644 > >index 0000000000..10e1bd67ee > >--- /dev/null > >+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S > >@@ -0,0 +1,287 @@ > >+/* > >+ * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences > (ISCAS). > >+ * > >+ * This file is part of FFmpeg. > >+ * > >+ * FFmpeg is free software; you can redistribute it and/or > >+ * modify it under the terms of the GNU Lesser General Public > >+ * License as published by the Free Software Foundation; either > >+ * version 2.1 of the License, or (at your option) any later version. > >+ * > >+ * FFmpeg is distributed in the hope that it will be useful, > >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of > >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >+ * Lesser General Public License for more details. > >+ * > >+ * You should have received a copy of the GNU Lesser General Public > >+ * License along with FFmpeg; if not, write to the Free Software > >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > >+ */ > >+ > >+#include "libavutil/riscv/asm.S" > >+ > >+.macro vsetvlstatic8 w, vlen > >+ .if \w == 2 && \vlen == 128 > >+ vsetivli zero, \w, e8, mf8, ta, ma > >+ .elseif \w == 4 && \vlen == 128 > >+ vsetivli zero, \w, e8, mf4, ta, ma > >+ .elseif \w == 8 && \vlen == 128 > >+ vsetivli zero, \w, e8, mf2, ta, ma > >+ .elseif \w == 16 && \vlen == 128 > >+ vsetivli zero, \w, e8, m1, ta, ma > >+ .elseif \w == 32 && \vlen == 128 > >+ li t0, \w > >+ vsetvli zero, t0, e8, m2, ta, ma > >+ .elseif \w <= 4 && \vlen == 256 > >+ vsetivli zero, \w, e8, mf8, ta, ma > >+ .elseif \w == 8 && \vlen == 256 > >+ vsetivli zero, \w, e8, mf4, ta, ma > >+ .elseif \w == 16 && \vlen == 256 > >+ vsetivli zero, \w, e8, mf2, ta, ma > >+ .elseif \w == 32 && \vlen == 256 > >+ li t0, \w > >+ vsetvli zero, t0, e8, m1, ta, ma > >+ .elseif \w == 64 && \vlen == 256 > >+ li t0, \w > >+ vsetvli zero, t0, e8, m2, ta, ma > >+ .else > >+ li t0, \w > >+ vsetvli zero, t0, e8, m4, ta, ma > >+ .endif > >+.endm > >+ > >+.macro vsetvlstatic16 w, vlen > >+ .if \w == 2 && \vlen == 128 > >+ vsetivli zero, \w, e16, mf4, ta, ma > >+ .elseif \w == 4 && \vlen == 128 > >+ vsetivli zero, \w, e16, mf2, ta, ma > >+ .elseif \w == 8 && \vlen == 128 > >+ vsetivli zero, \w, e16, m1, ta, ma > >+ .elseif \w == 16 && \vlen == 128 > >+ vsetivli zero, \w, e16, m2, ta, ma > >+ .elseif \w == 32 && \vlen == 128 > >+ li t0, \w > >+ vsetvli zero, t0, e16, m4, ta, ma > >+ .elseif \w <= 4 && \vlen == 256 > >+ vsetivli zero, \w, e16, mf4, ta, ma > >+ .elseif \w == 8 && \vlen == 256 > >+ vsetivli zero, \w, e16, mf2, ta, ma > >+ .elseif \w == 16 && \vlen == 256 > >+ vsetivli zero, \w, e16, m1, 
ta, ma > >+ .elseif \w == 32 && \vlen == 256 > >+ li t0, \w > >+ vsetvli zero, t0, e16, m2, ta, ma > >+ .elseif \w == 64 && \vlen == 256 > >+ li t0, \w > >+ vsetvli zero, t0, e16, m4, ta, ma > >+ .else > >+ li t0, \w > >+ vsetvli zero, t0, e16, m8, ta, ma > >+ .endif > >+.endm > >+ > >+.macro vsetvlstatic32 w, vlen > >+ .if \w == 2 > >+ vsetivli zero, \w, e32, mf2, ta, ma > >+ .elseif \w == 4 && \vlen == 128 > >+ vsetivli zero, \w, e32, m1, ta, ma > >+ .elseif \w == 8 && \vlen == 128 > >+ vsetivli zero, \w, e32, m2, ta, ma > >+ .elseif \w == 16 && \vlen == 128 > >+ vsetivli zero, \w, e32, m4, ta, ma > >+ .elseif \w == 4 && \vlen == 256 > >+ vsetivli zero, \w, e32, mf2, ta, ma > >+ .elseif \w == 8 && \vlen == 256 > >+ vsetivli zero, \w, e32, m1, ta, ma > >+ .elseif \w == 16 && \vlen == 256 > >+ vsetivli zero, \w, e32, m2, ta, ma > >+ .elseif \w == 32 && \vlen == 256 > >+ li t0, \w > >+ vsetvli zero, t0, e32, m4, ta, ma > >+ .else > >+ li t0, \w > >+ vsetvli zero, t0, e32, m8, ta, ma > >+ .endif > >+.endm > >+ > >+.macro avg w, vlen, id > >+\id\w\vlen: > >+.if \w < 128 > >+ vsetvlstatic16 \w, \vlen > >+ addi t0, a2, 128*2 > >+ addi t1, a3, 128*2 > >+ add t2, a0, a1 > >+ vle16.v v0, (a2) > >+ vle16.v v8, (a3) > >+ addi a5, a5, -2 > >+ vle16.v v16, (t0) > >+ vle16.v v24, (t1) > >+ vadd.vv v8, v8, v0 > >+ vadd.vv v24, v24, v16 > >+ vmax.vx v8, v8, zero > >+ vmax.vx v24, v24, zero > > With short widths, scaling vertically (rather than horizontally) with > strides is likely faster. See also the h.264 weight and biweight functions, > which provide a similar algorithm. > > >+ vsetvlstatic8 \w, \vlen > >+ addi a2, a2, 128*4 > >+ vnclipu.wi v8, v8, 7 > >+ vnclipu.wi v24, v24, 7 > >+ addi a3, a3, 128*4 > >+ vse8.v v8, (a0) > >+ vse8.v v24, (t2) > >+ sh1add a0, a1, a0 > >+.else > >+ addi a5, a5, -1 > >+ mv t1, a0 > >+ mv t2, a2 > >+ mv t3, a3 > >+ mv t4, a4 > >+1: > >+ vsetvli t0, a4, e16, m8, ta, ma > >+ sub a4, a4, t0 > >+ vle16.v v0, (a2) > >+ vle16.v v8, (a3) > >+ vadd.vv v8, v8, v0 > >+ vmax.vx v8, v8, zero > >+ vsetvli zero, zero, e8, m4, ta, ma > >+ vnclipu.wi v8, v8, 7 > >+ vse8.v v8, (a0) > >+ sh1add a2, t0, a2 > >+ sh1add a3, t0, a3 > >+ add a0, a0, t0 > >+ bnez a4, 1b > >+ add a0, t1, a1 > >+ addi a2, t2, 128*2 > >+ addi a3, t3, 128*2 > >+ mv a4, t4 > >+.endif > >+ bnez a5, \id\w\vlen\()b > >+ ret > >+.endm > >+ > >+ > >+.macro AVG_JMP_TABLE id, vlen > >+const jmp_table_\id\vlen > >+ .4byte \id\()2\vlen\()f - jmp_table_\id\vlen > >+ .4byte \id\()4\vlen\()f - jmp_table_\id\vlen > >+ .4byte \id\()8\vlen\()f - jmp_table_\id\vlen > >+ .4byte \id\()16\vlen\()f - jmp_table_\id\vlen > >+ .4byte \id\()32\vlen\()f - jmp_table_\id\vlen > >+ .4byte \id\()64\vlen\()f - jmp_table_\id\vlen > >+ .4byte \id\()128\vlen\()f - jmp_table_\id\vlen > >+endconst > >+.endm > >+ > >+.macro AVG_J vlen, id > >+ clz t1, a4 > >+ neg t1, t1 > >+ lla t5, jmp_table_\id\vlen > >+ sh2add t1, t1, t5 > >+ lw t1, ((__riscv_xlen-2)<<2)(t1) > >+ add t1, t1, t5 > >+ jr t1 > >+.endm > >+ > >+.macro func_avg vlen > >+func ff_vvc_avg_8_rvv_\vlen\(), zve32x > >+ lpad 0 > >+ AVG_JMP_TABLE 1, \vlen > >+ csrwi vxrm, 0 > >+ AVG_J \vlen, 1 > >+ .irp w,2,4,8,16,32,64,128 > >+ avg \w, \vlen, 1 > >+ .endr > >+endfunc > >+.endm > >+ > >+func_avg 128 > >+func_avg 256 > >+ > >+#if (__riscv_xlen == 64) > >+.macro w_avg w, vlen, id > >+\id\w\vlen: > >+.if \w <= 32 || (\w == 64 && \vlen == 256) > >+ vsetvlstatic16 \w, \vlen > >+ addi t0, a2, 128*2 > >+ addi t1, a3, 128*2 > >+ vle16.v v0, (a2) > >+ vle16.v v4, (a3) > >+ addi a5, a5, -2 > >+ vle16.v 
v8, (t0) > >+ vle16.v v12, (t1) > >+ vwmul.vx v16, v0, a7 > >+ vwmul.vx v24, v8, a7 > >+ vwmacc.vx v16, t3, v4 > >+ vwmacc.vx v24, t3, v12 > >+ vsetvlstatic32 \w, \vlen > >+ add t2, a0, a1 > >+ vadd.vx v16, v16, t4 > >+ vadd.vx v24, v24, t4 > >+ vsetvlstatic16 \w, \vlen > >+ vnsrl.wx v16, v16, t6 > >+ vnsrl.wx v24, v24, t6 > >+ vmax.vx v16, v16, zero > >+ vmax.vx v24, v24, zero > >+ vsetvlstatic8 \w, \vlen > >+ addi a2, a2, 128*4 > >+ vnclipu.wi v16, v16, 0 > >+ vnclipu.wi v24, v24, 0 > >+ vse8.v v16, (a0) > >+ addi a3, a3, 128*4 > >+ vse8.v v24, (t2) > >+ sh1add a0, a1, a0 > >+.else > >+ addi a5, a5, -1 > >+ mv t1, a0 > >+ mv t2, a2 > >+ mv t5, a3 > >+ mv a6, a4 > >+1: > >+ vsetvli t0, a4, e16, m4, ta, ma > >+ sub a4, a4, t0 > >+ vle16.v v0, (a2) > >+ vle16.v v4, (a3) > >+ vwmul.vx v16, v0, a7 > >+ vwmacc.vx v16, t3, v4 > >+ vsetvli zero, zero, e32, m8, ta, ma > >+ vadd.vx v16, v16, t4 > >+ vsetvli zero, zero, e16, m4, ta, ma > >+ vnsrl.wx v16, v16, t6 > >+ vmax.vx v16, v16, zero > >+ vsetvli zero, zero, e8, m2, ta, ma > >+ vnclipu.wi v16, v16, 0 > >+ vse8.v v16, (a0) > >+ sh1add a2, t0, a2 > >+ sh1add a3, t0, a3 > >+ add a0, a0, t0 > >+ bnez a4, 1b > >+ add a0, t1, a1 > >+ addi a2, t2, 128*2 > >+ addi a3, t5, 128*2 > >+ mv a4, a6 > >+.endif > >+ bnez a5, \id\w\vlen\()b > >+ ret > >+.endm > >+ > >+.macro func_w_avg vlen > >+func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x > >+ lpad 0 > >+ AVG_JMP_TABLE 2, \vlen > >+ csrwi vxrm, 0 > >+ addi t6, a6, 7 > >+ ld t3, (sp) > >+ ld t4, 8(sp) > >+ ld t5, 16(sp) > >+ addi t4, t4, 1 // o0 + o1 + 1 > >+ add t4, t4, t5 > >+ addi t5, t6, -1 // shift - 1 > >+ sll t4, t4, t5 > >+ AVG_J \vlen, 2 > >+ .irp w,2,4,8,16,32,64,128 > >+ w_avg \w, \vlen, 2 > >+ .endr > >+endfunc > >+.endm > >+ > >+func_w_avg 128 > >+func_w_avg 256 > >+#endif > >diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c > b/libavcodec/riscv/vvc/vvcdsp_init.c > >new file mode 100644 > >index 0000000000..9819a7c570 > >--- /dev/null > >+++ b/libavcodec/riscv/vvc/vvcdsp_init.c > >@@ -0,0 +1,72 @@ > >+/* > >+ * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences > (ISCAS). > >+ * > >+ * This file is part of FFmpeg. > >+ * > >+ * FFmpeg is free software; you can redistribute it and/or > >+ * modify it under the terms of the GNU Lesser General Public > >+ * License as published by the Free Software Foundation; either > >+ * version 2.1 of the License, or (at your option) any later version. > >+ * > >+ * FFmpeg is distributed in the hope that it will be useful, > >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of > >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >+ * Lesser General Public License for more details. 
> >+ * > >+ * You should have received a copy of the GNU Lesser General Public > >+ * License along with FFmpeg; if not, write to the Free Software > >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > >+ */ > >+ > >+#include "config.h" > >+ > >+#include "libavutil/attributes.h" > >+#include "libavutil/cpu.h" > >+#include "libavutil/riscv/cpu.h" > >+#include "libavcodec/vvc/dsp.h" > >+ > >+#define bf(fn, bd, opt) fn##_##bd##_##opt > >+ > >+#define AVG_PROTOTYPES(bd, opt) > \ > >+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, > \ > >+ const int16_t *src0, const int16_t *src1, int width, int height); > \ > >+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, > \ > >+ const int16_t *src0, const int16_t *src1, int width, int height, > \ > >+ int denom, int w0, int w1, int o0, int o1); > >+ > >+AVG_PROTOTYPES(8, rvv_128) > >+AVG_PROTOTYPES(8, rvv_256) > >+ > >+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd) > >+{ > >+#if HAVE_RVV > >+ const int flags = av_get_cpu_flags(); > >+ int vlenb = ff_get_rv_vlenb(); > >+ > >+ if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) > && > >+ vlenb >= 32) { > >+ switch (bd) { > >+ case 8: > >+ c->inter.avg = ff_vvc_avg_8_rvv_256; > >+# if (__riscv_xlen == 64) > >+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_256; > >+# endif > >+ break; > >+ default: > >+ break; > >+ } > >+ } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & > AV_CPU_FLAG_RVB_ADDR) && > >+ vlenb >= 16) { > >+ switch (bd) { > >+ case 8: > >+ c->inter.avg = ff_vvc_avg_8_rvv_128; > >+# if (__riscv_xlen == 64) > >+ c->inter.w_avg = ff_vvc_w_avg_8_rvv_128; > >+# endif > >+ break; > >+ default: > >+ break; > >+ } > >+ } > >+#endif > >+} > >diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c > >index 648d54ebb2..0d2e315395 100644 > >--- a/libavcodec/vvc/dsp.c > >+++ b/libavcodec/vvc/dsp.c > >@@ -123,6 +123,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int > bit_depth) > > > > #if ARCH_AARCH64 > > ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth); > >+#elif ARCH_RISCV > >+ ff_vvc_dsp_init_riscv(vvcdsp, bit_depth); > > #elif ARCH_X86 > > ff_vvc_dsp_init_x86(vvcdsp, bit_depth); > > #endif > >diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h > >index 0b49b97021..4933cca891 100644 > >--- a/libavcodec/vvc/dsp.h > >+++ b/libavcodec/vvc/dsp.h > >@@ -181,6 +181,7 @@ typedef struct VVCDSPContext { > > void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); > > > > void ff_vvc_dsp_init_aarch64(VVCDSPContext *hpc, const int bit_depth); > >+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth); > > void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); > > > > #endif /* AVCODEC_VVC_DSP_H */ > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
On Sunday, 18 August 2024 at 13:19:33 EEST, flow gg wrote: > Based on the test results for 49 different resolutions, most of them were > significantly slower. OK, thanks for checking. The heights are probably not large enough for this to work. I don't have any objection on the RISC-V side of this patchset (reviewed only, not tested).
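For readers following the algorithm discussion above, it may help to see what the 8-bit avg/w_avg paths have to compute, independently of whether the loops run row-wise (as in the patch below) or column-wise with strided loads (as suggested for narrow widths). The following is a minimal C sketch reconstructed from the shift/offset setup visible in the assembly (the `vnclipu` by 7 for avg, the `o0 + o1 + 1` / `shift - 1` scalar setup for w_avg, and the 128-element int16 source rows); the helper names and the standalone form are illustrative, not part of the patch.

```c
#include <stddef.h>
#include <stdint.h>

#define MAX_PB_SIZE 128   /* assumed int16_t source row stride (128*2 bytes in the asm) */

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* avg: (src0 + src1 + 64) >> 7, clamped to [0, 255].
 * The asm gets the "+64, >>7" from vnclipu.wi ..., 7 with vxrm=0
 * (round-to-nearest-up), after a vmax with zero to drop negative sums. */
static void avg_8_ref(uint8_t *dst, ptrdiff_t dst_stride,
                      const int16_t *src0, const int16_t *src1,
                      int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = clip_u8((src0[x] + src1[x] + 64) >> 7);
        dst  += dst_stride;
        src0 += MAX_PB_SIZE;
        src1 += MAX_PB_SIZE;
    }
}

/* w_avg: shift = denom + 7 (t6 in the asm),
 * offset = (o0 + o1 + 1) << (shift - 1) (t4 in the asm). */
static void w_avg_8_ref(uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *src0, const int16_t *src1,
                        int width, int height,
                        int denom, int w0, int w1, int o0, int o1)
{
    const int shift  = denom + 7;
    const int offset = (o0 + o1 + 1) << (shift - 1);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = clip_u8((src0[x] * w0 + src1[x] * w1 + offset) >> shift);
        dst  += dst_stride;
        src0 += MAX_PB_SIZE;
        src1 += MAX_PB_SIZE;
    }
}
```

The vectorized versions only change how these loops are traversed: the patch processes two 128-element-strided rows per iteration, while the h.264-style variant discussed above would instead gather many rows of a narrow block into one vector with strided (segment) loads.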
diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile new file mode 100644 index 0000000000..582b051579 --- /dev/null +++ b/libavcodec/riscv/vvc/Makefile @@ -0,0 +1,2 @@ +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S new file mode 100644 index 0000000000..10e1bd67ee --- /dev/null +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +.macro vsetvlstatic8 w, vlen + .if \w == 2 && \vlen == 128 + vsetivli zero, \w, e8, mf8, ta, ma + .elseif \w == 4 && \vlen == 128 + vsetivli zero, \w, e8, mf4, ta, ma + .elseif \w == 8 && \vlen == 128 + vsetivli zero, \w, e8, mf2, ta, ma + .elseif \w == 16 && \vlen == 128 + vsetivli zero, \w, e8, m1, ta, ma + .elseif \w == 32 && \vlen == 128 + li t0, \w + vsetvli zero, t0, e8, m2, ta, ma + .elseif \w <= 4 && \vlen == 256 + vsetivli zero, \w, e8, mf8, ta, ma + .elseif \w == 8 && \vlen == 256 + vsetivli zero, \w, e8, mf4, ta, ma + .elseif \w == 16 && \vlen == 256 + vsetivli zero, \w, e8, mf2, ta, ma + .elseif \w == 32 && \vlen == 256 + li t0, \w + vsetvli zero, t0, e8, m1, ta, ma + .elseif \w == 64 && \vlen == 256 + li t0, \w + vsetvli zero, t0, e8, m2, ta, ma + .else + li t0, \w + vsetvli zero, t0, e8, m4, ta, ma + .endif +.endm + +.macro vsetvlstatic16 w, vlen + .if \w == 2 && \vlen == 128 + vsetivli zero, \w, e16, mf4, ta, ma + .elseif \w == 4 && \vlen == 128 + vsetivli zero, \w, e16, mf2, ta, ma + .elseif \w == 8 && \vlen == 128 + vsetivli zero, \w, e16, m1, ta, ma + .elseif \w == 16 && \vlen == 128 + vsetivli zero, \w, e16, m2, ta, ma + .elseif \w == 32 && \vlen == 128 + li t0, \w + vsetvli zero, t0, e16, m4, ta, ma + .elseif \w <= 4 && \vlen == 256 + vsetivli zero, \w, e16, mf4, ta, ma + .elseif \w == 8 && \vlen == 256 + vsetivli zero, \w, e16, mf2, ta, ma + .elseif \w == 16 && \vlen == 256 + vsetivli zero, \w, e16, m1, ta, ma + .elseif \w == 32 && \vlen == 256 + li t0, \w + vsetvli zero, t0, e16, m2, ta, ma + .elseif \w == 64 && \vlen == 256 + li t0, \w + vsetvli zero, t0, e16, m4, ta, ma + .else + li t0, \w + vsetvli zero, t0, e16, m8, ta, ma + .endif +.endm + +.macro vsetvlstatic32 w, vlen + .if \w == 2 + vsetivli zero, \w, e32, mf2, ta, ma + .elseif \w == 4 && \vlen == 128 + vsetivli zero, \w, e32, m1, ta, ma + .elseif \w == 8 && \vlen == 128 + vsetivli zero, \w, e32, m2, ta, ma + .elseif \w == 16 && \vlen == 128 + vsetivli zero, \w, e32, m4, ta, ma + .elseif \w == 4 && \vlen == 256 + vsetivli zero, \w, e32, mf2, ta, ma + .elseif \w == 8 && \vlen == 256 + vsetivli zero, \w, e32, m1, ta, ma + 
.elseif \w == 16 && \vlen == 256 + vsetivli zero, \w, e32, m2, ta, ma + .elseif \w == 32 && \vlen == 256 + li t0, \w + vsetvli zero, t0, e32, m4, ta, ma + .else + li t0, \w + vsetvli zero, t0, e32, m8, ta, ma + .endif +.endm + +.macro avg w, vlen, id +\id\w\vlen: +.if \w < 128 + vsetvlstatic16 \w, \vlen + addi t0, a2, 128*2 + addi t1, a3, 128*2 + add t2, a0, a1 + vle16.v v0, (a2) + vle16.v v8, (a3) + addi a5, a5, -2 + vle16.v v16, (t0) + vle16.v v24, (t1) + vadd.vv v8, v8, v0 + vadd.vv v24, v24, v16 + vmax.vx v8, v8, zero + vmax.vx v24, v24, zero + vsetvlstatic8 \w, \vlen + addi a2, a2, 128*4 + vnclipu.wi v8, v8, 7 + vnclipu.wi v24, v24, 7 + addi a3, a3, 128*4 + vse8.v v8, (a0) + vse8.v v24, (t2) + sh1add a0, a1, a0 +.else + addi a5, a5, -1 + mv t1, a0 + mv t2, a2 + mv t3, a3 + mv t4, a4 +1: + vsetvli t0, a4, e16, m8, ta, ma + sub a4, a4, t0 + vle16.v v0, (a2) + vle16.v v8, (a3) + vadd.vv v8, v8, v0 + vmax.vx v8, v8, zero + vsetvli zero, zero, e8, m4, ta, ma + vnclipu.wi v8, v8, 7 + vse8.v v8, (a0) + sh1add a2, t0, a2 + sh1add a3, t0, a3 + add a0, a0, t0 + bnez a4, 1b + add a0, t1, a1 + addi a2, t2, 128*2 + addi a3, t3, 128*2 + mv a4, t4 +.endif + bnez a5, \id\w\vlen\()b + ret +.endm + + +.macro AVG_JMP_TABLE id, vlen +const jmp_table_\id\vlen + .4byte \id\()2\vlen\()f - jmp_table_\id\vlen + .4byte \id\()4\vlen\()f - jmp_table_\id\vlen + .4byte \id\()8\vlen\()f - jmp_table_\id\vlen + .4byte \id\()16\vlen\()f - jmp_table_\id\vlen + .4byte \id\()32\vlen\()f - jmp_table_\id\vlen + .4byte \id\()64\vlen\()f - jmp_table_\id\vlen + .4byte \id\()128\vlen\()f - jmp_table_\id\vlen +endconst +.endm + +.macro AVG_J vlen, id + clz t1, a4 + neg t1, t1 + lla t5, jmp_table_\id\vlen + sh2add t1, t1, t5 + lw t1, ((__riscv_xlen-2)<<2)(t1) + add t1, t1, t5 + jr t1 +.endm + +.macro func_avg vlen +func ff_vvc_avg_8_rvv_\vlen\(), zve32x + lpad 0 + AVG_JMP_TABLE 1, \vlen + csrwi vxrm, 0 + AVG_J \vlen, 1 + .irp w,2,4,8,16,32,64,128 + avg \w, \vlen, 1 + .endr +endfunc +.endm + +func_avg 128 +func_avg 256 + +#if (__riscv_xlen == 64) +.macro w_avg w, vlen, id +\id\w\vlen: +.if \w <= 32 || (\w == 64 && \vlen == 256) + vsetvlstatic16 \w, \vlen + addi t0, a2, 128*2 + addi t1, a3, 128*2 + vle16.v v0, (a2) + vle16.v v4, (a3) + addi a5, a5, -2 + vle16.v v8, (t0) + vle16.v v12, (t1) + vwmul.vx v16, v0, a7 + vwmul.vx v24, v8, a7 + vwmacc.vx v16, t3, v4 + vwmacc.vx v24, t3, v12 + vsetvlstatic32 \w, \vlen + add t2, a0, a1 + vadd.vx v16, v16, t4 + vadd.vx v24, v24, t4 + vsetvlstatic16 \w, \vlen + vnsrl.wx v16, v16, t6 + vnsrl.wx v24, v24, t6 + vmax.vx v16, v16, zero + vmax.vx v24, v24, zero + vsetvlstatic8 \w, \vlen + addi a2, a2, 128*4 + vnclipu.wi v16, v16, 0 + vnclipu.wi v24, v24, 0 + vse8.v v16, (a0) + addi a3, a3, 128*4 + vse8.v v24, (t2) + sh1add a0, a1, a0 +.else + addi a5, a5, -1 + mv t1, a0 + mv t2, a2 + mv t5, a3 + mv a6, a4 +1: + vsetvli t0, a4, e16, m4, ta, ma + sub a4, a4, t0 + vle16.v v0, (a2) + vle16.v v4, (a3) + vwmul.vx v16, v0, a7 + vwmacc.vx v16, t3, v4 + vsetvli zero, zero, e32, m8, ta, ma + vadd.vx v16, v16, t4 + vsetvli zero, zero, e16, m4, ta, ma + vnsrl.wx v16, v16, t6 + vmax.vx v16, v16, zero + vsetvli zero, zero, e8, m2, ta, ma + vnclipu.wi v16, v16, 0 + vse8.v v16, (a0) + sh1add a2, t0, a2 + sh1add a3, t0, a3 + add a0, a0, t0 + bnez a4, 1b + add a0, t1, a1 + addi a2, t2, 128*2 + addi a3, t5, 128*2 + mv a4, a6 +.endif + bnez a5, \id\w\vlen\()b + ret +.endm + +.macro func_w_avg vlen +func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x + lpad 0 + AVG_JMP_TABLE 2, \vlen + csrwi vxrm, 0 + addi t6, a6, 7 + ld t3, (sp) 
+ ld t4, 8(sp) + ld t5, 16(sp) + addi t4, t4, 1 // o0 + o1 + 1 + add t4, t4, t5 + addi t5, t6, -1 // shift - 1 + sll t4, t4, t5 + AVG_J \vlen, 2 + .irp w,2,4,8,16,32,64,128 + w_avg \w, \vlen, 2 + .endr +endfunc +.endm + +func_w_avg 128 +func_w_avg 256 +#endif diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c new file mode 100644 index 0000000000..9819a7c570 --- /dev/null +++ b/libavcodec/riscv/vvc/vvcdsp_init.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/riscv/cpu.h" +#include "libavcodec/vvc/dsp.h" + +#define bf(fn, bd, opt) fn##_##bd##_##opt + +#define AVG_PROTOTYPES(bd, opt) \ +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *src0, const int16_t *src1, int width, int height); \ +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *src0, const int16_t *src1, int width, int height, \ + int denom, int w0, int w1, int o0, int o1); + +AVG_PROTOTYPES(8, rvv_128) +AVG_PROTOTYPES(8, rvv_256) + +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd) +{ +#if HAVE_RVV + const int flags = av_get_cpu_flags(); + int vlenb = ff_get_rv_vlenb(); + + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) && + vlenb >= 32) { + switch (bd) { + case 8: + c->inter.avg = ff_vvc_avg_8_rvv_256; +# if (__riscv_xlen == 64) + c->inter.w_avg = ff_vvc_w_avg_8_rvv_256; +# endif + break; + default: + break; + } + } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) && + vlenb >= 16) { + switch (bd) { + case 8: + c->inter.avg = ff_vvc_avg_8_rvv_128; +# if (__riscv_xlen == 64) + c->inter.w_avg = ff_vvc_w_avg_8_rvv_128; +# endif + break; + default: + break; + } + } +#endif +} diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c index 648d54ebb2..0d2e315395 100644 --- a/libavcodec/vvc/dsp.c +++ b/libavcodec/vvc/dsp.c @@ -123,6 +123,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) #if ARCH_AARCH64 ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth); +#elif ARCH_RISCV + ff_vvc_dsp_init_riscv(vvcdsp, bit_depth); #elif ARCH_X86 ff_vvc_dsp_init_x86(vvcdsp, bit_depth); #endif diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h index 0b49b97021..4933cca891 100644 --- a/libavcodec/vvc/dsp.h +++ b/libavcodec/vvc/dsp.h @@ -181,6 +181,7 @@ typedef struct VVCDSPContext { void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); void ff_vvc_dsp_init_aarch64(VVCDSPContext *hpc, const int bit_depth); +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth); void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); #endif /* 
AVCODEC_VVC_DSP_H */
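One implementation detail worth spelling out from the assembly above: `AVG_J` selects the jump-table entry with a count-leading-zeros of the width, which works out to log2(width) - 1, so the widths 2, 4, ..., 128 map to entries 0..6 of `AVG_JMP_TABLE`. A small, self-contained C illustration of that index computation (the function name is made up for this example):

```c
#include <stdio.h>

/* Equivalent of AVG_J's index math: the asm computes -clz(width) on an
 * XLEN-wide register and loads at displacement (__riscv_xlen - 2) * 4,
 * i.e. entry = (XLEN - 2) - clz(width) = log2(width) - 1. */
static int avg_jmp_index(unsigned width)
{
    return (31 - __builtin_clz(width)) - 1;   /* 32-bit clz used here */
}

int main(void)
{
    for (unsigned w = 2; w <= 128; w <<= 1)
        printf("width %3u -> jump table entry %d\n", w, avg_jmp_index(w));
    return 0;
}
```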
From: sunyuechi <sunyuechi@iscas.ac.cn> C908 X60 avg_8_2x2_c : 1.2 1.0 avg_8_2x2_rvv_i32 : 0.7 0.7 avg_8_2x4_c : 2.0 2.2 avg_8_2x4_rvv_i32 : 1.2 1.2 avg_8_2x8_c : 3.7 4.0 avg_8_2x8_rvv_i32 : 1.7 1.5 avg_8_2x16_c : 7.2 7.7 avg_8_2x16_rvv_i32 : 3.0 2.7 avg_8_2x32_c : 14.2 15.2 avg_8_2x32_rvv_i32 : 5.5 5.0 avg_8_2x64_c : 51.0 43.7 avg_8_2x64_rvv_i32 : 39.2 29.7 avg_8_2x128_c : 100.5 79.2 avg_8_2x128_rvv_i32 : 79.7 68.2 avg_8_4x2_c : 1.7 2.0 avg_8_4x2_rvv_i32 : 1.0 0.7 avg_8_4x4_c : 3.5 3.7 avg_8_4x4_rvv_i32 : 1.2 1.2 avg_8_4x8_c : 6.7 7.0 avg_8_4x8_rvv_i32 : 1.7 1.5 avg_8_4x16_c : 13.5 14.0 avg_8_4x16_rvv_i32 : 3.0 2.7 avg_8_4x32_c : 26.2 27.7 avg_8_4x32_rvv_i32 : 5.5 4.7 avg_8_4x64_c : 73.0 73.7 avg_8_4x64_rvv_i32 : 39.0 32.5 avg_8_4x128_c : 143.0 137.2 avg_8_4x128_rvv_i32 : 72.7 68.0 avg_8_8x2_c : 3.5 3.5 avg_8_8x2_rvv_i32 : 1.0 0.7 avg_8_8x4_c : 6.2 6.5 avg_8_8x4_rvv_i32 : 1.5 1.0 avg_8_8x8_c : 12.7 13.2 avg_8_8x8_rvv_i32 : 2.0 1.5 avg_8_8x16_c : 25.0 26.5 avg_8_8x16_rvv_i32 : 3.2 2.7 avg_8_8x32_c : 50.0 52.7 avg_8_8x32_rvv_i32 : 6.2 5.0 avg_8_8x64_c : 118.7 122.5 avg_8_8x64_rvv_i32 : 40.2 31.5 avg_8_8x128_c : 236.7 220.2 avg_8_8x128_rvv_i32 : 85.2 67.7 avg_8_16x2_c : 6.2 6.7 avg_8_16x2_rvv_i32 : 1.2 0.7 avg_8_16x4_c : 12.5 13.0 avg_8_16x4_rvv_i32 : 1.7 1.0 avg_8_16x8_c : 24.5 26.0 avg_8_16x8_rvv_i32 : 3.0 1.7 avg_8_16x16_c : 49.0 51.5 avg_8_16x16_rvv_i32 : 5.5 3.0 avg_8_16x32_c : 97.5 102.5 avg_8_16x32_rvv_i32 : 10.5 5.5 avg_8_16x64_c : 213.7 222.0 avg_8_16x64_rvv_i32 : 48.5 34.2 avg_8_16x128_c : 434.7 420.0 avg_8_16x128_rvv_i32 : 97.7 74.0 avg_8_32x2_c : 12.2 12.7 avg_8_32x2_rvv_i32 : 1.5 1.0 avg_8_32x4_c : 24.5 25.5 avg_8_32x4_rvv_i32 : 3.0 1.7 avg_8_32x8_c : 48.5 50.7 avg_8_32x8_rvv_i32 : 5.2 2.7 avg_8_32x16_c : 96.7 101.2 avg_8_32x16_rvv_i32 : 10.2 5.0 avg_8_32x32_c : 192.7 202.2 avg_8_32x32_rvv_i32 : 19.7 9.5 avg_8_32x64_c : 427.5 426.5 avg_8_32x64_rvv_i32 : 64.2 18.2 avg_8_32x128_c : 816.5 821.0 avg_8_32x128_rvv_i32 : 135.2 75.5 avg_8_64x2_c : 24.0 25.2 avg_8_64x2_rvv_i32 : 2.7 1.5 avg_8_64x4_c : 48.2 50.5 avg_8_64x4_rvv_i32 : 5.0 2.7 avg_8_64x8_c : 96.0 100.7 avg_8_64x8_rvv_i32 : 9.7 4.5 avg_8_64x16_c : 207.7 201.2 avg_8_64x16_rvv_i32 : 19.0 9.0 avg_8_64x32_c : 383.2 402.0 avg_8_64x32_rvv_i32 : 37.5 17.5 avg_8_64x64_c : 837.2 828.7 avg_8_64x64_rvv_i32 : 84.7 35.5 avg_8_64x128_c : 1640.7 1640.2 avg_8_64x128_rvv_i32 : 206.0 153.0 avg_8_128x2_c : 48.7 51.0 avg_8_128x2_rvv_i32 : 5.2 2.7 avg_8_128x4_c : 96.7 101.5 avg_8_128x4_rvv_i32 : 10.2 5.0 avg_8_128x8_c : 192.2 202.0 avg_8_128x8_rvv_i32 : 19.7 9.2 avg_8_128x16_c : 400.7 403.2 avg_8_128x16_rvv_i32 : 38.7 18.5 avg_8_128x32_c : 786.7 805.7 avg_8_128x32_rvv_i32 : 77.0 36.2 avg_8_128x64_c : 1615.5 1655.5 avg_8_128x64_rvv_i32 : 189.7 80.7 avg_8_128x128_c : 3182.0 3238.0 avg_8_128x128_rvv_i32 : 397.5 308.5 w_avg_8_2x2_c : 1.7 1.2 w_avg_8_2x2_rvv_i32 : 1.2 1.0 w_avg_8_2x4_c : 2.7 2.7 w_avg_8_2x4_rvv_i32 : 1.7 1.5 w_avg_8_2x8_c : 21.7 4.7 w_avg_8_2x8_rvv_i32 : 2.7 2.5 w_avg_8_2x16_c : 9.5 9.2 w_avg_8_2x16_rvv_i32 : 4.7 4.2 w_avg_8_2x32_c : 19.0 18.7 w_avg_8_2x32_rvv_i32 : 9.0 8.0 w_avg_8_2x64_c : 62.0 50.2 w_avg_8_2x64_rvv_i32 : 47.7 33.5 w_avg_8_2x128_c : 116.7 87.7 w_avg_8_2x128_rvv_i32 : 80.0 69.5 w_avg_8_4x2_c : 2.5 2.5 w_avg_8_4x2_rvv_i32 : 1.2 1.0 w_avg_8_4x4_c : 4.7 4.5 w_avg_8_4x4_rvv_i32 : 1.7 1.7 w_avg_8_4x8_c : 9.0 8.7 w_avg_8_4x8_rvv_i32 : 2.7 2.5 w_avg_8_4x16_c : 17.7 17.5 w_avg_8_4x16_rvv_i32 : 4.7 4.2 w_avg_8_4x32_c : 35.0 35.0 w_avg_8_4x32_rvv_i32 : 9.0 8.0 w_avg_8_4x64_c : 100.5 84.5 w_avg_8_4x64_rvv_i32 : 42.2 33.7 
w_avg_8_4x128_c : 203.5 151.2 w_avg_8_4x128_rvv_i32 : 83.0 69.5 w_avg_8_8x2_c : 4.5 4.5 w_avg_8_8x2_rvv_i32 : 1.2 1.2 w_avg_8_8x4_c : 8.7 8.7 w_avg_8_8x4_rvv_i32 : 2.0 1.7 w_avg_8_8x8_c : 17.0 17.0 w_avg_8_8x8_rvv_i32 : 3.2 2.5 w_avg_8_8x16_c : 34.0 33.5 w_avg_8_8x16_rvv_i32 : 5.5 4.2 w_avg_8_8x32_c : 86.0 67.5 w_avg_8_8x32_rvv_i32 : 10.5 8.0 w_avg_8_8x64_c : 187.2 149.5 w_avg_8_8x64_rvv_i32 : 45.0 35.5 w_avg_8_8x128_c : 342.7 290.0 w_avg_8_8x128_rvv_i32 : 108.7 70.2 w_avg_8_16x2_c : 8.5 8.2 w_avg_8_16x2_rvv_i32 : 2.0 1.2 w_avg_8_16x4_c : 16.7 16.7 w_avg_8_16x4_rvv_i32 : 3.0 1.7 w_avg_8_16x8_c : 33.2 33.5 w_avg_8_16x8_rvv_i32 : 5.5 3.0 w_avg_8_16x16_c : 66.2 66.7 w_avg_8_16x16_rvv_i32 : 10.5 5.0 w_avg_8_16x32_c : 132.5 131.0 w_avg_8_16x32_rvv_i32 : 20.0 9.7 w_avg_8_16x64_c : 340.0 283.5 w_avg_8_16x64_rvv_i32 : 60.5 37.2 w_avg_8_16x128_c : 641.2 597.5 w_avg_8_16x128_rvv_i32 : 118.7 77.7 w_avg_8_32x2_c : 16.5 16.7 w_avg_8_32x2_rvv_i32 : 3.2 1.7 w_avg_8_32x4_c : 33.2 33.2 w_avg_8_32x4_rvv_i32 : 5.5 2.7 w_avg_8_32x8_c : 66.0 62.5 w_avg_8_32x8_rvv_i32 : 10.5 5.0 w_avg_8_32x16_c : 131.5 132.0 w_avg_8_32x16_rvv_i32 : 20.2 9.5 w_avg_8_32x32_c : 261.7 272.0 w_avg_8_32x32_rvv_i32 : 39.7 18.0 w_avg_8_32x64_c : 575.2 545.5 w_avg_8_32x64_rvv_i32 : 105.5 58.7 w_avg_8_32x128_c : 1154.2 1088.0 w_avg_8_32x128_rvv_i32 : 207.0 98.0 w_avg_8_64x2_c : 33.0 33.0 w_avg_8_64x2_rvv_i32 : 6.2 2.7 w_avg_8_64x4_c : 65.5 66.0 w_avg_8_64x4_rvv_i32 : 11.5 5.0 w_avg_8_64x8_c : 131.2 132.5 w_avg_8_64x8_rvv_i32 : 22.5 9.5 w_avg_8_64x16_c : 268.2 262.5 w_avg_8_64x16_rvv_i32 : 44.2 18.0 w_avg_8_64x32_c : 561.5 528.7 w_avg_8_64x32_rvv_i32 : 88.0 35.2 w_avg_8_64x64_c : 1136.2 1124.0 w_avg_8_64x64_rvv_i32 : 222.0 82.2 w_avg_8_64x128_c : 2345.0 2312.7 w_avg_8_64x128_rvv_i32 : 423.0 190.5 w_avg_8_128x2_c : 65.7 66.5 w_avg_8_128x2_rvv_i32 : 11.2 5.5 w_avg_8_128x4_c : 131.2 132.2 w_avg_8_128x4_rvv_i32 : 22.0 10.2 w_avg_8_128x8_c : 263.5 312.0 w_avg_8_128x8_rvv_i32 : 43.2 19.7 w_avg_8_128x16_c : 528.7 526.2 w_avg_8_128x16_rvv_i32 : 85.5 39.5 w_avg_8_128x32_c : 1067.7 1062.7 w_avg_8_128x32_rvv_i32 : 171.7 78.2 w_avg_8_128x64_c : 2234.7 2168.7 w_avg_8_128x64_rvv_i32 : 400.0 159.0 w_avg_8_128x128_c : 4752.5 4295.0 w_avg_8_128x128_rvv_i32 : 757.7 365.5 --- libavcodec/riscv/vvc/Makefile | 2 + libavcodec/riscv/vvc/vvc_mc_rvv.S | 287 +++++++++++++++++++++++++++++ libavcodec/riscv/vvc/vvcdsp_init.c | 72 ++++++++ libavcodec/vvc/dsp.c | 2 + libavcodec/vvc/dsp.h | 1 + 5 files changed, 364 insertions(+) create mode 100644 libavcodec/riscv/vvc/Makefile create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c