Message ID | 20240715055039.592571-1-jdek@itanimul.li |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v3] avcodec/riscv: add h264 dc idct rvv | expand |

Context | Check | Description |
---|---|---|
andriy/configure_x86 | warning | Failed to apply patch |
Le 15 juillet 2024 08:50:38 GMT+03:00, "J. Dekker" <jdek@itanimul.li> a écrit : >checkasm: bench runs 131072 (1 << 17) >h264_idct4_add_dc_8bpp_c: 1.5 >h264_idct4_add_dc_8bpp_rvv_i64: 0.7 >h264_idct4_add_dc_9bpp_c: 1.5 >h264_idct4_add_dc_9bpp_rvv_i64: 0.7 >h264_idct4_add_dc_10bpp_c: 1.5 >h264_idct4_add_dc_10bpp_rvv_i64: 0.7 >h264_idct4_add_dc_12bpp_c: 1.2 >h264_idct4_add_dc_12bpp_rvv_i64: 0.7 >h264_idct4_add_dc_14bpp_c: 1.2 >h264_idct4_add_dc_14bpp_rvv_i64: 0.7 >h264_idct8_add_dc_8bpp_c: 5.2 >h264_idct8_add_dc_8bpp_rvv_i64: 1.5 >h264_idct8_add_dc_9bpp_c: 5.5 >h264_idct8_add_dc_9bpp_rvv_i64: 1.2 >h264_idct8_add_dc_10bpp_c: 5.5 >h264_idct8_add_dc_10bpp_rvv_i64: 1.2 >h264_idct8_add_dc_12bpp_c: 4.2 >h264_idct8_add_dc_12bpp_rvv_i64: 1.2 >h264_idct8_add_dc_14bpp_c: 4.2 >h264_idct8_add_dc_14bpp_rvv_i64: 1.2 > >Signed-off-by: J. Dekker <jdek@itanimul.li> >--- > libavcodec/riscv/Makefile | 1 + > libavcodec/riscv/h264dsp_init.c | 44 ++++++++++- > libavcodec/riscv/h264dsp_rvv.S | 130 ++++++++++++++++++++++++++++++++ > 3 files changed, 172 insertions(+), 3 deletions(-) > create mode 100644 libavcodec/riscv/h264dsp_rvv.S > > As Remi mentioned, high bit-depth 4x4 could be done in the same way as low > bit-depth. I've left it with the high bit-depth intentionally since this eases > templating. Use of segments removed and repeated instructions changed to use > m8. 
> >diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile >index c180223141..a1510e8c6e 100644 >--- a/libavcodec/riscv/Makefile >+++ b/libavcodec/riscv/Makefile >@@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o > OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o > RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o > OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o >+RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o > OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o > RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o > OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o >diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c >index dbbf3db400..3256199303 100644 >--- a/libavcodec/riscv/h264dsp_init.c >+++ b/libavcodec/riscv/h264dsp_init.c >@@ -1,4 +1,5 @@ > /* >+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li> > * Copyright © 2024 Rémi Denis-Courmont. > * > * This file is part of FFmpeg. >@@ -24,22 +25,59 @@ > > #include "libavutil/attributes.h" > #include "libavutil/cpu.h" >+#include "libavutil/riscv/cpu.h" > #include "libavcodec/h264dsp.h" > > extern int ff_startcode_find_candidate_rvb(const uint8_t *, int); > extern int ff_startcode_find_candidate_rvv(const uint8_t *, int); >+void ff_h264_idct4_dc_add_8_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct8_dc_add_8_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct4_dc_add_9_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct8_dc_add_9_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct4_dc_add_10_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct8_dc_add_10_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct4_dc_add_12_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct8_dc_add_12_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct4_dc_add_14_rvv(uint8_t *, int16_t *, int); >+void ff_h264_idct8_dc_add_14_rvv(uint8_t *, int16_t *, int); > >-av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, >+av_cold 
void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth, > const int chroma_format_idc) > { > #if HAVE_RV > int flags = av_get_cpu_flags(); > > if (flags & AV_CPU_FLAG_RVB_BASIC) >- dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb; >+ c->startcode_find_candidate = ff_startcode_find_candidate_rvb; > # if HAVE_RVV > if (flags & AV_CPU_FLAG_RVV_I32) >- dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv; >+ c->startcode_find_candidate = ff_startcode_find_candidate_rvv; > # endif >+ if (ff_rv_vlen_least(128)) { >+ switch(bit_depth) { >+ case 8: >+ if (flags & AV_CPU_FLAG_RVV_I64) { >+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv; >+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv; >+ } >+ break; >+ case 9: >+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv; >+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv; >+ break; >+ case 10: >+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv; >+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv; >+ break; >+ case 12: >+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv; >+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv; >+ break; >+ case 14: >+ c->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv; >+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv; >+ break; >+ } >+ } > #endif > } >diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S >new file mode 100644 >index 0000000000..0e6c2e49e9 >--- /dev/null >+++ b/libavcodec/riscv/h264dsp_rvv.S >@@ -0,0 +1,130 @@ >+/* >+ * SPDX-License-Identifier: BSD-2-Clause >+ * >+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li> >+ * >+ * Redistribution and use in source and binary forms, with or without >+ * modification, are permitted provided that the following conditions >+ * are met: >+ * 1. Redistributions of source code must retain the above copyright >+ * notice, this list of conditions and the following disclaimer. >+ * 2. 
Redistributions in binary form must reproduce the above copyright >+ * notice, this list of conditions and the following disclaimer in the >+ * documentation and/or other materials provided with the distribution. >+ * >+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" >+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE >+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE >+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE >+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR >+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF >+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS >+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN >+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) >+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE >+ * POSSIBILITY OF SUCH DAMAGE. >+ */ >+ >+#include "libavutil/riscv/asm.S" >+ >+.macro idct_dc_add8 width >+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba >+.if \width == 8 >+ vsetivli zero, \width, e16, m1, ta, ma >+.else >+ vsetivli zero, \width, e16, mf2, ta, ma >+.endif >+ lh a3, 0(a1) >+ addi a3, a3, 32 >+ srai a3, a3, 6 >+ sh zero, 0(a1) >+.if \width == 8 >+ vlse64.v v24, (a0), a2 >+ vsetvli t0, zero, e16, m8, ta, ma >+.else >+ vlse32.v v24, (a0), a2 >+ vsetvli t0, zero, e16, m4, ta, ma >+.endif >+ vzext.vf2 v0, v24 >+ vadd.vx v0, v0, a3 >+ vmax.vx v0, v0, zero >+.if \width == 8 >+ vsetvli zero, zero, e8, m4, ta, ma >+.else >+ vsetvli zero, zero, e8, m2, ta, ma >+.endif >+ vnclipu.wi v24, v0, 0 >+ vsetivli zero, \width, e8, m1, ta, ma Should work with mf2/mf4, though I'm not sure if it makes any significant performance difference. Or alternatively, `e64, m4/m2` would match the LMUL above, but that is purely cosmetic since SEW is not used here. 
No objections but it looks like this needs rebasing. >+.if \width == 8 >+ vsse64.v v24, (a0), a2 >+.else >+ vsse32.v v24, (a0), a2 >+.endif >+ ret >+endfunc >+.endm >+ >+idct_dc_add8 4 >+idct_dc_add8 8 >+ >+.macro idct_dc_add width >+func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba >+ vsetivli zero, \width, e16, m1, ta, ma >+ lw a3, 0(a1) >+ addi a3, a3, 32 >+ srai a3, a3, 6 >+ sw zero, 0(a1) >+ add t4, a0, a2 >+ sh1add t5, a2, a0 >+ sh1add t6, a2, t4 >+.if \width == 8 >+ sh2add t0, a2, a0 >+ sh2add t1, a2, t4 >+ sh2add t2, a2, t5 >+ sh2add t3, a2, t6 >+.endif >+ vle16.v v0, (a0) >+ vle16.v v1, (t4) >+ vle16.v v2, (t5) >+ vle16.v v3, (t6) >+.if \width == 8 >+ vle16.v v4, (t0) >+ vle16.v v5, (t1) >+ vle16.v v6, (t2) >+ vle16.v v7, (t3) >+ vsetvli a6, zero, e16, m8, ta, ma >+.else >+ vsetvli a6, zero, e16, m4, ta, ma >+.endif >+ vadd.vx v0, v0, a3 >+ vmax.vx v0, v0, zero >+ vmin.vx v0, v0, a5 >+ vsetivli zero, \width, e16, m1, ta, ma >+ vse16.v v0, (a0) >+ vse16.v v1, (t4) >+ vse16.v v2, (t5) >+ vse16.v v3, (t6) >+.if \width == 8 >+ vse16.v v4, (t0) >+ vse16.v v5, (t1) >+ vse16.v v6, (t2) >+ vse16.v v7, (t3) >+.endif >+ ret >+endfunc >+.endm >+ >+idct_dc_add 4 >+idct_dc_add 8 >+ >+.irp depth,9,10,12,14 >+func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x >+ li a5, (1 << \depth) - 1 >+ j ff_h264_idct4_dc_add_16_rvv >+endfunc >+ >+func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x >+ li a5, (1 << \depth) - 1 >+ j ff_h264_idct8_dc_add_16_rvv >+endfunc >+.endr >-- >2.44.1 > >_______________________________________________ >ffmpeg-devel mailing list >ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index c180223141..a1510e8c6e 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c index dbbf3db400..3256199303 100644 --- a/libavcodec/riscv/h264dsp_init.c +++ b/libavcodec/riscv/h264dsp_init.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2024 J. Dekker <jdek@itanimul.li> * Copyright © 2024 Rémi Denis-Courmont. * * This file is part of FFmpeg. @@ -24,22 +25,59 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/riscv/cpu.h" #include "libavcodec/h264dsp.h" extern int ff_startcode_find_candidate_rvb(const uint8_t *, int); extern int ff_startcode_find_candidate_rvv(const uint8_t *, int); +void ff_h264_idct4_dc_add_8_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct8_dc_add_8_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct4_dc_add_9_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct8_dc_add_9_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct4_dc_add_10_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct8_dc_add_10_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct4_dc_add_12_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct8_dc_add_12_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct4_dc_add_14_rvv(uint8_t *, int16_t *, int); +void ff_h264_idct8_dc_add_14_rvv(uint8_t *, int16_t *, int); -av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth, 
const int chroma_format_idc) { #if HAVE_RV int flags = av_get_cpu_flags(); if (flags & AV_CPU_FLAG_RVB_BASIC) - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb; + c->startcode_find_candidate = ff_startcode_find_candidate_rvb; # if HAVE_RVV if (flags & AV_CPU_FLAG_RVV_I32) - dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv; + c->startcode_find_candidate = ff_startcode_find_candidate_rvv; # endif + if (ff_rv_vlen_least(128)) { + switch(bit_depth) { + case 8: + if (flags & AV_CPU_FLAG_RVV_I64) { + c->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv; + } + break; + case 9: + c->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv; + break; + case 10: + c->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv; + break; + case 12: + c->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv; + break; + case 14: + c->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv; + break; + } + } #endif } diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S new file mode 100644 index 0000000000..0e6c2e49e9 --- /dev/null +++ b/libavcodec/riscv/h264dsp_rvv.S @@ -0,0 +1,130 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 J. Dekker <jdek@itanimul.li> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "libavutil/riscv/asm.S" + +.macro idct_dc_add8 width +func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba +.if \width == 8 + vsetivli zero, \width, e16, m1, ta, ma +.else + vsetivli zero, \width, e16, mf2, ta, ma +.endif + lh a3, 0(a1) + addi a3, a3, 32 + srai a3, a3, 6 + sh zero, 0(a1) +.if \width == 8 + vlse64.v v24, (a0), a2 + vsetvli t0, zero, e16, m8, ta, ma +.else + vlse32.v v24, (a0), a2 + vsetvli t0, zero, e16, m4, ta, ma +.endif + vzext.vf2 v0, v24 + vadd.vx v0, v0, a3 + vmax.vx v0, v0, zero +.if \width == 8 + vsetvli zero, zero, e8, m4, ta, ma +.else + vsetvli zero, zero, e8, m2, ta, ma +.endif + vnclipu.wi v24, v0, 0 + vsetivli zero, \width, e8, m1, ta, ma +.if \width == 8 + vsse64.v v24, (a0), a2 +.else + vsse32.v v24, (a0), a2 +.endif + ret +endfunc +.endm + +idct_dc_add8 4 +idct_dc_add8 8 + +.macro idct_dc_add width +func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba + vsetivli zero, \width, e16, m1, ta, ma + lw a3, 0(a1) + addi a3, a3, 32 + srai a3, a3, 6 + sw zero, 0(a1) + add t4, a0, a2 + sh1add t5, a2, a0 + sh1add t6, a2, t4 +.if \width == 8 + sh2add t0, a2, a0 + sh2add t1, a2, t4 + sh2add t2, a2, t5 + sh2add t3, a2, t6 +.endif + vle16.v 
v0, (a0) + vle16.v v1, (t4) + vle16.v v2, (t5) + vle16.v v3, (t6) +.if \width == 8 + vle16.v v4, (t0) + vle16.v v5, (t1) + vle16.v v6, (t2) + vle16.v v7, (t3) + vsetvli a6, zero, e16, m8, ta, ma +.else + vsetvli a6, zero, e16, m4, ta, ma +.endif + vadd.vx v0, v0, a3 + vmax.vx v0, v0, zero + vmin.vx v0, v0, a5 + vsetivli zero, \width, e16, m1, ta, ma + vse16.v v0, (a0) + vse16.v v1, (t4) + vse16.v v2, (t5) + vse16.v v3, (t6) +.if \width == 8 + vse16.v v4, (t0) + vse16.v v5, (t1) + vse16.v v6, (t2) + vse16.v v7, (t3) +.endif + ret +endfunc +.endm + +idct_dc_add 4 +idct_dc_add 8 + +.irp depth,9,10,12,14 +func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x + li a5, (1 << \depth) - 1 + j ff_h264_idct4_dc_add_16_rvv +endfunc + +func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x + li a5, (1 << \depth) - 1 + j ff_h264_idct8_dc_add_16_rvv +endfunc +.endr
checkasm: bench runs 131072 (1 << 17) h264_idct4_add_dc_8bpp_c: 1.5 h264_idct4_add_dc_8bpp_rvv_i64: 0.7 h264_idct4_add_dc_9bpp_c: 1.5 h264_idct4_add_dc_9bpp_rvv_i64: 0.7 h264_idct4_add_dc_10bpp_c: 1.5 h264_idct4_add_dc_10bpp_rvv_i64: 0.7 h264_idct4_add_dc_12bpp_c: 1.2 h264_idct4_add_dc_12bpp_rvv_i64: 0.7 h264_idct4_add_dc_14bpp_c: 1.2 h264_idct4_add_dc_14bpp_rvv_i64: 0.7 h264_idct8_add_dc_8bpp_c: 5.2 h264_idct8_add_dc_8bpp_rvv_i64: 1.5 h264_idct8_add_dc_9bpp_c: 5.5 h264_idct8_add_dc_9bpp_rvv_i64: 1.2 h264_idct8_add_dc_10bpp_c: 5.5 h264_idct8_add_dc_10bpp_rvv_i64: 1.2 h264_idct8_add_dc_12bpp_c: 4.2 h264_idct8_add_dc_12bpp_rvv_i64: 1.2 h264_idct8_add_dc_14bpp_c: 4.2 h264_idct8_add_dc_14bpp_rvv_i64: 1.2 Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/h264dsp_init.c | 44 ++++++++++- libavcodec/riscv/h264dsp_rvv.S | 130 ++++++++++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 3 deletions(-) create mode 100644 libavcodec/riscv/h264dsp_rvv.S As Remi mentioned, the high bit-depth 4x4 case could be done in the same way as the low bit-depth one. I have intentionally left it in the high bit-depth code path, since this eases templating. The use of segments has been removed, and repeated instructions have been changed to use m8.