diff mbox series

[FFmpeg-devel,v2] avcodec/riscv: add h264 dc idct rvv

Message ID 20240703104730.883009-1-jdek@itanimul.li
State New
Headers show
Series [FFmpeg-devel,v2] avcodec/riscv: add h264 dc idct rvv | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 fail Make fate failed
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

J. Dekker July 3, 2024, 10:47 a.m. UTC
checkasm: bench runs 131072 (1 << 17)
h264_idct4_add_dc_8bpp_c: 1.5
h264_idct4_add_dc_8bpp_rvv_i64: 0.7
h264_idct4_add_dc_9bpp_c: 1.5
h264_idct4_add_dc_9bpp_rvv_i64: 0.7
h264_idct4_add_dc_10bpp_c: 1.5
h264_idct4_add_dc_10bpp_rvv_i64: 0.7
h264_idct4_add_dc_12bpp_c: 1.2
h264_idct4_add_dc_12bpp_rvv_i64: 0.7
h264_idct4_add_dc_14bpp_c: 1.2
h264_idct4_add_dc_14bpp_rvv_i64: 0.7
h264_idct8_add_dc_8bpp_c: 5.2
h264_idct8_add_dc_8bpp_rvv_i64: 1.5
h264_idct8_add_dc_9bpp_c: 5.5
h264_idct8_add_dc_9bpp_rvv_i64: 1.2
h264_idct8_add_dc_10bpp_c: 5.5
h264_idct8_add_dc_10bpp_rvv_i64: 1.2
h264_idct8_add_dc_12bpp_c: 4.2
h264_idct8_add_dc_12bpp_rvv_i64: 1.2
h264_idct8_add_dc_14bpp_c: 4.2
h264_idct8_add_dc_14bpp_rvv_i64: 1.2

Signed-off-by: J. Dekker <jdek@itanimul.li>
---

 rdcycle always returns 0 on my board, clock_gettime() seems as noisy as
 rdtime (just with bigger numbers).

 libavcodec/riscv/Makefile       |   1 +
 libavcodec/riscv/h264dsp_init.c |  42 +++++++-
 libavcodec/riscv/h264dsp_rvv.S  | 176 ++++++++++++++++++++++++++++++++
 3 files changed, 216 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/riscv/h264dsp_rvv.S

Comments

Rémi Denis-Courmont July 3, 2024, 3:13 p.m. UTC | #1
Le keskiviikkona 3. heinäkuuta 2024, 13.47.29 EEST J. Dekker a écrit :
> checkasm: bench runs 131072 (1 << 17)
> h264_idct4_add_dc_8bpp_c: 1.5
> h264_idct4_add_dc_8bpp_rvv_i64: 0.7
> h264_idct4_add_dc_9bpp_c: 1.5
> h264_idct4_add_dc_9bpp_rvv_i64: 0.7
> h264_idct4_add_dc_10bpp_c: 1.5
> h264_idct4_add_dc_10bpp_rvv_i64: 0.7
> h264_idct4_add_dc_12bpp_c: 1.2
> h264_idct4_add_dc_12bpp_rvv_i64: 0.7
> h264_idct4_add_dc_14bpp_c: 1.2
> h264_idct4_add_dc_14bpp_rvv_i64: 0.7
> h264_idct8_add_dc_8bpp_c: 5.2
> h264_idct8_add_dc_8bpp_rvv_i64: 1.5
> h264_idct8_add_dc_9bpp_c: 5.5
> h264_idct8_add_dc_9bpp_rvv_i64: 1.2
> h264_idct8_add_dc_10bpp_c: 5.5
> h264_idct8_add_dc_10bpp_rvv_i64: 1.2
> h264_idct8_add_dc_12bpp_c: 4.2
> h264_idct8_add_dc_12bpp_rvv_i64: 1.2
> h264_idct8_add_dc_14bpp_c: 4.2
> h264_idct8_add_dc_14bpp_rvv_i64: 1.2
> 
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> 
>  rdcycle always returns 0 on my board, clock_gettime() seems as noisy as
>  rdtime (just with bigger numbers).

On K230? Odd. Maybe vendor made some updates in later builds.

>  libavcodec/riscv/Makefile       |   1 +
>  libavcodec/riscv/h264dsp_init.c |  42 +++++++-
>  libavcodec/riscv/h264dsp_rvv.S  | 176 ++++++++++++++++++++++++++++++++
>  3 files changed, 216 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/riscv/h264dsp_rvv.S
> 
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index c180223141..a1510e8c6e 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
>  OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
>  RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
>  OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
> +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
>  OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
>  RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
>  OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
> diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
> index dbbf3db400..8c77303ec6 100644
> --- a/libavcodec/riscv/h264dsp_init.c
> +++ b/libavcodec/riscv/h264dsp_init.c
> @@ -1,4 +1,5 @@
>  /*
> + * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
>   * Copyright © 2024 Rémi Denis-Courmont.
>   *
>   * This file is part of FFmpeg.
> @@ -24,22 +25,57 @@
> 
>  #include "libavutil/attributes.h"
>  #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
>  #include "libavcodec/h264dsp.h"
> 
>  extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
>  extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
> +void ff_h264_idct4_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct4_dc_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
> +void ff_h264_idct8_dc_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
> 
> -av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
> +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth,
>                                     const int chroma_format_idc)
>  {
>  #if HAVE_RV
>      int flags = av_get_cpu_flags();
> 
>      if (flags & AV_CPU_FLAG_RVB_BASIC)
> -        dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> +        c->startcode_find_candidate = ff_startcode_find_candidate_rvb;
>  # if HAVE_RVV
>      if (flags & AV_CPU_FLAG_RVV_I32)
> -        dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> +        c->startcode_find_candidate = ff_startcode_find_candidate_rvv;
>  # endif
> +    if ((flags & AV_CPU_FLAG_RVV_I64) && ff_rv_vlen_least(16)) {

The assembler below does not seem to require 64-bit elements for anything?
Also ff_rv_vlen_least() expects bits, not bytes.

> +        switch(bit_depth) {
> +        case 8:
> +            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_8_rvv;
> +            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
> +            break;
> +        case 9:
> +            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_9_rvv;
> +            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
> +            break;
> +        case 10:
> +            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_10_rvv;
> +            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
> +            break;
> +        case 12:
> +            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_12_rvv;
> +            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
> +            break;
> +        case 14:
> +            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_14_rvv;
> +            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
> +            break;
> +       }
> +    }
>  #endif
>  }
> diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
> new file mode 100644
> index 0000000000..57f0433f7c
> --- /dev/null
> +++ b/libavcodec/riscv/h264dsp_rvv.S
> @@ -0,0 +1,176 @@
> +/*
> + * SPDX-License-Identifier: BSD-2-Clause
> + *
> + * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro idct_dc_add8 width
> +func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
> +        vsetivli        zero, \width, e16, m1, ta, ma

mf2 should be faster if \width == 4.

> +        lh              a3, 0(a1)
> +        addi            a3, a3, 32
> +        srai            a3, a3, 6
> +        sh              zero, 0(a1)
> +.if \width == 8
> +        vlsseg8e8.v     v24, (a0), a2
> +.else
> +        vlsseg4e8.v     v24, (a0), a2
> +.endif

You could probably use vlse64.v or vlse32.v here, treating each row as an 
element - and respectively vsse{64,32}.v at the end. You can then load a whole 
8x8 or 4x4 matrix in a single vector group.

This should work fine given that this function does not need to identify rows 
or columns. This is probably faster than using segments. It would also avoid 
repeating each instruction 8 or 4 times below.

That should also work for the 16-bit 4x4 function. Unfortunately, it won't 
work for 16-bit 8x8, as vlse128.v does not exist.

> +        vzext.vf2       v0, v24
> +        vzext.vf2       v2, v25
> +        vzext.vf2       v4, v26
> +        vzext.vf2       v6, v27
> +.if \width == 8
> +        vzext.vf2       v10, v28
> +        vzext.vf2       v12, v29
> +        vzext.vf2       v14, v30
> +        vzext.vf2       v16, v31
> +.endif
> +        vadd.vx         v0, v0, a3
> +        vadd.vx         v2, v2, a3
> +        vadd.vx         v4, v4, a3
> +        vadd.vx         v6, v6, a3
> +.if \width == 8
> +        vadd.vx         v10, v10, a3
> +        vadd.vx         v12, v12, a3
> +        vadd.vx         v14, v14, a3
> +        vadd.vx         v16, v16, a3
> +.endif
> +        vmax.vx         v0, v0, zero
> +        vmax.vx         v2, v2, zero
> +        vmax.vx         v4, v4, zero
> +        vmax.vx         v6, v6, zero
> +.if \width == 8
> +        vmax.vx         v10, v10, zero
> +        vmax.vx         v12, v12, zero
> +        vmax.vx         v14, v14, zero
> +        vmax.vx         v16, v16, zero
> +.endif
> +        vsetvli         zero, zero, e8, mf2, ta, ma
> +        vnclipu.wi      v24, v0, 0
> +        vnclipu.wi      v25, v2, 0
> +        vnclipu.wi      v26, v4, 0
> +        vnclipu.wi      v27, v6, 0
> +.if \width == 8
> +        vnclipu.wi      v28, v10, 0
> +        vnclipu.wi      v29, v12, 0
> +        vnclipu.wi      v30, v14, 0
> +        vnclipu.wi      v31, v16, 0
> +        vssseg8e8.v     v24, (a0), a2
> +.else
> +        vssseg4e8.v     v24, (a0), a2
> +.endif
> +        ret
> +endfunc
> +.endm
> +
> +idct_dc_add8 4
> +idct_dc_add8 8
> +
> +.macro idct_dc_add width
> +func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
> +        vsetivli        zero, \width, e16, m1, ta, ma
> +        lw              a3, 0(a1)
> +        addi            a3, a3, 32
> +        srai            a3, a3, 6
> +        sw              zero, 0(a1)
> +        add             t4, a0, a2
> +        sh1add          t5, a2, a0
> +        sh1add          t6, a2, t4
> +.if \width == 8
> +        sh2add          t0, a2, a0
> +        sh2add          t1, a2, t4
> +        sh2add          t2, a2, t5
> +        sh2add          t3, a2, t6
> +.endif
> +        vle16.v         v0, (a0)
> +        vle16.v         v2, (t4)
> +        vle16.v         v4, (t5)
> +        vle16.v         v6, (t6)
> +.if \width == 8
> +        vle16.v         v10, (t0)
> +        vle16.v         v12, (t1)
> +        vle16.v         v14, (t2)
> +        vle16.v         v16, (t3)
> +.endif
> +        vadd.vx         v0, v0, a3
> +        vadd.vx         v2, v2, a3
> +        vadd.vx         v4, v4, a3
> +        vadd.vx         v6, v6, a3
> +.if \width == 8
> +        vadd.vx         v10, v10, a3
> +        vadd.vx         v12, v12, a3
> +        vadd.vx         v14, v14, a3
> +        vadd.vx         v16, v16, a3
> +.endif
> +        vmax.vx         v0, v0, zero
> +        vmax.vx         v2, v2, zero
> +        vmax.vx         v4, v4, zero
> +        vmax.vx         v6, v6, zero
> +.if \width == 8
> +        vmax.vx         v10, v10, zero
> +        vmax.vx         v12, v12, zero
> +        vmax.vx         v14, v14, zero
> +        vmax.vx         v16, v16, zero
> +.endif
> +        vmin.vx         v0, v0, a5
> +        vmin.vx         v2, v2, a5
> +        vmin.vx         v4, v4, a5
> +        vmin.vx         v6, v6, a5
> +.if \width == 8
> +        vmin.vx         v10, v10, a5
> +        vmin.vx         v12, v12, a5
> +        vmin.vx         v14, v14, a5
> +        vmin.vx         v16, v16, a5
> +.endif
> +        vse16.v         v0, (a0)
> +        vse16.v         v2, (t4)
> +        vse16.v         v4, (t5)
> +        vse16.v         v6, (t6)
> +.if \width == 8
> +        vse16.v         v10, (t0)
> +        vse16.v         v12, (t1)
> +        vse16.v         v14, (t2)
> +        vse16.v         v16, (t3)
> +.endif
> +        ret
> +endfunc
> +.endm
> +
> +idct_dc_add 4
> +idct_dc_add 8
> +
> +.irp depth,9,10,12,14
> +func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
> +        li              a5, (1 << \depth) - 1
> +        j               ff_h264_idct4_dc_add_16_rvv
> +endfunc
> +
> +func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
> +        li              a5, (1 << \depth) - 1
> +        j               ff_h264_idct8_dc_add_16_rvv
> +endfunc
> +.endr
diff mbox series

Patch

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index c180223141..a1510e8c6e 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -31,6 +31,7 @@  RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
 OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
 RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
 OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
+RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
 OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index dbbf3db400..8c77303ec6 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -1,4 +1,5 @@ 
 /*
+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
  * Copyright © 2024 Rémi Denis-Courmont.
  *
  * This file is part of FFmpeg.
@@ -24,22 +25,57 @@ 
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
 #include "libavcodec/h264dsp.h"
 
 extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
 extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
+void ff_h264_idct4_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct4_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct4_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct4_dc_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct4_dc_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
 
-av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
+av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth,
                                    const int chroma_format_idc)
 {
 #if HAVE_RV
     int flags = av_get_cpu_flags();
 
     if (flags & AV_CPU_FLAG_RVB_BASIC)
-        dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
+        c->startcode_find_candidate = ff_startcode_find_candidate_rvb;
 # if HAVE_RVV
     if (flags & AV_CPU_FLAG_RVV_I32)
-        dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
+        c->startcode_find_candidate = ff_startcode_find_candidate_rvv;
 # endif
+    if ((flags & AV_CPU_FLAG_RVV_I64) && ff_rv_vlen_least(16)) {
+        switch(bit_depth) {
+        case 8:
+            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_8_rvv;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
+            break;
+        case 9:
+            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_9_rvv;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
+            break;
+        case 10:
+            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_10_rvv;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
+            break;
+        case 12:
+            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_12_rvv;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
+            break;
+        case 14:
+            c->h264_idct_dc_add  = ff_h264_idct4_dc_add_14_rvv;
+            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
+            break;
+       }
+    }
 #endif
 }
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
new file mode 100644
index 0000000000..57f0433f7c
--- /dev/null
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -0,0 +1,176 @@ 
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro idct_dc_add8 width
+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
+        vsetivli        zero, \width, e16, m1, ta, ma
+        lh              a3, 0(a1)
+        addi            a3, a3, 32
+        srai            a3, a3, 6
+        sh              zero, 0(a1)
+.if \width == 8
+        vlsseg8e8.v     v24, (a0), a2
+.else
+        vlsseg4e8.v     v24, (a0), a2
+.endif
+        vzext.vf2       v0, v24
+        vzext.vf2       v2, v25
+        vzext.vf2       v4, v26
+        vzext.vf2       v6, v27
+.if \width == 8
+        vzext.vf2       v10, v28
+        vzext.vf2       v12, v29
+        vzext.vf2       v14, v30
+        vzext.vf2       v16, v31
+.endif
+        vadd.vx         v0, v0, a3
+        vadd.vx         v2, v2, a3
+        vadd.vx         v4, v4, a3
+        vadd.vx         v6, v6, a3
+.if \width == 8
+        vadd.vx         v10, v10, a3
+        vadd.vx         v12, v12, a3
+        vadd.vx         v14, v14, a3
+        vadd.vx         v16, v16, a3
+.endif
+        vmax.vx         v0, v0, zero
+        vmax.vx         v2, v2, zero
+        vmax.vx         v4, v4, zero
+        vmax.vx         v6, v6, zero
+.if \width == 8
+        vmax.vx         v10, v10, zero
+        vmax.vx         v12, v12, zero
+        vmax.vx         v14, v14, zero
+        vmax.vx         v16, v16, zero
+.endif
+        vsetvli         zero, zero, e8, mf2, ta, ma
+        vnclipu.wi      v24, v0, 0
+        vnclipu.wi      v25, v2, 0
+        vnclipu.wi      v26, v4, 0
+        vnclipu.wi      v27, v6, 0
+.if \width == 8
+        vnclipu.wi      v28, v10, 0
+        vnclipu.wi      v29, v12, 0
+        vnclipu.wi      v30, v14, 0
+        vnclipu.wi      v31, v16, 0
+        vssseg8e8.v     v24, (a0), a2
+.else
+        vssseg4e8.v     v24, (a0), a2
+.endif
+        ret
+endfunc
+.endm
+
+idct_dc_add8 4
+idct_dc_add8 8
+
+.macro idct_dc_add width
+func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
+        vsetivli        zero, \width, e16, m1, ta, ma
+        lw              a3, 0(a1)
+        addi            a3, a3, 32
+        srai            a3, a3, 6
+        sw              zero, 0(a1)
+        add             t4, a0, a2
+        sh1add          t5, a2, a0
+        sh1add          t6, a2, t4
+.if \width == 8
+        sh2add          t0, a2, a0
+        sh2add          t1, a2, t4
+        sh2add          t2, a2, t5
+        sh2add          t3, a2, t6
+.endif
+        vle16.v         v0, (a0)
+        vle16.v         v2, (t4)
+        vle16.v         v4, (t5)
+        vle16.v         v6, (t6)
+.if \width == 8
+        vle16.v         v10, (t0)
+        vle16.v         v12, (t1)
+        vle16.v         v14, (t2)
+        vle16.v         v16, (t3)
+.endif
+        vadd.vx         v0, v0, a3
+        vadd.vx         v2, v2, a3
+        vadd.vx         v4, v4, a3
+        vadd.vx         v6, v6, a3
+.if \width == 8
+        vadd.vx         v10, v10, a3
+        vadd.vx         v12, v12, a3
+        vadd.vx         v14, v14, a3
+        vadd.vx         v16, v16, a3
+.endif
+        vmax.vx         v0, v0, zero
+        vmax.vx         v2, v2, zero
+        vmax.vx         v4, v4, zero
+        vmax.vx         v6, v6, zero
+.if \width == 8
+        vmax.vx         v10, v10, zero
+        vmax.vx         v12, v12, zero
+        vmax.vx         v14, v14, zero
+        vmax.vx         v16, v16, zero
+.endif
+        vmin.vx         v0, v0, a5
+        vmin.vx         v2, v2, a5
+        vmin.vx         v4, v4, a5
+        vmin.vx         v6, v6, a5
+.if \width == 8
+        vmin.vx         v10, v10, a5
+        vmin.vx         v12, v12, a5
+        vmin.vx         v14, v14, a5
+        vmin.vx         v16, v16, a5
+.endif
+        vse16.v         v0, (a0)
+        vse16.v         v2, (t4)
+        vse16.v         v4, (t5)
+        vse16.v         v6, (t6)
+.if \width == 8
+        vse16.v         v10, (t0)
+        vse16.v         v12, (t1)
+        vse16.v         v14, (t2)
+        vse16.v         v16, (t3)
+.endif
+        ret
+endfunc
+.endm
+
+idct_dc_add 4
+idct_dc_add 8
+
+.irp depth,9,10,12,14
+func ff_h264_idct4_dc_add_\depth\()_rvv, zve64x
+        li              a5, (1 << \depth) - 1
+        j               ff_h264_idct4_dc_add_16_rvv
+endfunc
+
+func ff_h264_idct8_dc_add_\depth\()_rvv, zve64x
+        li              a5, (1 << \depth) - 1
+        j               ff_h264_idct8_dc_add_16_rvv
+endfunc
+.endr