[FFmpeg-devel,v2] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks

Message ID 20230619130609.15547-1-arnie.chang@sifive.com
State New
Series [FFmpeg-devel,v2] lavc/h264chroma: RISC-V V add motion compensation for 4xH and 2xH chroma blocks

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 fail Make fate failed
andriy/make_x86 success Make finished
andriy/make_fate_x86 fail Make fate failed

Commit Message

Arnie Chang June 19, 2023, 1:06 p.m. UTC
Optimize the put and avg filtering for 4xH and 2xH blocks

Signed-off-by: Arnie Chang <arnie.chang@sifive.com>
---
V2:
1. Change \width to a run-time argument
2. Call an internal function instead of instantiating similar code three times (see the sketch below)
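
For reference, the V2 structure is, in C terms, roughly the sketch below: one
filtering body takes the block width as a run-time parameter, and the per-size
entry points are thin wrappers around it. This is only an illustrative scalar
model of the put case (avg additionally averages with the existing destination);
the *_ref names are invented here, the zero-coefficient special cases are
skipped, and the actual implementation is the RVV assembly in the diff.

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the shared body: the filtering loop is written once and
 * takes the block width as a run-time parameter (the assembly keeps it in
 * t6), instead of being instantiated separately for widths 8, 4 and 2. */
static void put_chroma_mc_ref(uint8_t *dst, const uint8_t *src,
                              ptrdiff_t stride, int h, int x, int y, int width)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < width; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}

/* Per-size entry points are thin wrappers, mirroring the mc8/mc4/mc2
 * symbols registered in ff_h264chroma_init_riscv(). */
void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                        ptrdiff_t stride, int h, int x, int y)
{
    put_chroma_mc_ref(dst, src, stride, h, x, y, 8);
}

void put_chroma_mc4_ref(uint8_t *dst, const uint8_t *src,
                        ptrdiff_t stride, int h, int x, int y)
{
    put_chroma_mc_ref(dst, src, stride, h, x, y, 4);
}

void put_chroma_mc2_ref(uint8_t *dst, const uint8_t *src,
                        ptrdiff_t stride, int h, int x, int y)
{
    put_chroma_mc_ref(dst, src, stride, h, x, y, 2);
}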

RVVi32:
 - h264chroma.chroma_mc [OK]
checkasm: all 6 tests passed
avg_h264_chroma_mc1_8_c: 1821.5
avg_h264_chroma_mc1_8_rvv_i32: 466.5
avg_h264_chroma_mc2_8_c: 939.2
avg_h264_chroma_mc2_8_rvv_i32: 466.5
avg_h264_chroma_mc4_8_c: 502.2
avg_h264_chroma_mc4_8_rvv_i32: 466.5
put_h264_chroma_mc1_8_c: 1436.5
put_h264_chroma_mc1_8_rvv_i32: 382.5
put_h264_chroma_mc2_8_c: 824.2
put_h264_chroma_mc2_8_rvv_i32: 382.5
put_h264_chroma_mc4_8_c: 431.2
put_h264_chroma_mc4_8_rvv_i32: 382.5

 libavcodec/riscv/h264_chroma_init_riscv.c |   8 +
 libavcodec/riscv/h264_mc_chroma.S         | 237 ++++++++++++++--------
 2 files changed, 160 insertions(+), 85 deletions(-)

Comments

Arnie Chang July 25, 2023, 3:37 a.m. UTC | #1
It appears that all the issues raised during the review have been fixed,
and there have been no additional comments for over a month.
Could I kindly request assistance in pushing the patch?

Rémi Denis-Courmont July 25, 2023, 6:24 a.m. UTC | #2
Hi,

Sorry, I totally missed the last version. I'll see if I can dig it out of the archives or patchwork.

Patch

diff --git a/libavcodec/riscv/h264_chroma_init_riscv.c b/libavcodec/riscv/h264_chroma_init_riscv.c
index 7c905edfcd..9f95150ea3 100644
--- a/libavcodec/riscv/h264_chroma_init_riscv.c
+++ b/libavcodec/riscv/h264_chroma_init_riscv.c
@@ -27,6 +27,10 @@ 
 
 void h264_put_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
 void h264_avg_chroma_mc8_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc4_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_put_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
+void h264_avg_chroma_mc2_rvv(uint8_t *p_dst, const uint8_t *p_src, ptrdiff_t stride, int h, int x, int y);
 
 av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
 {
@@ -36,6 +40,10 @@  av_cold void ff_h264chroma_init_riscv(H264ChromaContext *c, int bit_depth)
     if (bit_depth == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 16) {
         c->put_h264_chroma_pixels_tab[0] = h264_put_chroma_mc8_rvv;
         c->avg_h264_chroma_pixels_tab[0] = h264_avg_chroma_mc8_rvv;
+        c->put_h264_chroma_pixels_tab[1] = h264_put_chroma_mc4_rvv;
+        c->avg_h264_chroma_pixels_tab[1] = h264_avg_chroma_mc4_rvv;
+        c->put_h264_chroma_pixels_tab[2] = h264_put_chroma_mc2_rvv;
+        c->avg_h264_chroma_pixels_tab[2] = h264_avg_chroma_mc2_rvv;
     }
 #endif
 }
diff --git a/libavcodec/riscv/h264_mc_chroma.S b/libavcodec/riscv/h264_mc_chroma.S
index 364bc3156e..ce99bda44d 100644
--- a/libavcodec/riscv/h264_mc_chroma.S
+++ b/libavcodec/riscv/h264_mc_chroma.S
@@ -19,8 +19,7 @@ 
  */
 #include "libavutil/riscv/asm.S"
 
-.macro  h264_chroma_mc8 type
-func h264_\type\()_chroma_mc8_rvv, zve32x
+.macro  do_chroma_mc type unroll
         csrw            vxrm, zero
         slli            t2, a5, 3
         mul             t1, a5, a4
@@ -30,94 +29,100 @@  func h264_\type\()_chroma_mc8_rvv, zve32x
         sub             a7, a4, t1
         addi            a6, a5, 64
         sub             t0, t2, t1
-        vsetivli        t3, 8, e8, m1, ta, mu
+        vsetvli         t3, t6, e8, m1, ta, mu
         beqz            t1, 2f
         blez            a3, 8f
         li              t4, 0
         li              t2, 0
         li              t5, 1
         addi            a5, t3, 1
-        slli            t3, a2, 2
+        slli            t3, a2, (1 + \unroll)
 1:                                # if (xy != 0)
         add             a4, a1, t4
         vsetvli         zero, a5, e8, m1, ta, ma
+  .ifc \unroll,1
         addi            t2, t2, 4
+  .else
+        addi            t2, t2, 2
+  .endif
         vle8.v          v10, (a4)
         add             a4, a4, a2
         vslide1down.vx  v11, v10, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v8, v10, a6
         vwmaccu.vx      v8, a7, v11
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v12, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v8, t0, v12
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v13, v12, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v10, v12, a6
         vwmaccu.vx      v8, t1, v13
         vwmaccu.vx      v10, a7, v13
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v10, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v15, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v12, v14, a6
         vwmaccu.vx      v10, t1, v15
         vwmaccu.vx      v12, a7, v15
+        vnclipu.wi      v15, v8, 6
+  .ifc \type,avg
+        vle8.v          v9, (a0)
+        vaaddu.vv       v15, v15, v9
+  .endif
+        vse8.v          v15, (a0)
+        add             a0, a0, a2
+        vnclipu.wi      v8, v10, 6
+  .ifc \type,avg
+        vle8.v          v9, (a0)
+        vaaddu.vv       v8, v8, v9
+  .endif
+        add             t4, t4, t3
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+  .ifc \unroll,1
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         add             a4, a4, a2
         vwmaccu.vx      v12, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v15, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v16, v14, a6
         vwmaccu.vx      v12, t1, v15
         vwmaccu.vx      v16, a7, v15
         vsetvli         zero, a5, e8, m1, ta, ma
         vle8.v          v14, (a4)
-        vsetivli        zero, 8, e8, m1, ta, ma
-        add             a4, a0, t4
-        add             t4, t4, t3
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmaccu.vx      v16, t0, v14
         vsetvli         zero, a5, e8, m1, ta, ma
         vslide1down.vx  v14, v14, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
-        vnclipu.wi      v15, v8, 6
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmaccu.vx      v16, t1, v14
-  .ifc \type,avg
-        vle8.v          v9, (a4)
-        vaaddu.vv       v15, v15, v9
-  .endif
-        vse8.v          v15, (a4)
-        add             a4, a4, a2
-        vnclipu.wi      v8, v10, 6
-  .ifc \type,avg
-        vle8.v          v9, (a4)
-        vaaddu.vv       v8, v8, v9
-  .endif
-        vse8.v          v8, (a4)
-        add             a4, a4, a2
         vnclipu.wi      v8, v12, 6
   .ifc \type,avg
-        vle8.v          v9, (a4)
+        vle8.v          v9, (a0)
         vaaddu.vv       v8, v8, v9
   .endif
-        vse8.v          v8, (a4)
-        add             a4, a4, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v16, 6
   .ifc \type,avg
-        vle8.v          v9, (a4)
+        vle8.v          v9, (a0)
         vaaddu.vv       v8, v8, v9
   .endif
-        vse8.v          v8, (a4)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+  .endif
         blt             t2, a3, 1b
         j               8f
 2:
@@ -126,11 +131,15 @@  func h264_\type\()_chroma_mc8_rvv, zve32x
         blez            a3, 8f
         li              a4, 0
         li              t1, 0
-        slli            a7, a2, 2
+        slli            a7, a2, (1 + \unroll)
 3:                                # if ((x8 - xy) == 0 && (y8 -xy) != 0)
         add             a5, a1, a4
         vsetvli         zero, zero, e8, m1, ta, ma
+  .ifc \unroll,1
         addi            t1, t1, 4
+  .else
+        addi            t1, t1, 2
+  .endif
         vle8.v          v8, (a5)
         add             a5, a5, a2
         add             t2, a5, a2
@@ -141,42 +150,44 @@  func h264_\type\()_chroma_mc8_rvv, zve32x
         add             t2, t2, a2
         add             a5, t2, a2
         vwmaccu.vx      v10, t0, v8
-        vle8.v          v8, (t2)
-        vle8.v          v14, (a5)
-        add             a5, a0, a4
         add             a4, a4, a7
         vwmaccu.vx      v12, t0, v9
         vnclipu.wi      v15, v10, 6
         vwmulu.vx       v10, v9, a6
+        vnclipu.wi      v9, v12, 6
   .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v15, v15, v16
   .endif
-        vse8.v          v15, (a5)
-        add             a5, a5, a2
-        vnclipu.wi      v9, v12, 6
-        vwmaccu.vx      v10, t0, v8
-        vwmulu.vx       v12, v8, a6
+        vse8.v          v15, (a0)
+        add             a0, a0, a2
   .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v9, v9, v16
   .endif
-        vse8.v          v9, (a5)
-        add             a5, a5, a2
+        vse8.v          v9, (a0)
+        add             a0, a0, a2
+  .ifc \unroll,1
+        vle8.v          v8, (t2)
+        vle8.v          v14, (a5)
+        vwmaccu.vx      v10, t0, v8
+        vwmulu.vx       v12, v8, a6
         vnclipu.wi      v8, v10, 6
         vwmaccu.vx      v12, t0, v14
   .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v8, v8, v16
   .endif
-        vse8.v          v8, (a5)
-        add             a5, a5, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v12, 6
   .ifc \type,avg
-        vle8.v          v16, (a5)
+        vle8.v          v16, (a0)
         vaaddu.vv       v8, v8, v16
   .endif
-        vse8.v          v8, (a5)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+  .endif
         blt             t1, a3, 3b
         j               8f
 4:
@@ -186,87 +197,95 @@  func h264_\type\()_chroma_mc8_rvv, zve32x
         li              a4, 0
         li              t2, 0
         addi            t0, t3, 1
-        slli            t1, a2, 2
+        slli            t1, a2, (1 + \unroll)
 5:                               # if ((x8 - xy) != 0 && (y8 -xy) == 0)
         add             a5, a1, a4
         vsetvli         zero, t0, e8, m1, ta, ma
+  .ifc \unroll,1
         addi            t2, t2, 4
+  .else
+        addi            t2, t2, 2
+  .endif
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v10, v8, a6
         vwmaccu.vx      v10, a7, v9
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v12, v8, a6
         vwmaccu.vx      v12, a7, v9
+        vnclipu.wi      v16, v10, 6
+  .ifc \type,avg
+        vle8.v          v18, (a0)
+        vaaddu.vv       v16, v16, v18
+  .endif
+        vse8.v          v16, (a0)
+        add             a0, a0, a2
+        vnclipu.wi      v10, v12, 6
+  .ifc \type,avg
+        vle8.v          v18, (a0)
+        vaaddu.vv       v10, v10, v18
+  .endif
+        add             a4, a4, t1
+        vse8.v          v10, (a0)
+        add             a0, a0, a2
+  .ifc \unroll,1
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
         add             a5, a5, a2
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v14, v8, a6
         vwmaccu.vx      v14, a7, v9
         vsetvli         zero, t0, e8, m1, ta, ma
         vle8.v          v8, (a5)
-        add             a5, a0, a4
-        add             a4, a4, t1
         vslide1down.vx  v9, v8, t5
-        vsetivli        zero, 8, e8, m1, ta, ma
-        vnclipu.wi      v16, v10, 6
-  .ifc \type,avg
-        vle8.v          v18, (a5)
-        vaaddu.vv       v16, v16, v18
-  .endif
-        vse8.v          v16, (a5)
-        add             a5, a5, a2
-        vnclipu.wi      v10, v12, 6
+        vsetvli         zero, t6, e8, m1, ta, ma
         vwmulu.vx       v12, v8, a6
-  .ifc \type,avg
-        vle8.v          v18, (a5)
-        vaaddu.vv       v10, v10, v18
-  .endif
-        vse8.v          v10, (a5)
-        add             a5, a5, a2
         vnclipu.wi      v8, v14, 6
         vwmaccu.vx      v12, a7, v9
   .ifc \type,avg
-        vle8.v          v18, (a5)
+        vle8.v          v18, (a0)
         vaaddu.vv       v8, v8, v18
   .endif
-        vse8.v          v8, (a5)
-        add             a5, a5, a2
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
         vnclipu.wi      v8, v12, 6
   .ifc \type,avg
-        vle8.v          v18, (a5)
+        vle8.v          v18, (a0)
         vaaddu.vv       v8, v8, v18
   .endif
-        vse8.v          v8, (a5)
+        vse8.v          v8, (a0)
+        add             a0, a0, a2
+  .endif
         blt             t2, a3, 5b
         j               8f
 6:
         blez            a3, 8f
         li              a4, 0
         li              t2, 0
-        slli            a7, a2, 2
+        slli            a7, a2, (1 + \unroll)
 7:                               # the final else, none of the above conditions are met
         add             t0, a1, a4
         vsetvli         zero, zero, e8, m1, ta, ma
         add             a5, a0, a4
         add             a4, a4, a7
+  .ifc \unroll,1
         addi            t2, t2, 4
+  .else
+        addi            t2, t2, 2
+  .endif
         vle8.v          v8, (t0)
         add             t0, t0, a2
         add             t1, t0, a2
         vwmulu.vx       v10, v8, a6
         vle8.v          v8, (t0)
         add             t0, t1, a2
-        vle8.v          v9, (t1)
-        vle8.v          v12, (t0)
         vnclipu.wi      v13, v10, 6
         vwmulu.vx       v10, v8, a6
   .ifc \type,avg
@@ -276,13 +295,16 @@  func h264_\type\()_chroma_mc8_rvv, zve32x
         vse8.v          v13, (a5)
         add             a5, a5, a2
         vnclipu.wi      v8, v10, 6
-        vwmulu.vx       v10, v9, a6
   .ifc \type,avg
         vle8.v          v18, (a5)
         vaaddu.vv       v8, v8, v18
   .endif
         vse8.v          v8, (a5)
         add             a5, a5, a2
+  .ifc \unroll,1
+        vle8.v          v9, (t1)
+        vle8.v          v12, (t0)
+        vwmulu.vx       v10, v9, a6
         vnclipu.wi      v8, v10, 6
         vwmulu.vx       v10, v12, a6
   .ifc \type,avg
@@ -297,11 +319,56 @@  func h264_\type\()_chroma_mc8_rvv, zve32x
         vaaddu.vv       v8, v8, v18
   .endif
         vse8.v          v8, (a5)
+  .endif
         blt             t2, a3, 7b
 8:
         ret
-endfunc
 .endm
 
-h264_chroma_mc8 put
-h264_chroma_mc8 avg
+func h264_put_chroma_mc_rvv, zve32x
+11:
+        li      a7, 3
+        blt     a3, a7, 12f
+        do_chroma_mc put 1
+12:
+        do_chroma_mc put 0
+endfunc
+
+func h264_avg_chroma_mc_rvv, zve32x
+21:
+        li      a7, 3
+        blt     a3, a7, 22f
+        do_chroma_mc avg 1
+22:
+        do_chroma_mc avg 0
+endfunc
+
+func h264_put_chroma_mc8_rvv, zve32x
+        li      t6, 8
+        j       11b
+endfunc
+
+func h264_put_chroma_mc4_rvv, zve32x
+        li      t6, 4
+        j       11b
+endfunc
+
+func h264_put_chroma_mc2_rvv, zve32x
+        li      t6, 2
+        j       11b
+endfunc
+
+func h264_avg_chroma_mc8_rvv, zve32x
+        li      t6, 8
+        j       21b
+endfunc
+
+func h264_avg_chroma_mc4_rvv, zve32x
+        li      t6, 4
+        j       21b
+endfunc
+
+func h264_avg_chroma_mc2_rvv, zve32x
+        li      t6, 2
+        j       21b
+endfunc