
[FFmpeg-devel] lavc/vvc_mc: R-V V avg w_avg

Message ID tencent_E698AABD05DAD064F1D79C33D673582A8F06@qq.com
State New
Series: [FFmpeg-devel] lavc/vvc_mc: R-V V avg w_avg

Checks

Context               Check    Description
andriy/make_x86       success  Make finished
andriy/make_fate_x86  success  Make fate finished

Commit Message

uk7b@foxmail.com July 10, 2024, 10:02 a.m. UTC
From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                      C908   X60
avg_8_2x2_c                                        :    1.2    1.2
avg_8_2x2_rvv_i32                                  :    0.7    0.7
avg_8_2x4_c                                        :    2.0    2.0
avg_8_2x4_rvv_i32                                  :    1.2    1.0
avg_8_2x8_c                                        :    3.7    4.0
avg_8_2x8_rvv_i32                                  :    1.7    1.5
avg_8_2x16_c                                       :    7.2    7.5
avg_8_2x16_rvv_i32                                 :    3.0    2.7
avg_8_2x32_c                                       :   14.5   15.2
avg_8_2x32_rvv_i32                                 :    5.5    5.0
avg_8_2x64_c                                       :   53.5   42.2
avg_8_2x64_rvv_i32                                 :   42.0   33.2
avg_8_2x128_c                                      :   93.5   86.0
avg_8_2x128_rvv_i32                                :   79.2   74.0
avg_8_4x2_c                                        :    1.7    2.0
avg_8_4x2_rvv_i32                                  :    1.0    1.0
avg_8_4x4_c                                        :    3.5    3.5
avg_8_4x4_rvv_i32                                  :    1.2    1.0
avg_8_4x8_c                                        :    6.5    7.0
avg_8_4x8_rvv_i32                                  :    1.7    1.7
avg_8_4x16_c                                       :   13.5   14.0
avg_8_4x16_rvv_i32                                 :    3.0    2.5
avg_8_4x32_c                                       :   26.2   27.5
avg_8_4x32_rvv_i32                                 :    5.7    5.0
avg_8_4x64_c                                       :   79.0   66.5
avg_8_4x64_rvv_i32                                 :   41.7   34.2
avg_8_4x128_c                                      :  154.0  128.7
avg_8_4x128_rvv_i32                                :   80.5   74.5
avg_8_8x2_c                                        :    3.2    3.2
avg_8_8x2_rvv_i32                                  :    1.0    0.7
avg_8_8x4_c                                        :    6.5    6.5
avg_8_8x4_rvv_i32                                  :    1.2    1.0
avg_8_8x8_c                                        :   12.5   13.2
avg_8_8x8_rvv_i32                                  :    2.0    1.7
avg_8_8x16_c                                       :   25.2   26.5
avg_8_8x16_rvv_i32                                 :    3.2    2.7
avg_8_8x32_c                                       :   50.0   52.7
avg_8_8x32_rvv_i32                                 :    6.2    4.7
avg_8_8x64_c                                       :  130.0  112.2
avg_8_8x64_rvv_i32                                 :   44.2   33.5
avg_8_8x128_c                                      :  241.5  226.7
avg_8_8x128_rvv_i32                                :   78.7   74.0
avg_8_16x2_c                                       :    6.2    6.5
avg_8_16x2_rvv_i32                                 :    1.2    0.7
avg_8_16x4_c                                       :   12.2   13.0
avg_8_16x4_rvv_i32                                 :    1.7    1.0
avg_8_16x8_c                                       :   24.7   25.7
avg_8_16x8_rvv_i32                                 :    3.0    1.7
avg_8_16x16_c                                      :   49.0   51.5
avg_8_16x16_rvv_i32                                :    5.5    3.2
avg_8_16x32_c                                      :   97.7  102.7
avg_8_16x32_rvv_i32                                :   10.5    5.5
avg_8_16x64_c                                      :  219.5  223.5
avg_8_16x64_rvv_i32                                :   56.7   34.5
avg_8_16x128_c                                     :  409.7  426.0
avg_8_16x128_rvv_i32                               :   98.7   73.5
avg_8_32x2_c                                       :   12.5   13.0
avg_8_32x2_rvv_i32                                 :    1.7    1.0
avg_8_32x4_c                                       :   24.2   25.5
avg_8_32x4_rvv_i32                                 :    3.0    1.5
avg_8_32x8_c                                       :   48.5   50.7
avg_8_32x8_rvv_i32                                 :    5.2    2.7
avg_8_32x16_c                                      :   96.5  101.2
avg_8_32x16_rvv_i32                                :   10.2    5.0
avg_8_32x32_c                                      :  192.7  202.5
avg_8_32x32_rvv_i32                                :   19.7    9.5
avg_8_32x64_c                                      :  433.5  415.5
avg_8_32x64_rvv_i32                                :   38.7   18.2
avg_8_32x128_c                                     :  812.0  820.7
avg_8_32x128_rvv_i32                               :  145.2   73.0
avg_8_64x2_c                                       :   24.0   25.2
avg_8_64x2_rvv_i32                                 :    2.7    1.5
avg_8_64x4_c                                       :   48.0   50.5
avg_8_64x4_rvv_i32                                 :    5.2    2.5
avg_8_64x8_c                                       :  117.5  100.7
avg_8_64x8_rvv_i32                                 :   10.0    4.7
avg_8_64x16_c                                      :  208.5  201.0
avg_8_64x16_rvv_i32                                :   19.0    9.0
avg_8_64x32_c                                      :  382.7  402.0
avg_8_64x32_rvv_i32                                :   37.5   17.5
avg_8_64x64_c                                      :  830.0  834.2
avg_8_64x64_rvv_i32                                :   75.5   34.5
avg_8_64x128_c                                     : 2008.0 1705.2
avg_8_64x128_rvv_i32                               :  205.5  149.2
avg_8_128x2_c                                      :   48.7   51.0
avg_8_128x2_rvv_i32                                :    5.2    2.7
avg_8_128x4_c                                      :   96.5  101.2
avg_8_128x4_rvv_i32                                :   10.2    4.7
avg_8_128x8_c                                      :  192.2  202.0
avg_8_128x8_rvv_i32                                :   19.7    9.5
avg_8_128x16_c                                     :  385.5  403.2
avg_8_128x16_rvv_i32                               :   38.7   18.2
avg_8_128x32_c                                     :  788.0  805.7
avg_8_128x32_rvv_i32                               :   77.0   36.2
avg_8_128x64_c                                     : 1597.5 1658.0
avg_8_128x64_rvv_i32                               :  175.5   78.7
avg_8_128x128_c                                    : 3156.0 3282.5
avg_8_128x128_rvv_i32                              :  369.2  276.7
w_avg_8_2x2_c                                      :    1.5    1.5
w_avg_8_2x2_rvv_i32                                :    1.2    1.0
w_avg_8_2x4_c                                      :    2.7    2.5
w_avg_8_2x4_rvv_i32                                :    1.7    1.7
w_avg_8_2x8_c                                      :    5.0    4.7
w_avg_8_2x8_rvv_i32                                :    2.7    2.5
w_avg_8_2x16_c                                     :    9.7    9.5
w_avg_8_2x16_rvv_i32                               :    4.7    4.5
w_avg_8_2x32_c                                     :   18.7   18.5
w_avg_8_2x32_rvv_i32                               :    9.0    7.7
w_avg_8_2x64_c                                     :   64.0   51.2
w_avg_8_2x64_rvv_i32                               :   50.0   38.2
w_avg_8_2x128_c                                    :  107.7   94.0
w_avg_8_2x128_rvv_i32                              :   86.2   75.7
w_avg_8_4x2_c                                      :    2.5    2.5
w_avg_8_4x2_rvv_i32                                :    1.2    1.0
w_avg_8_4x4_c                                      :    4.7    4.5
w_avg_8_4x4_rvv_i32                                :    1.7    1.5
w_avg_8_4x8_c                                      :    9.0    9.0
w_avg_8_4x8_rvv_i32                                :    2.7    2.5
w_avg_8_4x16_c                                     :   17.7   17.5
w_avg_8_4x16_rvv_i32                               :    5.0    4.2
w_avg_8_4x32_c                                     :   34.7   35.0
w_avg_8_4x32_rvv_i32                               :    9.0    8.0
w_avg_8_4x64_c                                     :  103.2   82.0
w_avg_8_4x64_rvv_i32                               :   45.7   37.5
w_avg_8_4x128_c                                    :  210.0  164.5
w_avg_8_4x128_rvv_i32                              :   86.2   75.7
w_avg_8_8x2_c                                      :    4.5    4.5
w_avg_8_8x2_rvv_i32                                :    1.2    1.2
w_avg_8_8x4_c                                      :    8.7    8.5
w_avg_8_8x4_rvv_i32                                :    1.7    1.5
w_avg_8_8x8_c                                      :   17.2   17.2
w_avg_8_8x8_rvv_i32                                :    3.2    2.5
w_avg_8_8x16_c                                     :   34.0   34.0
w_avg_8_8x16_rvv_i32                               :    5.5    4.2
w_avg_8_8x32_c                                     :   67.7   67.7
w_avg_8_8x32_rvv_i32                               :   10.7    8.0
w_avg_8_8x64_c                                     :  174.0  145.5
w_avg_8_8x64_rvv_i32                               :   50.0   40.0
w_avg_8_8x128_c                                    :  342.2  294.2
w_avg_8_8x128_rvv_i32                              :   85.2   75.2
w_avg_8_16x2_c                                     :    8.5    8.5
w_avg_8_16x2_rvv_i32                               :    2.0    1.0
w_avg_8_16x4_c                                     :   16.7   17.0
w_avg_8_16x4_rvv_i32                               :    3.2    1.7
w_avg_8_16x8_c                                     :   33.2   33.2
w_avg_8_16x8_rvv_i32                               :    5.5    3.0
w_avg_8_16x16_c                                    :   66.5   66.7
w_avg_8_16x16_rvv_i32                              :   28.2    5.0
w_avg_8_16x32_c                                    :  134.0  133.5
w_avg_8_16x32_rvv_i32                              :   20.0    9.5
w_avg_8_16x64_c                                    :  318.2  344.5
w_avg_8_16x64_rvv_i32                              :   71.7   41.7
w_avg_8_16x128_c                                   :  718.0  583.0
w_avg_8_16x128_rvv_i32                             :  117.5   78.2
w_avg_8_32x2_c                                     :   16.7   16.7
w_avg_8_32x2_rvv_i32                               :    3.7    3.2
w_avg_8_32x4_c                                     :   33.2   33.5
w_avg_8_32x4_rvv_i32                               :    6.7    6.0
w_avg_8_32x8_c                                     :   65.7   66.0
w_avg_8_32x8_rvv_i32                               :   12.5   11.0
w_avg_8_32x16_c                                    :  132.7  133.5
w_avg_8_32x16_rvv_i32                              :   24.0   21.5
w_avg_8_32x32_c                                    :  311.5  263.5
w_avg_8_32x32_rvv_i32                              :   47.7   42.5
w_avg_8_32x64_c                                    :  592.0  555.5
w_avg_8_32x64_rvv_i32                              :  126.5   97.7
w_avg_8_32x128_c                                   : 1179.0 1139.5
w_avg_8_32x128_rvv_i32                             :  238.2  180.7
w_avg_8_64x2_c                                     :   32.7   33.0
w_avg_8_64x2_rvv_i32                               :    6.0    3.2
w_avg_8_64x4_c                                     :   65.7   66.0
w_avg_8_64x4_rvv_i32                               :   11.5    5.7
w_avg_8_64x8_c                                     :  134.0  132.2
w_avg_8_64x8_rvv_i32                               :   22.7   11.0
w_avg_8_64x16_c                                    :  281.2  262.5
w_avg_8_64x16_rvv_i32                              :   44.2   21.5
w_avg_8_64x32_c                                    :  646.2  570.0
w_avg_8_64x32_rvv_i32                              :   88.0   42.5
w_avg_8_64x64_c                                    : 1203.0 1066.7
w_avg_8_64x64_rvv_i32                              :  210.7   90.5
w_avg_8_64x128_c                                   : 2688.0 2156.2
w_avg_8_64x128_rvv_i32                             :  443.0  214.7
w_avg_8_128x2_c                                    :   65.7   66.0
w_avg_8_128x2_rvv_i32                              :   11.2    5.5
w_avg_8_128x4_c                                    :  131.0  133.0
w_avg_8_128x4_rvv_i32                              :   22.0   10.2
w_avg_8_128x8_c                                    :  263.5  273.0
w_avg_8_128x8_rvv_i32                              :   43.2   20.0
w_avg_8_128x16_c                                   :  525.7  528.0
w_avg_8_128x16_rvv_i32                             :   85.5   39.2
w_avg_8_128x32_c                                   : 1064.5 1211.0
w_avg_8_128x32_rvv_i32                             :  170.7   78.5
w_avg_8_128x64_c                                   : 2305.5 2350.7
w_avg_8_128x64_rvv_i32                             :  400.0  177.5
w_avg_8_128x128_c                                  : 4771.7 4992.7
w_avg_8_128x128_rvv_i32                            :  757.5  371.5
---
 libavcodec/riscv/vvc/Makefile      |   2 +
 libavcodec/riscv/vvc/vvc_mc_rvv.S  | 288 +++++++++++++++++++++++++++++
 libavcodec/riscv/vvc/vvcdsp_init.c |  72 ++++++++
 libavcodec/vvc/dsp.c               |   4 +-
 libavcodec/vvc/dsp.h               |   1 +
 5 files changed, 366 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/vvc/Makefile
 create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
 create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
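
For 8-bit, the arithmetic the two new functions implement boils down to the
following C sketch, read off the assembly in the patch: avg's rounding matches
the vnclipu.wi by 7 with vxrm=0, and w_avg's offset matches the
(o0 + o1 + 1) << (shift - 1) value precomputed in func_w_avg; the 128-element
source stride is VVC's MAX_PB_SIZE. This is an illustration, not the exact
FFmpeg C template:

#include <stdint.h>
#include <stddef.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

/* Bi-prediction average: dst = clip8((src0 + src1 + 64) >> 7). */
static void avg_8(uint8_t *dst, ptrdiff_t dst_stride,
                  const int16_t *src0, const int16_t *src1,
                  int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = av_clip_uint8((src0[x] + src1[x] + 64) >> 7);
        dst  += dst_stride;
        src0 += 128;            /* MAX_PB_SIZE */
        src1 += 128;
    }
}

/* Weighted average: shift = denom + 7, offset = (o0 + o1 + 1) << (shift - 1). */
static void w_avg_8(uint8_t *dst, ptrdiff_t dst_stride,
                    const int16_t *src0, const int16_t *src1,
                    int width, int height,
                    int denom, int w0, int w1, int o0, int o1)
{
    const int shift  = denom + 7;
    const int offset = (o0 + o1 + 1) << (shift - 1);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = av_clip_uint8((src0[x] * w0 + src1[x] * w1 + offset) >> shift);
        dst  += dst_stride;
        src0 += 128;
        src1 += 128;
    }
}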

Comments

Rémi Denis-Courmont July 16, 2024, 2:21 p.m. UTC | #1
On Wednesday, 10 July 2024 at 13:02:44 EEST, uk7b@foxmail.com wrote:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> [...]
> 
> diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
> new file mode 100644
> index 0000000000..582b051579
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/Makefile
> @@ -0,0 +1,2 @@
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..8cf4bcf680
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> @@ -0,0 +1,288 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w, vlen, max_lmul=m4

Again, I don't think that a maximum multiplier belongs here. If the calling 
code cannot scale the multiplier up, then it should be a normal loop providing 
the same code for all VLENs.
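
For reference, the VLEN-agnostic shape being suggested is essentially what the
patch's own \w == 128 path already does: let vsetvli pick VL from the actual
hardware and strip-mine until the row is consumed (a sketch; register roles
follow the quoted code):

1:
        vsetvli           t0, a4, e16, m8, ta, ma  // VL = min(VLMAX, remaining), scales with VLEN
        sub               a4, a4, t0
        // ... load, compute and store VL elements ...
        sh1add            a2, t0, a2               // advance the 16-bit source pointers
        sh1add            a3, t0, a3
        add               a0, a0, t0               // and the 8-bit destination
        bnez              a4, 1b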

> +        .if \w == 2 && \vlen == 128
> +                vsetivli        zero, \w, e8, mf8, ta, ma
> +        .elseif \w == 4 && \vlen == 128
> +                vsetivli        zero, \w, e8, mf4, ta, ma
> +        .elseif \w == 8 && \vlen == 128
> +                vsetivli        zero, \w, e8, mf2, ta, ma
> +        .elseif \w == 16 && \vlen == 128
> +                vsetivli        zero, \w, e8, m1, ta, ma
> +        .elseif \w == 32 && \vlen == 128
> +                li              t0, \w
> +                vsetvli         zero, t0, e8, m2, ta, ma
> +        .elseif \w <= 4 && \vlen == 256
> +                vsetivli        zero, \w, e8, mf8, ta, ma
> +        .elseif \w == 8 && \vlen == 256
> +                vsetivli        zero, \w, e8, mf4, ta, ma
> +        .elseif \w == 16 && \vlen == 256
> +                vsetivli        zero, \w, e8, mf2, ta, ma
> +        .elseif \w == 32 && \vlen == 256
> +                li              t0, \w
> +                vsetvli         zero, t0, e8, m1, ta, ma
> +        .elseif \w == 64 && \vlen == 256
> +                li              t0, \w
> +                vsetvli         zero, t0, e8, m2, ta, ma
> +        // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
> +        .else
> +                li              t0, \w
> +                vsetvli         zero, t0, e8, \max_lmul, ta, ma
> +        .endif
> +.endm
> +
> +.macro vsetvlstatic16 w, vlen, max_lmul=m8
> +        .if \w == 2 && \vlen == 128
> +                vsetivli        zero, \w, e16, mf4, ta, ma
> +        .elseif \w == 4 && \vlen == 128
> +                vsetivli        zero, \w, e16, mf2, ta, ma
> +        .elseif \w == 8 && \vlen == 128
> +                vsetivli        zero, \w, e16, m1, ta, ma
> +        .elseif \w == 16 && \vlen == 128
> +                vsetivli        zero, \w, e16, m2, ta, ma
> +        .elseif \w == 32 && \vlen == 128
> +                li              t0, \w
> +                vsetvli         zero, t0, e16, m4, ta, ma
> +        .elseif \w <= 4 && \vlen == 256
> +                vsetivli        zero, \w, e16, mf4, ta, ma
> +        .elseif \w == 8 && \vlen == 256
> +                vsetivli        zero, \w, e16, mf2, ta, ma
> +        .elseif \w == 16 && \vlen == 256
> +                vsetivli        zero, \w, e16, m1, ta, ma
> +        .elseif \w == 32 && \vlen == 256
> +                li              t0, \w
> +                vsetvli         zero, t0, e16, m2, ta, ma
> +        .elseif \w == 64 && \vlen == 256
> +                li              t0, \w
> +                vsetvli         zero, t0, e16, m4, ta, ma
> +        // (\w <= 128 && \vlen == 128) || (\w == 128 && \vlen == 256)
> +        .else
> +                li              t0, \w
> +                vsetvli         zero, t0, e16, \max_lmul, ta, ma
> +        .endif
> +.endm
> +
> +.macro vsetvlstatic32 w, vlen
> +        .if \w == 2
> +                vsetivli        zero, \w, e32, mf2, ta, ma
> +        .elseif \w == 4 && \vlen == 128
> +                vsetivli        zero, \w, e32, m1, ta, ma
> +        .elseif \w == 8 && \vlen == 128
> +                vsetivli        zero, \w, e32, m2, ta, ma
> +        .elseif \w == 16 && \vlen == 128
> +                vsetivli        zero, \w, e32, m4, ta, ma
> +        .elseif \w == 4 && \vlen == 256
> +                vsetivli        zero, \w, e32, mf2, ta, ma
> +        .elseif \w == 8 && \vlen == 256
> +                vsetivli        zero, \w, e32, m1, ta, ma
> +        .elseif \w == 16 && \vlen == 256
> +                vsetivli        zero, \w, e32, m2, ta, ma
> +        .elseif \w == 32 && \vlen == 256
> +                li              t0, \w
> +                vsetvli         zero, t0, e32, m4, ta, ma
> +        // (\w <= 128 && \vlen == 128) || (\w <= 128 && \vlen == 256)
> +        .else
> +                li              t0, \w
> +                vsetvli         zero, t0, e32, m8, ta, ma
> +        .endif
> +.endm
> +
> +.macro avg w, vlen, id
> +\id\w\vlen:
> +.if \w < 128
> +        vsetvlstatic16    \w, \vlen
> +        addi              t0, a2, 128*2
> +        addi              t1, a3, 128*2
> +        add               t2, a0, a1
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        addi              a5, a5, -2
> +        vle16.v           v16, (t0)
> +        vle16.v           v24, (t1)
> +        vadd.vv           v8, v8, v0
> +        vadd.vv           v24, v24, v16
> +        vmax.vx           v8, v8, zero
> +        vmax.vx           v24, v24, zero
> +        vsetvlstatic8     \w, \vlen
> +        addi              a2, a2, 128*4
> +        vnclipu.wi        v8, v8, 7
> +        vnclipu.wi        v24, v24, 7
> +        addi              a3, a3, 128*4
> +        vse8.v            v8, (a0)
> +        vse8.v            v24, (t2)
> +        sh1add            a0, a1, a0
> +.else
> +        addi              a5, a5, -1
> +        mv                t1, a0
> +        mv                t2, a2
> +        mv                t3, a3
> +        mv                t4, a4
> +1:
> +        vsetvli           t0, a4, e16, m8, ta, ma
> +        sub               a4, a4, t0
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vadd.vv           v8, v8, v0
> +        vmax.vx           v8, v8, zero
> +        vsetvli           zero, zero, e8, m4, ta, ma
> +        vnclipu.wi        v8, v8, 7
> +        vse8.v            v8, (a0)
> +        sh1add            a2, t0, a2
> +        sh1add            a3, t0, a3
> +        add               a0, a0, t0
> +        bnez              a4, 1b
> +        add               a0, t1, a1
> +        addi              a2, t2, 128*2
> +        addi              a3, t3, 128*2
> +        mv                a4, t4
> +.endif
> +        bnez              a5, \id\w\vlen\()b
> +        ret
> +.endm
> +
> +
> +.macro AVG_JMP_TABLE id, vlen
> +const jmp_table_\id\vlen
> +        .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
> +        .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
> +        .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
> +        .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
> +        .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
> +        .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
> +        .4byte \id\()128\vlen\()f - jmp_table_\id\vlen

Maybe use .irp here?
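
Presumably meaning something along these lines (flow gg reports below that this
exact form, written inside the macro, trips a GAS escaping error):

const jmp_table_\id\vlen
        .irp w, 2, 4, 8, 16, 32, 64, 128
        .4byte \id\()\w\()\vlen\()f - jmp_table_\id\vlen
        .endr
endconst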

> +endconst
> +.endm
> +
> +.macro AVG_J vlen, id
> +        clz               t1, a4
> +        neg               t1, t1
> +        lla               t5, jmp_table_\id\vlen
> +        sh2add            t1, t1, t5
> +        lw                t1, ((__riscv_xlen-2)<<2)(t1)
> +        add               t1, t1, t5
> +        jr                t1
> +.endm
> +
> +.macro func_avg vlen
> +func ff_vvc_avg_8_rvv_\vlen\(), zve32x
> +        AVG_JMP_TABLE     1, \vlen
> +        csrwi             vxrm, 0
> +        AVG_J             \vlen, 1
> +        .irp w,2,4,8,16,32,64,128
> +        avg               \w, \vlen, 1
> +        .endr
> +endfunc
> +.endm
> +
> +.macro w_avg w, vlen, id
> +\id\w\vlen:
> +.if \w < 32
> +        vsetvlstatic16    \w, \vlen, m4
> +        addi              t0, a2, 128*2
> +        addi              t1, a3, 128*2
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        addi              a5, a5, -2
> +        vle16.v           v20, (t0)
> +        vle16.v           v24, (t1)
> +        vwmul.vx          v16, v0, a7
> +        vwmul.vx          v28, v20, a7
> +        vwmacc.vx         v16, t3, v8
> +        vwmacc.vx         v28, t3, v24
> +        vsetvlstatic32    \w, \vlen
> +        add               t2, a0, a1
> +        vadd.vx           v16, v16, t4
> +        vadd.vx           v28, v28, t4
> +        vsetvlstatic16    \w, \vlen, m4
> +        vnsrl.wx          v16, v16, t6
> +        vnsrl.wx          v28, v28, t6
> +        vmax.vx           v16, v16, zero
> +        vmax.vx           v28, v28, zero
> +        vsetvlstatic8     \w, \vlen, m2
> +        addi              a2, a2, 128*4
> +        vnclipu.wi        v16, v16, 0
> +        vnclipu.wi        v28, v28, 0
> +        vse8.v            v16, (a0)
> +        addi              a3, a3, 128*4
> +        vse8.v            v28, (t2)
> +        sh1add            a0, a1, a0
> +.else
> +        addi              a5, a5, -1
> +        mv                t1, a0
> +        mv                t2, a2
> +        mv                t5, a3
> +        mv                a6, a4
> +1:
> +        vsetvli           t0, a4, e16, m4, ta, ma
> +        sub               a4, a4, t0
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vwmul.vx          v16, v0, a7
> +        vwmacc.vx         v16, t3, v8
> +        vsetvli           zero, zero, e32, m8, ta, ma
> +        vadd.vx           v16, v16, t4
> +        vsetvli           zero, zero, e16, m4, ta, ma
> +        vnsrl.wx          v16, v16, t6
> +        vmax.vx           v16, v16, zero
> +        vsetvli           zero, zero, e8, m2, ta, ma
> +        vnclipu.wi        v16, v16, 0
> +        vse8.v            v16, (a0)
> +        sh1add            a2, t0, a2
> +        sh1add            a3, t0, a3
> +        add               a0, a0, t0
> +        bnez              a4, 1b
> +        add               a0, t1, a1
> +        addi              a2, t2, 128*2
> +        addi              a3, t5, 128*2
> +        mv                a4, a6
> +.endif
> +        bnez              a5, \id\w\vlen\()b
> +        ret
> +.endm
> +
> +
> +.macro func_w_avg vlen
> +func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
> +        AVG_JMP_TABLE     2, \vlen
> +        csrwi             vxrm, 0
> +        addi              t6, a6, 7
> +        ld                t3, (sp)
> +        ld                t4, 8(sp)
> +        ld                t5, 16(sp)

Breaks build if XLEN = 32.

> +        addi              t4, t4, 1       // o0 + o1 + 1
> +        add               t4, t4, t5
> +        addi              t5, t6, -1      // shift - 1
> +        sll               t4, t4, t5
> +        AVG_J             \vlen, 2
> +        .irp w,2,4,8,16,32,64,128
> +        w_avg             \w, \vlen, 2
> +        .endr
> +endfunc
> +.endm
> +
> +func_avg 128
> +func_avg 256
> +#if (__riscv_xlen == 64)
> +func_w_avg 128
> +func_w_avg 256
> +#endif
> diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
> new file mode 100644
> index 0000000000..9819a7c570
> --- /dev/null
> +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> @@ -0,0 +1,72 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd,  opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt)                                                                \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                               \
> +    const int16_t *src0, const int16_t *src1, int width, int height);                         \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                            \
> +    const int16_t *src0, const int16_t *src1, int width, int height,                          \
> +    int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> +    const int flags = av_get_cpu_flags();
> +    int vlenb = ff_get_rv_vlenb();
> +
> +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +        vlenb >= 32) {
> +        switch (bd) {
> +            case 8:
> +                c->inter.avg    = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
> +# endif
> +                break;
> +            default:
> +                break;
> +        }
> +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +               vlenb >= 16) {
> +        switch (bd) {
> +            case 8:
> +                c->inter.avg    = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
> +# endif
> +                break;
> +            default:
> +                break;
> +        }
> +    }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 41e830a98a..c55a37d255 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
>          break;
>      }
> 
> -#if ARCH_X86
> +#if ARCH_RISCV
> +    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> +#elif ARCH_X86
>      ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
>  #endif
>  }
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 1f14096c41..e03236dd76 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -180,6 +180,7 @@ typedef struct VVCDSPContext {
> 
>  void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
> 
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
>  void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
> 
>  #endif /* AVCODEC_VVC_DSP_H */
flow gg July 18, 2024, 3:04 p.m. UTC | #2
> Again, I don't think that a maximum multiplier belongs here. If the calling
> code cannot scale the multiplier up, then it should be a normal loop providing
> the same code for all VLENs.

I think it's acceptable to add such a parameter here, even though it isn't
particularly common in other files, because these vset macros live in
vvc_mc_rvv.S rather than in libavutil/riscv/asm.S. The parameter isn't only
used for avg and w_avg; it can also save some .if branches for other
functions in vvc_mc_rvv.S later on.


>> +        .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
>> +        .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
> Maybe use .irp here?

I'm not sure — mixing the .irp variable with the macro arguments gives a
syntax error:

        .irp w,2,4,8,16,32,64,128
        .4byte \id\()\w\()\vlen\()f - jmp_table_\id\vlen
        .endr

libavcodec/riscv/vvc/vvc_mc_rvv.S:176: Error: junk at end of line, first
unrecognized character is `\'
libavcodec/riscv/vvc/vvc_mc_rvv.S:195:   Info: macro invoked from here
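
One possible workaround (untested, and the helper name is hypothetical): split
the expansion across a helper macro, so the .irp variable and the outer macro
arguments are substituted in separate passes:

.macro jmp_entry id, w, vlen    // hypothetical helper, not in the patch
        .4byte \id\()\w\()\vlen\()f - jmp_table_\id\vlen
.endm

        .irp w, 2, 4, 8, 16, 32, 64, 128
        jmp_entry \id, \w, \vlen
        .endr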

> Breaks build if XLEN = 32.

Okay, updated it.

Rémi Denis-Courmont <remi@remlab.net> wrote on Tuesday, 16 July 2024 at 22:31:

> Le keskiviikkona 10. heinäkuuta 2024, 13.02.44 EEST uk7b@foxmail.com a
> écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> >                                                       C908   X60
> > avg_8_2x2_c                                        :    1.2    1.2
> > avg_8_2x2_rvv_i32                                  :    0.7    0.7
> > avg_8_2x4_c                                        :    2.0    2.0
> > avg_8_2x4_rvv_i32                                  :    1.2    1.0
> > avg_8_2x8_c                                        :    3.7    4.0
> > avg_8_2x8_rvv_i32                                  :    1.7    1.5
> > avg_8_2x16_c                                       :    7.2    7.5
> > avg_8_2x16_rvv_i32                                 :    3.0    2.7
> > avg_8_2x32_c                                       :   14.5   15.2
> > avg_8_2x32_rvv_i32                                 :    5.5    5.0
> > avg_8_2x64_c                                       :   53.5   42.2
> > avg_8_2x64_rvv_i32                                 :   42.0   33.2
> > avg_8_2x128_c                                      :   93.5   86.0
> > avg_8_2x128_rvv_i32                                :   79.2   74.0
> > avg_8_4x2_c                                        :    1.7    2.0
> > avg_8_4x2_rvv_i32                                  :    1.0    1.0
> > avg_8_4x4_c                                        :    3.5    3.5
> > avg_8_4x4_rvv_i32                                  :    1.2    1.0
> > avg_8_4x8_c                                        :    6.5    7.0
> > avg_8_4x8_rvv_i32                                  :    1.7    1.7
> > avg_8_4x16_c                                       :   13.5   14.0
> > avg_8_4x16_rvv_i32                                 :    3.0    2.5
> > avg_8_4x32_c                                       :   26.2   27.5
> > avg_8_4x32_rvv_i32                                 :    5.7    5.0
> > avg_8_4x64_c                                       :   79.0   66.5
> > avg_8_4x64_rvv_i32                                 :   41.7   34.2
> > avg_8_4x128_c                                      :  154.0  128.7
> > avg_8_4x128_rvv_i32                                :   80.5   74.5
> > avg_8_8x2_c                                        :    3.2    3.2
> > avg_8_8x2_rvv_i32                                  :    1.0    0.7
> > avg_8_8x4_c                                        :    6.5    6.5
> > avg_8_8x4_rvv_i32                                  :    1.2    1.0
> > avg_8_8x8_c                                        :   12.5   13.2
> > avg_8_8x8_rvv_i32                                  :    2.0    1.7
> > avg_8_8x16_c                                       :   25.2   26.5
> > avg_8_8x16_rvv_i32                                 :    3.2    2.7
> > avg_8_8x32_c                                       :   50.0   52.7
> > avg_8_8x32_rvv_i32                                 :    6.2    4.7
> > avg_8_8x64_c                                       :  130.0  112.2
> > avg_8_8x64_rvv_i32                                 :   44.2   33.5
> > avg_8_8x128_c                                      :  241.5  226.7
> > avg_8_8x128_rvv_i32                                :   78.7   74.0
> > avg_8_16x2_c                                       :    6.2    6.5
> > avg_8_16x2_rvv_i32                                 :    1.2    0.7
> > avg_8_16x4_c                                       :   12.2   13.0
> > avg_8_16x4_rvv_i32                                 :    1.7    1.0
> > avg_8_16x8_c                                       :   24.7   25.7
> > avg_8_16x8_rvv_i32                                 :    3.0    1.7
> > avg_8_16x16_c                                      :   49.0   51.5
> > avg_8_16x16_rvv_i32                                :    5.5    3.2
> > avg_8_16x32_c                                      :   97.7  102.7
> > avg_8_16x32_rvv_i32                                :   10.5    5.5
> > avg_8_16x64_c                                      :  219.5  223.5
> > avg_8_16x64_rvv_i32                                :   56.7   34.5
> > avg_8_16x128_c                                     :  409.7  426.0
> > avg_8_16x128_rvv_i32                               :   98.7   73.5
> > avg_8_32x2_c                                       :   12.5   13.0
> > avg_8_32x2_rvv_i32                                 :    1.7    1.0
> > avg_8_32x4_c                                       :   24.2   25.5
> > avg_8_32x4_rvv_i32                                 :    3.0    1.5
> > avg_8_32x8_c                                       :   48.5   50.7
> > avg_8_32x8_rvv_i32                                 :    5.2    2.7
> > avg_8_32x16_c                                      :   96.5  101.2
> > avg_8_32x16_rvv_i32                                :   10.2    5.0
> > avg_8_32x32_c                                      :  192.7  202.5
> > avg_8_32x32_rvv_i32                                :   19.7    9.5
> > avg_8_32x64_c                                      :  433.5  415.5
> > avg_8_32x64_rvv_i32                                :   38.7   18.2
> > avg_8_32x128_c                                     :  812.0  820.7
> > avg_8_32x128_rvv_i32                               :  145.2   73.0
> > avg_8_64x2_c                                       :   24.0   25.2
> > avg_8_64x2_rvv_i32                                 :    2.7    1.5
> > avg_8_64x4_c                                       :   48.0   50.5
> > avg_8_64x4_rvv_i32                                 :    5.2    2.5
> > avg_8_64x8_c                                       :  117.5  100.7
> > avg_8_64x8_rvv_i32                                 :   10.0    4.7
> > avg_8_64x16_c                                      :  208.5  201.0
> > avg_8_64x16_rvv_i32                                :   19.0    9.0
> > avg_8_64x32_c                                      :  382.7  402.0
> > avg_8_64x32_rvv_i32                                :   37.5   17.5
> > avg_8_64x64_c                                      :  830.0  834.2
> > avg_8_64x64_rvv_i32                                :   75.5   34.5
> > avg_8_64x128_c                                     : 2008.0 1705.2
> > avg_8_64x128_rvv_i32                               :  205.5  149.2
> > avg_8_128x2_c                                      :   48.7   51.0
> > avg_8_128x2_rvv_i32                                :    5.2    2.7
> > avg_8_128x4_c                                      :   96.5  101.2
> > avg_8_128x4_rvv_i32                                :   10.2    4.7
> > avg_8_128x8_c                                      :  192.2  202.0
> > avg_8_128x8_rvv_i32                                :   19.7    9.5
> > avg_8_128x16_c                                     :  385.5  403.2
> > avg_8_128x16_rvv_i32                               :   38.7   18.2
> > avg_8_128x32_c                                     :  788.0  805.7
> > avg_8_128x32_rvv_i32                               :   77.0   36.2
> > avg_8_128x64_c                                     : 1597.5 1658.0
> > avg_8_128x64_rvv_i32                               :  175.5   78.7
> > avg_8_128x128_c                                    : 3156.0 3282.5
> > avg_8_128x128_rvv_i32                              :  369.2  276.7
> > w_avg_8_2x2_c                                      :    1.5    1.5
> > w_avg_8_2x2_rvv_i32                                :    1.2    1.0
> > w_avg_8_2x4_c                                      :    2.7    2.5
> > w_avg_8_2x4_rvv_i32                                :    1.7    1.7
> > w_avg_8_2x8_c                                      :    5.0    4.7
> > w_avg_8_2x8_rvv_i32                                :    2.7    2.5
> > w_avg_8_2x16_c                                     :    9.7    9.5
> > w_avg_8_2x16_rvv_i32                               :    4.7    4.5
> > w_avg_8_2x32_c                                     :   18.7   18.5
> > w_avg_8_2x32_rvv_i32                               :    9.0    7.7
> > w_avg_8_2x64_c                                     :   64.0   51.2
> > w_avg_8_2x64_rvv_i32                               :   50.0   38.2
> > w_avg_8_2x128_c                                    :  107.7   94.0
> > w_avg_8_2x128_rvv_i32                              :   86.2   75.7
> > w_avg_8_4x2_c                                      :    2.5    2.5
> > w_avg_8_4x2_rvv_i32                                :    1.2    1.0
> > w_avg_8_4x4_c                                      :    4.7    4.5
> > w_avg_8_4x4_rvv_i32                                :    1.7    1.5
> > w_avg_8_4x8_c                                      :    9.0    9.0
> > w_avg_8_4x8_rvv_i32                                :    2.7    2.5
> > w_avg_8_4x16_c                                     :   17.7   17.5
> > w_avg_8_4x16_rvv_i32                               :    5.0    4.2
> > w_avg_8_4x32_c                                     :   34.7   35.0
> > w_avg_8_4x32_rvv_i32                               :    9.0    8.0
> > w_avg_8_4x64_c                                     :  103.2   82.0
> > w_avg_8_4x64_rvv_i32                               :   45.7   37.5
> > w_avg_8_4x128_c                                    :  210.0  164.5
> > w_avg_8_4x128_rvv_i32                              :   86.2   75.7
> > w_avg_8_8x2_c                                      :    4.5    4.5
> > w_avg_8_8x2_rvv_i32                                :    1.2    1.2
> > w_avg_8_8x4_c                                      :    8.7    8.5
> > w_avg_8_8x4_rvv_i32                                :    1.7    1.5
> > w_avg_8_8x8_c                                      :   17.2   17.2
> > w_avg_8_8x8_rvv_i32                                :    3.2    2.5
> > w_avg_8_8x16_c                                     :   34.0   34.0
> > w_avg_8_8x16_rvv_i32                               :    5.5    4.2
> > w_avg_8_8x32_c                                     :   67.7   67.7
> > w_avg_8_8x32_rvv_i32                               :   10.7    8.0
> > w_avg_8_8x64_c                                     :  174.0  145.5
> > w_avg_8_8x64_rvv_i32                               :   50.0   40.0
> > w_avg_8_8x128_c                                    :  342.2  294.2
> > w_avg_8_8x128_rvv_i32                              :   85.2   75.2
> > w_avg_8_16x2_c                                     :    8.5    8.5
> > w_avg_8_16x2_rvv_i32                               :    2.0    1.0
> > w_avg_8_16x4_c                                     :   16.7   17.0
> > w_avg_8_16x4_rvv_i32                               :    3.2    1.7
> > w_avg_8_16x8_c                                     :   33.2   33.2
> > w_avg_8_16x8_rvv_i32                               :    5.5    3.0
> > w_avg_8_16x16_c                                    :   66.5   66.7
> > w_avg_8_16x16_rvv_i32                              :   28.2    5.0
> > w_avg_8_16x32_c                                    :  134.0  133.5
> > w_avg_8_16x32_rvv_i32                              :   20.0    9.5
> > w_avg_8_16x64_c                                    :  318.2  344.5
> > w_avg_8_16x64_rvv_i32                              :   71.7   41.7
> > w_avg_8_16x128_c                                   :  718.0  583.0
> > w_avg_8_16x128_rvv_i32                             :  117.5   78.2
> > w_avg_8_32x2_c                                     :   16.7   16.7
> > w_avg_8_32x2_rvv_i32                               :    3.7    3.2
> > w_avg_8_32x4_c                                     :   33.2   33.5
> > w_avg_8_32x4_rvv_i32                               :    6.7    6.0
> > w_avg_8_32x8_c                                     :   65.7   66.0
> > w_avg_8_32x8_rvv_i32                               :   12.5   11.0
> > w_avg_8_32x16_c                                    :  132.7  133.5
> > w_avg_8_32x16_rvv_i32                              :   24.0   21.5
> > w_avg_8_32x32_c                                    :  311.5  263.5
> > w_avg_8_32x32_rvv_i32                              :   47.7   42.5
> > w_avg_8_32x64_c                                    :  592.0  555.5
> > w_avg_8_32x64_rvv_i32                              :  126.5   97.7
> > w_avg_8_32x128_c                                   : 1179.0 1139.5
> > w_avg_8_32x128_rvv_i32                             :  238.2  180.7
> > w_avg_8_64x2_c                                     :   32.7   33.0
> > w_avg_8_64x2_rvv_i32                               :    6.0    3.2
> > w_avg_8_64x4_c                                     :   65.7   66.0
> > w_avg_8_64x4_rvv_i32                               :   11.5    5.7
> > w_avg_8_64x8_c                                     :  134.0  132.2
> > w_avg_8_64x8_rvv_i32                               :   22.7   11.0
> > w_avg_8_64x16_c                                    :  281.2  262.5
> > w_avg_8_64x16_rvv_i32                              :   44.2   21.5
> > w_avg_8_64x32_c                                    :  646.2  570.0
> > w_avg_8_64x32_rvv_i32                              :   88.0   42.5
> > w_avg_8_64x64_c                                    : 1203.0 1066.7
> > w_avg_8_64x64_rvv_i32                              :  210.7   90.5
> > w_avg_8_64x128_c                                   : 2688.0 2156.2
> > w_avg_8_64x128_rvv_i32                             :  443.0  214.7
> > w_avg_8_128x2_c                                    :   65.7   66.0
> > w_avg_8_128x2_rvv_i32                              :   11.2    5.5
> > w_avg_8_128x4_c                                    :  131.0  133.0
> > w_avg_8_128x4_rvv_i32                              :   22.0   10.2
> > w_avg_8_128x8_c                                    :  263.5  273.0
> > w_avg_8_128x8_rvv_i32                              :   43.2   20.0
> > w_avg_8_128x16_c                                   :  525.7  528.0
> > w_avg_8_128x16_rvv_i32                             :   85.5   39.2
> > w_avg_8_128x32_c                                   : 1064.5 1211.0
> > w_avg_8_128x32_rvv_i32                             :  170.7   78.5
> > w_avg_8_128x64_c                                   : 2305.5 2350.7
> > w_avg_8_128x64_rvv_i32                             :  400.0  177.5
> > w_avg_8_128x128_c                                  : 4771.7 4992.7
> > w_avg_8_128x128_rvv_i32                            :  757.5  371.5
> > ---
> >  libavcodec/riscv/vvc/Makefile      |   2 +
> >  libavcodec/riscv/vvc/vvc_mc_rvv.S  | 288 +++++++++++++++++++++++++++++
> >  libavcodec/riscv/vvc/vvcdsp_init.c |  72 ++++++++
> >  libavcodec/vvc/dsp.c               |   4 +-
> >  libavcodec/vvc/dsp.h               |   1 +
> >  5 files changed, 366 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vvc/Makefile
> >  create mode 100644 libavcodec/riscv/vvc/vvc_mc_rvv.S
> >  create mode 100644 libavcodec/riscv/vvc/vvcdsp_init.c
> >
> > diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
> > new file mode 100644
> > index 0000000000..582b051579
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
> > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
> > diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> > new file mode 100644
> > index 0000000000..8cf4bcf680
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> > @@ -0,0 +1,288 @@
> > +/*
> > + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 w, vlen, max_lmul=m4
>
> Again, I don't think that a maximum multiplier belongs here. If the
> calling code cannot scale the multiplier up, then it should be a normal
> loop providing the same code for all VLENs.
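>
> For instance, a single strip-mined loop, much like the \w == 128 path
> below, would serve every VLEN with one body (rough sketch, untested;
> register roles as in this patch):
>
> 1:
>         vsetvli           t0, a4, e16, m8, ta, ma
>         sub               a4, a4, t0
>         vle16.v           v0, (a2)
>         vle16.v           v8, (a3)
>         vadd.vv           v8, v8, v0
>         vmax.vx           v8, v8, zero
>         vsetvli           zero, zero, e8, m4, ta, ma
>         vnclipu.wi        v8, v8, 7
>         vse8.v            v8, (a0)
>         sh1add            a2, t0, a2
>         sh1add            a3, t0, a3
>         add               a0, a0, t0
>         bnez              a4, 1b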
>
> > +        .if \w == 2 && \vlen == 128
> > +                vsetivli        zero, \w, e8, mf8, ta, ma
> > +        .elseif \w == 4 && \vlen == 128
> > +                vsetivli        zero, \w, e8, mf4, ta, ma
> > +        .elseif \w == 8 && \vlen == 128
> > +                vsetivli        zero, \w, e8, mf2, ta, ma
> > +        .elseif \w == 16 && \vlen == 128
> > +                vsetivli        zero, \w, e8, m1, ta, ma
> > +        .elseif \w == 32 && \vlen == 128
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e8, m2, ta, ma
> > +        .elseif \w <= 4 && \vlen == 256
> > +                vsetivli        zero, \w, e8, mf8, ta, ma
> > +        .elseif \w == 8 && \vlen == 256
> > +                vsetivli        zero, \w, e8, mf4, ta, ma
> > +        .elseif \w == 16 && \vlen == 256
> > +                vsetivli        zero, \w, e8, mf2, ta, ma
> > +        .elseif \w == 32 && \vlen == 256
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e8, m1, ta, ma
> > +        .elseif \w == 64 && \vlen == 256
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e8, m2, ta, ma
> > +        // (\w >= 64 && \vlen == 128) || (\w == 128 && \vlen == 256)
> > +        .else
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e8, \max_lmul, ta, ma
> > +        .endif
> > +.endm
> > +
> > +.macro vsetvlstatic16 w, vlen, max_lmul=m8
> > +        .if \w == 2 && \vlen == 128
> > +                vsetivli        zero, \w, e16, mf4, ta, ma
> > +        .elseif \w == 4 && \vlen == 128
> > +                vsetivli        zero, \w, e16, mf2, ta, ma
> > +        .elseif \w == 8 && \vlen == 128
> > +                vsetivli        zero, \w, e16, m1, ta, ma
> > +        .elseif \w == 16 && \vlen == 128
> > +                vsetivli        zero, \w, e16, m2, ta, ma
> > +        .elseif \w == 32 && \vlen == 128
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e16, m4, ta, ma
> > +        .elseif \w <= 4 && \vlen == 256
> > +                vsetivli        zero, \w, e16, mf4, ta, ma
> > +        .elseif \w == 8 && \vlen == 256
> > +                vsetivli        zero, \w, e16, mf2, ta, ma
> > +        .elseif \w == 16 && \vlen == 256
> > +                vsetivli        zero, \w, e16, m1, ta, ma
> > +        .elseif \w == 32 && \vlen == 256
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e16, m2, ta, ma
> > +        .elseif \w == 64 && \vlen == 256
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e16, m4, ta, ma
> > +        // (\w >= 64 && \vlen == 128) || (\w == 128 && \vlen == 256)
> > +        .else
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e16, \max_lmul, ta, ma
> > +        .endif
> > +.endm
> > +
> > +.macro vsetvlstatic32 w, vlen
> > +        .if \w == 2
> > +                vsetivli        zero, \w, e32, mf2, ta, ma
> > +        .elseif \w == 4 && \vlen == 128
> > +                vsetivli        zero, \w, e32, m1, ta, ma
> > +        .elseif \w == 8 && \vlen == 128
> > +                vsetivli        zero, \w, e32, m2, ta, ma
> > +        .elseif \w == 16 && \vlen == 128
> > +                vsetivli        zero, \w, e32, m4, ta, ma
> > +        .elseif \w == 4 && \vlen == 256
> > +                vsetivli        zero, \w, e32, mf2, ta, ma
> > +        .elseif \w == 8 && \vlen == 256
> > +                vsetivli        zero, \w, e32, m1, ta, ma
> > +        .elseif \w == 16 && \vlen == 256
> > +                vsetivli        zero, \w, e32, m2, ta, ma
> > +        .elseif \w == 32 && \vlen == 256
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e32, m4, ta, ma
> > +        // (\w >= 32 && \vlen == 128) || (\w >= 64 && \vlen == 256)
> > +        .else
> > +                li              t0, \w
> > +                vsetvli         zero, t0, e32, m8, ta, ma
> > +        .endif
> > +.endm
> > +
> > +.macro avg w, vlen, id
> > +\id\w\vlen:
> > +.if \w < 128
> > +        vsetvlstatic16    \w, \vlen
> > +        addi              t0, a2, 128*2
> > +        addi              t1, a3, 128*2
> > +        add               t2, a0, a1
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v8, (a3)
> > +        addi              a5, a5, -2
> > +        vle16.v           v16, (t0)
> > +        vle16.v           v24, (t1)
> > +        vadd.vv           v8, v8, v0
> > +        vadd.vv           v24, v24, v16
> > +        vmax.vx           v8, v8, zero
> > +        vmax.vx           v24, v24, zero
> > +        vsetvlstatic8     \w, \vlen
> > +        addi              a2, a2, 128*4
> > +        vnclipu.wi        v8, v8, 7
> > +        vnclipu.wi        v24, v24, 7
> > +        addi              a3, a3, 128*4
> > +        vse8.v            v8, (a0)
> > +        vse8.v            v24, (t2)
> > +        sh1add            a0, a1, a0
> > +.else
> > +        addi              a5, a5, -1
> > +        mv                t1, a0
> > +        mv                t2, a2
> > +        mv                t3, a3
> > +        mv                t4, a4
> > +1:
> > +        vsetvli           t0, a4, e16, m8, ta, ma
> > +        sub               a4, a4, t0
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v8, (a3)
> > +        vadd.vv           v8, v8, v0
> > +        vmax.vx           v8, v8, zero
> > +        vsetvli           zero, zero, e8, m4, ta, ma
> > +        vnclipu.wi        v8, v8, 7
> > +        vse8.v            v8, (a0)
> > +        sh1add            a2, t0, a2
> > +        sh1add            a3, t0, a3
> > +        add               a0, a0, t0
> > +        bnez              a4, 1b
> > +        add               a0, t1, a1
> > +        addi              a2, t2, 128*2
> > +        addi              a3, t3, 128*2
> > +        mv                a4, t4
> > +.endif
> > +        bnez              a5, \id\w\vlen\()b
> > +        ret
> > +.endm
> > +
> > +
> > +.macro AVG_JMP_TABLE id, vlen
> > +const jmp_table_\id\vlen
> > +        .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
> > +        .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
> > +        .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
> > +        .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
> > +        .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
> > +        .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
> > +        .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
>
> Maybe use .irp here?
>
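> Something along these lines (untested sketch) would fold the table into
> one entry per width:
>
> .macro AVG_JMP_TABLE id, vlen
> const jmp_table_\id\vlen
>         .irp w,2,4,8,16,32,64,128
>         .4byte \id\()\w\()\vlen\()f - jmp_table_\id\vlen
>         .endr
> endconst
> .endm
>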
> > +endconst
> > +.endm
> > +
> > +.macro AVG_J vlen, id
> > +        clz               t1, a4
> > +        neg               t1, t1
> > +        lla               t5, jmp_table_\id\vlen
> > +        sh2add            t1, t1, t5
> > +        lw                t1, ((__riscv_xlen-2)<<2)(t1)
> > +        add               t1, t1, t5
> > +        jr                t1
> > +.endm
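>
> (For reference: clz(width) = XLEN - 1 - log2(width), so after the neg and
> the fixed ((__riscv_xlen-2)<<2) displacement the load lands at
> jmp_table + 4*(log2(width) - 1), i.e. the table is indexed by log2 of the
> width starting from width 2.)
>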
> > +
> > +.macro func_avg vlen
> > +func ff_vvc_avg_8_rvv_\vlen\(), zve32x
> > +        AVG_JMP_TABLE     1, \vlen
> > +        csrwi             vxrm, 0
> > +        AVG_J             \vlen, 1
> > +        .irp w,2,4,8,16,32,64,128
> > +        avg               \w, \vlen, 1
> > +        .endr
> > +endfunc
> > +.endm
> > +
> > +.macro w_avg w, vlen, id
> > +\id\w\vlen:
> > +.if \w < 32
> > +        vsetvlstatic16    \w, \vlen, m4
> > +        addi              t0, a2, 128*2
> > +        addi              t1, a3, 128*2
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v8, (a3)
> > +        addi              a5, a5, -2
> > +        vle16.v           v20, (t0)
> > +        vle16.v           v24, (t1)
> > +        vwmul.vx          v16, v0, a7
> > +        vwmul.vx          v28, v20, a7
> > +        vwmacc.vx         v16, t3, v8
> > +        vwmacc.vx         v28, t3, v24
> > +        vsetvlstatic32    \w, \vlen
> > +        add               t2, a0, a1
> > +        vadd.vx           v16, v16, t4
> > +        vadd.vx           v28, v28, t4
> > +        vsetvlstatic16    \w, \vlen, m4
> > +        vnsrl.wx          v16, v16, t6
> > +        vnsrl.wx          v28, v28, t6
> > +        vmax.vx           v16, v16, zero
> > +        vmax.vx           v28, v28, zero
> > +        vsetvlstatic8     \w, \vlen, m2
> > +        addi              a2, a2, 128*4
> > +        vnclipu.wi        v16, v16, 0
> > +        vnclipu.wi        v28, v28, 0
> > +        vse8.v            v16, (a0)
> > +        addi              a3, a3, 128*4
> > +        vse8.v            v28, (t2)
> > +        sh1add            a0, a1, a0
> > +.else
> > +        addi              a5, a5, -1
> > +        mv                t1, a0
> > +        mv                t2, a2
> > +        mv                t5, a3
> > +        mv                a6, a4
> > +1:
> > +        vsetvli           t0, a4, e16, m4, ta, ma
> > +        sub               a4, a4, t0
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v8, (a3)
> > +        vwmul.vx          v16, v0, a7
> > +        vwmacc.vx         v16, t3, v8
> > +        vsetvli           zero, zero, e32, m8, ta, ma
> > +        vadd.vx           v16, v16, t4
> > +        vsetvli           zero, zero, e16, m4, ta, ma
> > +        vnsrl.wx          v16, v16, t6
> > +        vmax.vx           v16, v16, zero
> > +        vsetvli           zero, zero, e8, m2, ta, ma
> > +        vnclipu.wi        v16, v16, 0
> > +        vse8.v            v16, (a0)
> > +        sh1add            a2, t0, a2
> > +        sh1add            a3, t0, a3
> > +        add               a0, a0, t0
> > +        bnez              a4, 1b
> > +        add               a0, t1, a1
> > +        addi              a2, t2, 128*2
> > +        addi              a3, t5, 128*2
> > +        mv                a4, a6
> > +.endif
> > +        bnez              a5, \id\w\vlen\()b
> > +        ret
> > +.endm
> > +
> > +
> > +.macro func_w_avg vlen
> > +func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
> > +        AVG_JMP_TABLE     2, \vlen
> > +        csrwi             vxrm, 0
> > +        addi              t6, a6, 7
> > +        ld                t3, (sp)
> > +        ld                t4, 8(sp)
> > +        ld                t5, 16(sp)
>
> Breaks build if XLEN = 32.
>
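> One way out (sketch only; the RV32 offsets assume the ilp32 ABI packs
> these int arguments into 32-bit stack slots):
>
> #if (__riscv_xlen == 64)
>         ld                t3, (sp)
>         ld                t4, 8(sp)
>         ld                t5, 16(sp)
> #else
>         lw                t3, (sp)
>         lw                t4, 4(sp)
>         lw                t5, 8(sp)
> #endif
>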
> > +        addi              t4, t4, 1       // o0 + o1 + 1
> > +        add               t4, t4, t5
> > +        addi              t5, t6, -1      // shift - 1
> > +        sll               t4, t4, t5
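>
> (For reference: with t6 = denom + 7 as the total shift, t4 ends up holding
> ((o0 + o1 + 1) << (shift - 1)), the merged offset-plus-rounding term, so
> the per-pixel work reduces to (src0*w0 + src1*w1 + t4) >> shift before the
> final clamp.)
>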
> > +        AVG_J             \vlen, 2
> > +        .irp w,2,4,8,16,32,64,128
> > +        w_avg             \w, \vlen, 2
> > +        .endr
> > +endfunc
> > +.endm
> > +
> > +func_avg 128
> > +func_avg 256
> > +#if (__riscv_xlen == 64)
> > +func_w_avg 128
> > +func_w_avg 256
> > +#endif
> > diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
> > new file mode 100644
> > index 0000000000..9819a7c570
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> > @@ -0,0 +1,72 @@
> > +/*
> > + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "config.h"
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavutil/riscv/cpu.h"
> > +#include "libavcodec/vvc/dsp.h"
> > +
> > +#define bf(fn, bd,  opt) fn##_##bd##_##opt
> > +
> > +#define AVG_PROTOTYPES(bd, opt)                                                                      \
> > +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                     \
> > +    const int16_t *src0, const int16_t *src1, int width, int height);                                \
> > +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                   \
> > +    const int16_t *src0, const int16_t *src1, int width, int height,                                 \
> > +    int denom, int w0, int w1, int o0, int o1);
> > +
> > +AVG_PROTOTYPES(8, rvv_128)
> > +AVG_PROTOTYPES(8, rvv_256)
> > +
> > +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> > +{
> > +#if HAVE_RVV
> > +    const int flags = av_get_cpu_flags();
> > +    int vlenb = ff_get_rv_vlenb();
> > +
> > +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> > +        vlenb >= 32) {
> > +        switch (bd) {
> > +            case 8:
> > +                c->inter.avg    = ff_vvc_avg_8_rvv_256;
> > +# if (__riscv_xlen == 64)
> > +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
> > +# endif
> > +                break;
> > +            default:
> > +                break;
> > +        }
> > +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> > +               vlenb >= 16) {
> > +        switch (bd) {
> > +            case 8:
> > +                c->inter.avg    = ff_vvc_avg_8_rvv_128;
> > +# if (__riscv_xlen == 64)
> > +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
> > +# endif
> > +                break;
> > +            default:
> > +                break;
> > +        }
> > +    }
> > +#endif
> > +}
> > diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> > index 41e830a98a..c55a37d255 100644
> > --- a/libavcodec/vvc/dsp.c
> > +++ b/libavcodec/vvc/dsp.c
> > @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
> >          break;
> >      }
> >
> > -#if ARCH_X86
> > +#if ARCH_RISCV
> > +    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> > +#elif ARCH_X86
> >      ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> >  #endif
> >  }
> > diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> > index 1f14096c41..e03236dd76 100644
> > --- a/libavcodec/vvc/dsp.h
> > +++ b/libavcodec/vvc/dsp.h
> > @@ -180,6 +180,7 @@ typedef struct VVCDSPContext {
> >
> >  void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
> >
> > +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> >  void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
> >
> >  #endif /* AVCODEC_VVC_DSP_H */
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
Rémi Denis-Courmont July 19, 2024, 3:55 p.m. UTC | #3
On Thursday 18 July 2024 at 18.04.15 EEST, flow gg wrote:
> > Again, I don't think that a maximum multiplier belongs here. If the
> > calling code cannot scale the multiplier up, then it should be a normal
> > loop providing the same code for all VLENs.
> 
> I think it's acceptable to add such a parameter, which isn't particularly
> common in other files, because this vset is used for vvc_mc_rvv.S rather
> than libavutil/riscv/asm.S.

Maybe but that's really not my point. If you use the same LMUL for all VLENBs, 
then you should use the same function, not two copies of the exact same 
function.
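For example (sketch only), if the 128- and 256-bit expansions come out
identical, the body could be assembled once and the second entry point
aliased to it:

        .global ff_vvc_avg_8_rvv_256
        .set    ff_vvc_avg_8_rvv_256, ff_vvc_avg_8_rvv_128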
flow gg July 21, 2024, 1:45 p.m. UTC | #4
Okay, updated it

Rémi Denis-Courmont <remi@remlab.net> wrote on Friday, 19 July 2024 at 23:56:

> On Thursday 18 July 2024 at 18.04.15 EEST, flow gg wrote:
> > > Again, I don't think that a maximum multiplier belongs here. If the
> > > calling code cannot scale the multiplier up, then it should be a normal
> > > loop providing the same code for all VLENs.
> >
> > I think it's acceptable to add such a parameter, which isn't particularly
> > common in other files, because this vset is used for vvc_mc_rvv.S rather
> > than libavutil/riscv/asm.S.
>
> Maybe but that's really not my point. If you use the same LMUL for all
> VLENBs, then you should use the same function, not two copies of the
> exact same function.
>
> Rémi Denis-Courmont
> http://www.remlab.net/
diff mbox series

Patch

diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
new file mode 100644
index 0000000000..582b051579
--- /dev/null
+++ b/libavcodec/riscv/vvc/Makefile
@@ -0,0 +1,2 @@ 
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
new file mode 100644
index 0000000000..8cf4bcf680
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
@@ -0,0 +1,288 @@ 
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w, vlen, max_lmul=m4
+        .if \w == 2 && \vlen == 128
+                vsetivli        zero, \w, e8, mf8, ta, ma
+        .elseif \w == 4 && \vlen == 128
+                vsetivli        zero, \w, e8, mf4, ta, ma
+        .elseif \w == 8 && \vlen == 128
+                vsetivli        zero, \w, e8, mf2, ta, ma
+        .elseif \w == 16 && \vlen == 128
+                vsetivli        zero, \w, e8, m1, ta, ma
+        .elseif \w == 32 && \vlen == 128
+                li              t0, \w
+                vsetvli         zero, t0, e8, m2, ta, ma
+        .elseif \w <= 4 && \vlen == 256
+                vsetivli        zero, \w, e8, mf8, ta, ma
+        .elseif \w == 8 && \vlen == 256
+                vsetivli        zero, \w, e8, mf4, ta, ma
+        .elseif \w == 16 && \vlen == 256
+                vsetivli        zero, \w, e8, mf2, ta, ma
+        .elseif \w == 32 && \vlen == 256
+                li              t0, \w
+                vsetvli         zero, t0, e8, m1, ta, ma
+        .elseif \w == 64 && \vlen == 256
+                li              t0, \w
+                vsetvli         zero, t0, e8, m2, ta, ma
+        // (\w >= 64 && \vlen == 128) || (\w == 128 && \vlen == 256)
+        .else
+                li              t0, \w
+                vsetvli         zero, t0, e8, \max_lmul, ta, ma
+        .endif
+.endm
+
+.macro vsetvlstatic16 w, vlen, max_lmul=m8
+        .if \w == 2 && \vlen == 128
+                vsetivli        zero, \w, e16, mf4, ta, ma
+        .elseif \w == 4 && \vlen == 128
+                vsetivli        zero, \w, e16, mf2, ta, ma
+        .elseif \w == 8 && \vlen == 128
+                vsetivli        zero, \w, e16, m1, ta, ma
+        .elseif \w == 16 && \vlen == 128
+                vsetivli        zero, \w, e16, m2, ta, ma
+        .elseif \w == 32 && \vlen == 128
+                li              t0, \w
+                vsetvli         zero, t0, e16, m4, ta, ma
+        .elseif \w <= 4 && \vlen == 256
+                vsetivli        zero, \w, e16, mf4, ta, ma
+        .elseif \w == 8 && \vlen == 256
+                vsetivli        zero, \w, e16, mf2, ta, ma
+        .elseif \w == 16 && \vlen == 256
+                vsetivli        zero, \w, e16, m1, ta, ma
+        .elseif \w == 32 && \vlen == 256
+                li              t0, \w
+                vsetvli         zero, t0, e16, m2, ta, ma
+        .elseif \w == 64 && \vlen == 256
+                li              t0, \w
+                vsetvli         zero, t0, e16, m4, ta, ma
+        // (\w >= 64 && \vlen == 128) || (\w == 128 && \vlen == 256)
+        .else
+                li              t0, \w
+                vsetvli         zero, t0, e16, \max_lmul, ta, ma
+        .endif
+.endm
+
+.macro vsetvlstatic32 w, vlen
+        .if \w == 2
+                vsetivli        zero, \w, e32, mf2, ta, ma
+        .elseif \w == 4 && \vlen == 128
+                vsetivli        zero, \w, e32, m1, ta, ma
+        .elseif \w == 8 && \vlen == 128
+                vsetivli        zero, \w, e32, m2, ta, ma
+        .elseif \w == 16 && \vlen == 128
+                vsetivli        zero, \w, e32, m4, ta, ma
+        .elseif \w == 4 && \vlen == 256
+                vsetivli        zero, \w, e32, mf2, ta, ma
+        .elseif \w == 8 && \vlen == 256
+                vsetivli        zero, \w, e32, m1, ta, ma
+        .elseif \w == 16 && \vlen == 256
+                vsetivli        zero, \w, e32, m2, ta, ma
+        .elseif \w == 32 && \vlen == 256
+                li              t0, \w
+                vsetvli         zero, t0, e32, m4, ta, ma
+        // (\w >= 32 && \vlen == 128) || (\w >= 64 && \vlen == 256)
+        .else
+                li              t0, \w
+                vsetvli         zero, t0, e32, m8, ta, ma
+        .endif
+.endm
+
+.macro avg w, vlen, id
+\id\w\vlen:
+.if \w < 128
+        vsetvlstatic16    \w, \vlen
+        addi              t0, a2, 128*2
+        addi              t1, a3, 128*2
+        add               t2, a0, a1
+        vle16.v           v0, (a2)
+        vle16.v           v8, (a3)
+        addi              a5, a5, -2
+        vle16.v           v16, (t0)
+        vle16.v           v24, (t1)
+        vadd.vv           v8, v8, v0
+        vadd.vv           v24, v24, v16
+        vmax.vx           v8, v8, zero
+        vmax.vx           v24, v24, zero
+        vsetvlstatic8     \w, \vlen
+        addi              a2, a2, 128*4
+        vnclipu.wi        v8, v8, 7
+        vnclipu.wi        v24, v24, 7
+        addi              a3, a3, 128*4
+        vse8.v            v8, (a0)
+        vse8.v            v24, (t2)
+        sh1add            a0, a1, a0
+.else
+        addi              a5, a5, -1
+        mv                t1, a0
+        mv                t2, a2
+        mv                t3, a3
+        mv                t4, a4
+1:
+        vsetvli           t0, a4, e16, m8, ta, ma
+        sub               a4, a4, t0
+        vle16.v           v0, (a2)
+        vle16.v           v8, (a3)
+        vadd.vv           v8, v8, v0
+        vmax.vx           v8, v8, zero
+        vsetvli           zero, zero, e8, m4, ta, ma
+        vnclipu.wi        v8, v8, 7
+        vse8.v            v8, (a0)
+        sh1add            a2, t0, a2
+        sh1add            a3, t0, a3
+        add               a0, a0, t0
+        bnez              a4, 1b
+        add               a0, t1, a1
+        addi              a2, t2, 128*2
+        addi              a3, t3, 128*2
+        mv                a4, t4
+.endif
+        bnez              a5, \id\w\vlen\()b
+        ret
+.endm
+
+
+.macro AVG_JMP_TABLE id, vlen
+const jmp_table_\id\vlen
+        .4byte \id\()2\vlen\()f - jmp_table_\id\vlen
+        .4byte \id\()4\vlen\()f - jmp_table_\id\vlen
+        .4byte \id\()8\vlen\()f - jmp_table_\id\vlen
+        .4byte \id\()16\vlen\()f - jmp_table_\id\vlen
+        .4byte \id\()32\vlen\()f - jmp_table_\id\vlen
+        .4byte \id\()64\vlen\()f - jmp_table_\id\vlen
+        .4byte \id\()128\vlen\()f - jmp_table_\id\vlen
+endconst
+.endm
+
+.macro AVG_J vlen, id
+        clz               t1, a4
+        neg               t1, t1
+        lla               t5, jmp_table_\id\vlen
+        sh2add            t1, t1, t5
+        lw                t1, ((__riscv_xlen-2)<<2)(t1)
+        add               t1, t1, t5
+        jr                t1
+.endm
+
+.macro func_avg vlen
+func ff_vvc_avg_8_rvv_\vlen\(), zve32x
+        AVG_JMP_TABLE     1, \vlen
+        csrwi             vxrm, 0
+        AVG_J             \vlen, 1
+        .irp w,2,4,8,16,32,64,128
+        avg               \w, \vlen, 1
+        .endr
+endfunc
+.endm
+
+.macro w_avg w, vlen, id
+\id\w\vlen:
+.if \w < 32
+        vsetvlstatic16    \w, \vlen, m4
+        addi              t0, a2, 128*2
+        addi              t1, a3, 128*2
+        vle16.v           v0, (a2)
+        vle16.v           v8, (a3)
+        addi              a5, a5, -2
+        vle16.v           v20, (t0)
+        vle16.v           v24, (t1)
+        vwmul.vx          v16, v0, a7
+        vwmul.vx          v28, v20, a7
+        vwmacc.vx         v16, t3, v8
+        vwmacc.vx         v28, t3, v24
+        vsetvlstatic32    \w, \vlen
+        add               t2, a0, a1
+        vadd.vx           v16, v16, t4
+        vadd.vx           v28, v28, t4
+        vsetvlstatic16    \w, \vlen, m4
+        vnsrl.wx          v16, v16, t6
+        vnsrl.wx          v28, v28, t6
+        vmax.vx           v16, v16, zero
+        vmax.vx           v28, v28, zero
+        vsetvlstatic8     \w, \vlen, m2
+        addi              a2, a2, 128*4
+        vnclipu.wi        v16, v16, 0
+        vnclipu.wi        v28, v28, 0
+        vse8.v            v16, (a0)
+        addi              a3, a3, 128*4
+        vse8.v            v28, (t2)
+        sh1add            a0, a1, a0
+.else
+        addi              a5, a5, -1
+        mv                t1, a0
+        mv                t2, a2
+        mv                t5, a3
+        mv                a6, a4
+1:
+        vsetvli           t0, a4, e16, m4, ta, ma
+        sub               a4, a4, t0
+        vle16.v           v0, (a2)
+        vle16.v           v8, (a3)
+        vwmul.vx          v16, v0, a7
+        vwmacc.vx         v16, t3, v8
+        vsetvli           zero, zero, e32, m8, ta, ma
+        vadd.vx           v16, v16, t4
+        vsetvli           zero, zero, e16, m4, ta, ma
+        vnsrl.wx          v16, v16, t6
+        vmax.vx           v16, v16, zero
+        vsetvli           zero, zero, e8, m2, ta, ma
+        vnclipu.wi        v16, v16, 0
+        vse8.v            v16, (a0)
+        sh1add            a2, t0, a2
+        sh1add            a3, t0, a3
+        add               a0, a0, t0
+        bnez              a4, 1b
+        add               a0, t1, a1
+        addi              a2, t2, 128*2
+        addi              a3, t5, 128*2
+        mv                a4, a6
+.endif
+        bnez              a5, \id\w\vlen\()b
+        ret
+.endm
+
+
+.macro func_w_avg vlen
+func ff_vvc_w_avg_8_rvv_\vlen\(), zve32x
+        AVG_JMP_TABLE     2, \vlen
+        csrwi             vxrm, 0
+        addi              t6, a6, 7
+        ld                t3, (sp)
+        ld                t4, 8(sp)
+        ld                t5, 16(sp)
+        addi              t4, t4, 1       // o0 + o1 + 1
+        add               t4, t4, t5
+        addi              t5, t6, -1      // shift - 1
+        sll               t4, t4, t5
+        AVG_J             \vlen, 2
+        .irp w,2,4,8,16,32,64,128
+        w_avg             \w, \vlen, 2
+        .endr
+endfunc
+.endm
+
+func_avg 128
+func_avg 256
+#if (__riscv_xlen == 64)
+func_w_avg 128
+func_w_avg 256
+#endif
diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..9819a7c570
--- /dev/null
+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
@@ -0,0 +1,72 @@ 
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd,  opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt)                                                                      \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                     \
+    const int16_t *src0, const int16_t *src1, int width, int height);                                \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                   \
+    const int16_t *src0, const int16_t *src1, int width, int height,                                 \
+    int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+    const int flags = av_get_cpu_flags();
+    int vlenb = ff_get_rv_vlenb();
+
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+        vlenb >= 32) {
+        switch (bd) {
+            case 8:
+                c->inter.avg    = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
+# endif
+                break;
+            default:
+                break;
+        }
+    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+               vlenb >= 16) {
+        switch (bd) {
+            case 8:
+                c->inter.avg    = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
+# endif
+                break;
+            default:
+                break;
+        }
+    }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..c55a37d255 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -121,7 +121,9 @@  void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
         break;
     }
 
-#if ARCH_X86
+#if ARCH_RISCV
+    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
+#elif ARCH_X86
     ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
 #endif
 }
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 1f14096c41..e03236dd76 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -180,6 +180,7 @@  typedef struct VVCDSPContext {
 
 void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
 
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
 void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
 
 #endif /* AVCODEC_VVC_DSP_H */