[FFmpeg-devel] lavc/vvc_mc: R-V V avg w_avg

Message ID tencent_F675C9C260C8A998600E0CEB21EC9EE74105@qq.com
State New
Series [FFmpeg-devel] lavc/vvc_mc: R-V V avg w_avg

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

uk7b@foxmail.com May 21, 2024, 7:37 a.m. UTC
From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                      C908   X60
avg_8_2x2_c                                        :    1.0    1.0
avg_8_2x2_rvv_i32                                  :    0.7    0.7
avg_8_2x4_c                                        :    2.0    2.0
avg_8_2x4_rvv_i32                                  :    1.0    0.7
avg_8_2x8_c                                        :    4.0    3.7
avg_8_2x8_rvv_i32                                  :    1.5    1.2
avg_8_2x16_c                                       :    7.5    7.7
avg_8_2x16_rvv_i32                                 :    2.7    2.5
avg_8_2x32_c                                       :   14.2   15.0
avg_8_2x32_rvv_i32                                 :    5.0    4.5
avg_8_2x64_c                                       :   28.5   30.2
avg_8_2x64_rvv_i32                                 :    9.5    8.7
avg_8_2x128_c                                      :   80.0   70.5
avg_8_2x128_rvv_i32                                :   50.7   41.2
avg_8_4x2_c                                        :    1.7    2.0
avg_8_4x2_rvv_i32                                  :    0.7    0.7
avg_8_4x4_c                                        :    3.5    3.7
avg_8_4x4_rvv_i32                                  :    1.2    1.0
avg_8_4x8_c                                        :    6.7    7.0
avg_8_4x8_rvv_i32                                  :    1.5    1.2
avg_8_4x16_c                                       :   13.2   14.0
avg_8_4x16_rvv_i32                                 :    2.7    2.5
avg_8_4x32_c                                       :   26.2   27.7
avg_8_4x32_rvv_i32                                 :    5.0    4.5
avg_8_4x64_c                                       :   52.2   55.0
avg_8_4x64_rvv_i32                                 :    9.5    8.7
avg_8_4x128_c                                      :  146.0  117.5
avg_8_4x128_rvv_i32                                :   53.2   40.5
avg_8_8x2_c                                        :    3.5    3.5
avg_8_8x2_rvv_i32                                  :    0.7    0.7
avg_8_8x4_c                                        :    6.5    6.5
avg_8_8x4_rvv_i32                                  :    1.2    1.0
avg_8_8x8_c                                        :   12.7   13.2
avg_8_8x8_rvv_i32                                  :    2.0    1.5
avg_8_8x16_c                                       :   25.2   26.2
avg_8_8x16_rvv_i32                                 :    3.5    2.5
avg_8_8x32_c                                       :   50.0   52.7
avg_8_8x32_rvv_i32                                 :    6.5    4.7
avg_8_8x64_c                                       :   99.7  105.0
avg_8_8x64_rvv_i32                                 :   12.5    8.5
avg_8_8x128_c                                      :  225.7  218.0
avg_8_8x128_rvv_i32                                :   78.0   39.2
avg_8_16x2_c                                       :    6.2    6.7
avg_8_16x2_rvv_i32                                 :    1.2    0.7
avg_8_16x4_c                                       :   12.2   12.7
avg_8_16x4_rvv_i32                                 :    2.0    1.2
avg_8_16x8_c                                       :   24.7   26.0
avg_8_16x8_rvv_i32                                 :    3.5    1.7
avg_8_16x16_c                                      :   49.0   51.5
avg_8_16x16_rvv_i32                                :    6.2    3.2
avg_8_16x32_c                                      :   97.5  102.5
avg_8_16x32_rvv_i32                                :   11.5    5.7
avg_8_16x64_c                                      :  212.5  204.7
avg_8_16x64_rvv_i32                                :   22.5   11.0
avg_8_16x128_c                                     :  411.2  418.2
avg_8_16x128_rvv_i32                               :   76.0   47.7
avg_8_32x2_c                                       :   12.2   12.7
avg_8_32x2_rvv_i32                                 :    2.0    1.2
avg_8_32x4_c                                       :   24.2   25.5
avg_8_32x4_rvv_i32                                 :    3.2    1.7
avg_8_32x8_c                                       :   48.5   50.7
avg_8_32x8_rvv_i32                                 :    5.7    3.2
avg_8_32x16_c                                      :   96.5  101.2
avg_8_32x16_rvv_i32                                :   10.7    5.7
avg_8_32x32_c                                      :  192.5  202.5
avg_8_32x32_rvv_i32                                :   20.7   10.5
avg_8_32x64_c                                      :  411.2  404.5
avg_8_32x64_rvv_i32                                :   41.0   20.5
avg_8_32x128_c                                     :  834.7  855.2
avg_8_32x128_rvv_i32                               :  151.2  118.7
avg_8_64x2_c                                       :   24.0   25.2
avg_8_64x2_rvv_i32                                 :    3.2    1.7
avg_8_64x4_c                                       :   48.2   50.5
avg_8_64x4_rvv_i32                                 :    5.2    3.0
avg_8_64x8_c                                       :   95.7  100.7
avg_8_64x8_rvv_i32                                 :   10.0    5.2
avg_8_64x16_c                                      :  191.7  201.2
avg_8_64x16_rvv_i32                                :   19.2    9.5
avg_8_64x32_c                                      :  406.2  402.0
avg_8_64x32_rvv_i32                                :   38.0   18.5
avg_8_64x64_c                                      :  827.5  833.7
avg_8_64x64_rvv_i32                                :  148.2   95.2
avg_8_64x128_c                                     : 1607.7 1625.7
avg_8_64x128_rvv_i32                               :  252.0  179.5
avg_8_128x2_c                                      :   48.7   51.0
avg_8_128x2_rvv_i32                                :    5.5    2.7
avg_8_128x4_c                                      :   96.7  101.2
avg_8_128x4_rvv_i32                                :    9.7    5.0
avg_8_128x8_c                                      :  192.5  202.0
avg_8_128x8_rvv_i32                                :   19.0    9.0
avg_8_128x16_c                                     :  403.5  403.2
avg_8_128x16_rvv_i32                               :   37.0   17.5
avg_8_128x32_c                                     :  787.0  805.7
avg_8_128x32_rvv_i32                               :   73.5   34.2
avg_8_128x64_c                                     : 1635.7 1654.7
avg_8_128x64_rvv_i32                               :  229.5   68.5
avg_8_128x128_c                                    : 3217.0 3233.5
avg_8_128x128_rvv_i32                              :  435.0  321.2
w_avg_8_2x2_c                                      :    1.5    1.5
w_avg_8_2x2_rvv_i32                                :    1.2    1.2
w_avg_8_2x4_c                                      :    2.7    2.5
w_avg_8_2x4_rvv_i32                                :    1.7    1.7
w_avg_8_2x8_c                                      :    5.0    4.7
w_avg_8_2x8_rvv_i32                                :    2.7    2.5
w_avg_8_2x16_c                                     :    9.7    9.5
w_avg_8_2x16_rvv_i32                               :    4.7    4.5
w_avg_8_2x32_c                                     :   19.0   18.5
w_avg_8_2x32_rvv_i32                               :    9.0    8.0
w_avg_8_2x64_c                                     :   37.2   37.0
w_avg_8_2x64_rvv_i32                               :   17.5   15.5
w_avg_8_2x128_c                                    :  120.7   82.7
w_avg_8_2x128_rvv_i32                              :   71.2   49.0
w_avg_8_4x2_c                                      :    2.5    2.5
w_avg_8_4x2_rvv_i32                                :    1.2    1.2
w_avg_8_4x4_c                                      :    4.7    4.5
w_avg_8_4x4_rvv_i32                                :    1.7    1.5
w_avg_8_4x8_c                                      :    9.0    9.0
w_avg_8_4x8_rvv_i32                                :    2.7    2.5
w_avg_8_4x16_c                                     :   17.7   17.7
w_avg_8_4x16_rvv_i32                               :    5.0    4.2
w_avg_8_4x32_c                                     :   34.7   34.7
w_avg_8_4x32_rvv_i32                               :    9.0    8.0
w_avg_8_4x64_c                                     :   69.7   69.5
w_avg_8_4x64_rvv_i32                               :   17.2   15.5
w_avg_8_4x128_c                                    :  171.7  154.7
w_avg_8_4x128_rvv_i32                              :   87.0   48.0
w_avg_8_8x2_c                                      :    4.5    4.5
w_avg_8_8x2_rvv_i32                                :    1.5    1.2
w_avg_8_8x4_c                                      :    8.7    8.7
w_avg_8_8x4_rvv_i32                                :    2.0    1.7
w_avg_8_8x8_c                                      :   17.2   17.0
w_avg_8_8x8_rvv_i32                                :    3.5    2.5
w_avg_8_8x16_c                                     :   34.0   34.0
w_avg_8_8x16_rvv_i32                               :    6.0    4.5
w_avg_8_8x32_c                                     :   67.5   68.0
w_avg_8_8x32_rvv_i32                               :   10.7    8.2
w_avg_8_8x64_c                                     :  135.7  135.0
w_avg_8_8x64_rvv_i32                               :   21.0   15.7
w_avg_8_8x128_c                                    :  304.0  280.0
w_avg_8_8x128_rvv_i32                              :   65.5   56.7
w_avg_8_16x2_c                                     :    8.5    8.7
w_avg_8_16x2_rvv_i32                               :    2.0    1.2
w_avg_8_16x4_c                                     :   16.7   17.0
w_avg_8_16x4_rvv_i32                               :    3.2    2.0
w_avg_8_16x8_c                                     :   33.5   33.5
w_avg_8_16x8_rvv_i32                               :    5.7    3.0
w_avg_8_16x16_c                                    :   66.7   62.2
w_avg_8_16x16_rvv_i32                              :   27.0    5.2
w_avg_8_16x32_c                                    :  132.5  133.0
w_avg_8_16x32_rvv_i32                              :   20.2    9.7
w_avg_8_16x64_c                                    :  264.2  239.0
w_avg_8_16x64_rvv_i32                              :   39.7   18.7
w_avg_8_16x128_c                                   :  572.5  541.2
w_avg_8_16x128_rvv_i32                             :  148.5   55.2
w_avg_8_32x2_c                                     :   16.7   16.7
w_avg_8_32x2_rvv_i32                               :    3.2    2.0
w_avg_8_32x4_c                                     :   33.2   33.2
w_avg_8_32x4_rvv_i32                               :    6.0    3.0
w_avg_8_32x8_c                                     :   66.0   66.0
w_avg_8_32x8_rvv_i32                               :   11.0    5.5
w_avg_8_32x16_c                                    :  131.2  122.7
w_avg_8_32x16_rvv_i32                              :   21.5    9.7
w_avg_8_32x32_c                                    :  262.2  268.7
w_avg_8_32x32_rvv_i32                              :   42.2   18.5
w_avg_8_32x64_c                                    :  544.2  547.0
w_avg_8_32x64_rvv_i32                              :   83.5   37.0
w_avg_8_32x128_c                                   : 1426.7 1139.7
w_avg_8_32x128_rvv_i32                             :  201.0  138.2
w_avg_8_64x2_c                                     :   33.0   33.0
w_avg_8_64x2_rvv_i32                               :    6.0    3.0
w_avg_8_64x4_c                                     :   65.7   65.7
w_avg_8_64x4_rvv_i32                               :   11.2    5.5
w_avg_8_64x8_c                                     :  131.0  131.5
w_avg_8_64x8_rvv_i32                               :   21.5   10.0
w_avg_8_64x16_c                                    :  289.2  262.7
w_avg_8_64x16_rvv_i32                              :   42.5   19.2
w_avg_8_64x32_c                                    :  548.7  525.2
w_avg_8_64x32_rvv_i32                              :   83.7   37.5
w_avg_8_64x64_c                                    : 1139.5 1208.2
w_avg_8_64x64_rvv_i32                              :  209.0  107.5
w_avg_8_64x128_c                                   : 2495.5 2300.5
w_avg_8_64x128_rvv_i32                             :  420.2  208.7
w_avg_8_128x2_c                                    :   66.0   66.5
w_avg_8_128x2_rvv_i32                              :   11.2    5.5
w_avg_8_128x4_c                                    :  131.2  132.5
w_avg_8_128x4_rvv_i32                              :   21.5   10.0
w_avg_8_128x8_c                                    :  280.2  275.7
w_avg_8_128x8_rvv_i32                              :   42.2   19.5
w_avg_8_128x16_c                                   :  549.0  527.7
w_avg_8_128x16_rvv_i32                             :  104.7   37.7
w_avg_8_128x32_c                                   : 1215.2 1068.5
w_avg_8_128x32_rvv_i32                             :  189.0   74.7
w_avg_8_128x64_c                                   : 2305.5 2145.5
w_avg_8_128x64_rvv_i32                             :  386.7  190.0
w_avg_8_128x128_c                                  : 5797.0 4600.2
w_avg_8_128x128_rvv_i32                            :  760.5  343.0
---
 libavcodec/riscv/Makefile      |   2 +
 libavcodec/riscv/vvc_mc_rvv.S  | 312 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vvcdsp_init.c |  76 ++++++++
 libavcodec/vvc/dsp.c           |   4 +-
 libavcodec/vvc/dsp.h           |   1 +
 5 files changed, 394 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
 create mode 100644 libavcodec/riscv/vvcdsp_init.c

Comments

flow gg May 21, 2024, 7:38 a.m. UTC | #1
To obtain these test results, the if (w == h) check in tests/checkasm/vvc_mc.c
needs to be commented out. Because vset needs to be issued inside the loop, I
wrote a somewhat cumbersome vset macro by hand.
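
Roughly, the change meant here is to drop the square-block restriction in the
checkasm loop so that every width/height combination above gets benchmarked.
A minimal sketch of that shape (loop bounds and the helper name are
illustrative only; the actual code in tests/checkasm/vvc_mc.c differs):

    /* tests/checkasm/vvc_mc.c -- sketch only, names are illustrative */
    for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) {
        for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) {
            // if (w == h)        /* restriction commented out for benchmarking */
                check_avg(w, h);  /* hypothetical helper wrapping call_ref/call_new/bench_new */
        }
    }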

<uk7b@foxmail.com> wrote on Tue, May 21, 2024 at 15:38:

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> ---
>  libavcodec/riscv/Makefile      |   2 +
>  libavcodec/riscv/vvc_mc_rvv.S  | 312 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vvcdsp_init.c |  76 ++++++++
>  libavcodec/vvc/dsp.c           |   4 +-
>  libavcodec/vvc/dsp.h           |   1 +
>  5 files changed, 394 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
>  create mode 100644 libavcodec/riscv/vvcdsp_init.c
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 27b268ae39..6297664fc9 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..26a6afba1f
> --- /dev/null
> +++ b/libavcodec/riscv/vvc_mc_rvv.S
> @@ -0,0 +1,312 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w vlen is_w
> +        .if \w <= 2
> +                vsetivli        zero, \w, e8, mf8, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +                vsetivli        zero, \w, e8, mf4, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +                vsetivli        zero, \w, e8, mf8, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +                vsetivli        zero, \w, e8, mf2, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +                vsetivli        zero, \w, e8, mf4, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +                vsetivli        zero, \w, e8, m1, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +                vsetivli        zero, \w, e8, mf2, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +                li t0, \w
> +                vsetvli         zero, t0, e8, m1, ta, ma
> +        .elseif \w <= (\vlen / 4) || \is_w
> +                li t0, 64
> +                vsetvli         zero, t0, e8, m2, ta, ma
> +        .else
> +                li t0, \w
> +                vsetvli         zero, t0, e8, m4, ta, ma
> +        .endif
> +.endm
> +
> +.macro vsetvlstatic16 w vlen is_w
> +        .if \w <= 2
> +                vsetivli        zero, \w, e16, mf4, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +                vsetivli        zero, \w, e16, mf2, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +                vsetivli        zero, \w, e16, mf4, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +                vsetivli        zero, \w, e16, m1, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +                vsetivli        zero, \w, e16, mf2, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +                vsetivli        zero, \w, e16, m2, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +                vsetivli        zero, \w, e16, m1, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +                li t0, \w
> +                vsetvli         zero, t0, e16, m2, ta, ma
> +        .elseif \w <= (\vlen / 4) || \is_w
> +                li t0, 64
> +                vsetvli         zero, t0, e16, m4, ta, ma
> +        .else
> +                li t0, \w
> +                vsetvli         zero, t0, e16, m8, ta, ma
> +        .endif
> +.endm
> +
> +.macro vsetvlstatic32 w vlen
> +        .if \w <= 2
> +                vsetivli        zero, \w, e32, mf2, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +                vsetivli        zero, \w, e32, m1, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +                vsetivli        zero, \w, e32, mf2, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +                vsetivli        zero, \w, e32, m2, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +                vsetivli        zero, \w, e32, m1, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +                vsetivli        zero, \w, e32, m4, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +                vsetivli        zero, \w, e32, m2, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +                li t0, \w
> +                vsetvli         zero, t0, e32, m4, ta, ma
> +        .else
> +                li t0, \w
> +                vsetvli         zero, t0, e32, m8, ta, ma
> +        .endif
> +.endm
> +
> +.macro avg_nx1 w vlen
> +        vsetvlstatic16    \w, \vlen, 0
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vadd.vv           v8, v8, v0
> +        vmax.vx           v8, v8, zero
> +        vsetvlstatic8     \w, \vlen, 0
> +        vnclipu.wi        v8, v8, 7
> +        vse8.v            v8, (a0)
> +.endm
> +
> +.macro avg w h vlen
> +        csrw              vxrm, zero
> +
> +.if \w <= (\vlen / 4) && \h >= 4
> +.rept (\h / 4)
> +        vsetvlstatic16    \w, \vlen, 0
> +        addi              t0, a2, 128*2
> +        addi              t1, a3, 128*2
> +        addi              t3, a2, 128*2*2
> +        addi              t4, a3, 128*2*2
> +        addi              a7, a3, 128*2*3
> +        addi              t6, a2, 128*2*3
> +        add               t2, a0, a1
> +        sh1add            t5, a1, a0
> +        add               a6, t5, a1
> +        vle16.v           v0, (a2)
> +        vle16.v           v4, (a3)
> +        vle16.v           v8, (t0)
> +        vle16.v           v12, (t1)
> +        vle16.v           v16, (t3)
> +        vle16.v           v20, (t4)
> +        vle16.v           v24, (t6)
> +        vle16.v           v28, (a7)
> +        vadd.vv           v4, v4, v0
> +        vadd.vv           v12, v12, v8
> +        vadd.vv           v20, v20, v16
> +        vadd.vv           v28, v28, v24
> +        vmax.vx           v4, v4, zero
> +        vmax.vx           v12, v12, zero
> +        vmax.vx           v20, v20, zero
> +        vmax.vx           v28, v28, zero
> +        vsetvlstatic8     \w, \vlen, 0
> +        vnclipu.wi        v4, v4, 7
> +        vnclipu.wi        v12, v12, 7
> +        vnclipu.wi        v20, v20, 7
> +        vnclipu.wi        v28, v28, 7
> +        vse8.v            v4, (a0)
> +        vse8.v            v12, (t2)
> +        vse8.v            v20, (t5)
> +        vse8.v            v28, (a6)
> +        addi              a2, a2, 128*8
> +        addi              a3, a3, 128*8
> +        sh2add            a0, a1, a0
> +.endr
> +
> +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> +.rept (\h / 2)
> +        vsetvlstatic16 \w, \vlen, 0
> +        addi              t0, a2, 128*2
> +        addi              t1, a3, 128*2
> +        add               t2, a0, a1
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vle16.v           v16, (t0)
> +        vle16.v           v24, (t1)
> +        vadd.vv           v8, v8, v0
> +        vadd.vv           v24, v24, v16
> +        vmax.vx           v8, v8, zero
> +        vmax.vx           v24, v24, zero
> +        vsetvlstatic8     \w, \vlen, 0
> +        vnclipu.wi        v8, v8, 7
> +        vnclipu.wi        v24, v24, 7
> +        vse8.v            v8, (a0)
> +        vse8.v            v24, (t2)
> +        addi              a2, a2, 128*4
> +        addi              a3, a3, 128*4
> +        sh1add            a0, a1, a0
> +.endr
> +
> +.else
> +.rept \h
> +        avg_nx1           \w, \vlen
> +        .if \w == 128 && \vlen == 128
> +        addi              a2, a2, 64*2
> +        addi              a3, a3, 64*2
> +        addi              a0, a0, 64
> +        avg_nx1           \w, \vlen
> +        addi              a2, a2, -64*2
> +        addi              a3, a3, -64*2
> +        addi              a0, a0, -64
> +        .endif
> +        addi              a2, a2, 128*2
> +        addi              a3, a3, 128*2
> +        add               a0, a0, a1
> +.endr
> +.endif
> +.endm
> +
> +.macro w_avg_nx1 w vlen
> +        vsetvlstatic16    \w, \vlen, 1
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vwmul.vx          v16, v0, a7
> +        vwmacc.vx         v16, t3, v8
> +        vsetvlstatic32    \w, \vlen
> +        vadd.vx           v16, v16, t4
> +        vsetvlstatic16    \w, \vlen, 1
> +        vnsrl.wx          v16, v16, t6
> +        vmax.vx           v16, v16, zero
> +        vsetvlstatic8     \w, \vlen, 1
> +        vnclipu.wi        v16, v16, 0
> +        vse8.v            v16, (a0)
> +.endm
> +
> +#if (__riscv_xlen == 64)
> +.macro w_avg w h vlen
> +        csrw              vxrm, zero
> +        addi              t6, a6, 7
> +        ld                t3, (sp)
> +        ld                t4, 8(sp)
> +        ld                t5, 16(sp)
> +        add               t4, t4, t5
> +        addi              t4, t4, 1       // o0 + o1 + 1
> +        addi              t5, t6, -1      // shift - 1
> +        sll               t4, t4, t5
> +
> +.if \w <= (\vlen / 8)
> +        .rept (\h / 2)
> +        vsetvlstatic16    \w, \vlen, 1
> +        addi              t0, a2, 128*2
> +        addi              t1, a3, 128*2
> +        add               t2, a0, a1
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vle16.v           v20, (t0)
> +        vle16.v           v24, (t1)
> +        vwmul.vx          v16, v0, a7
> +        vwmul.vx          v28, v20, a7
> +        vwmacc.vx         v16, t3, v8
> +        vwmacc.vx         v28, t3, v24
> +        vsetvlstatic32    \w, \vlen
> +        vadd.vx           v16, v16, t4
> +        vadd.vx           v28, v28, t4
> +        vsetvlstatic16    \w, \vlen, 1
> +        vnsrl.wx          v16, v16, t6
> +        vnsrl.wx          v28, v28, t6
> +        vmax.vx           v16, v16, zero
> +        vmax.vx           v28, v28, zero
> +        vsetvlstatic8     \w, \vlen, 1
> +        vnclipu.wi        v16, v16, 0
> +        vnclipu.wi        v28, v28, 0
> +        vse8.v            v16, (a0)
> +        vse8.v            v28, (t2)
> +        addi              a2, a2, 128*4
> +        addi              a3, a3, 128*4
> +        sh1add            a0, a1, a0
> +        .endr
> +.else
> +        .rept \h
> +        w_avg_nx1         \w, \vlen
> +        .if \w == (\vlen / 2)
> +        addi              a2, a2, (\vlen / 2)
> +        addi              a3, a3, (\vlen / 2)
> +        addi              a0, a0, (\vlen / 4)
> +        w_avg_nx1         \w, \vlen
> +        addi              a2, a2, -(\vlen / 2)
> +        addi              a3, a3, -(\vlen / 2)
> +        addi              a0, a0, -(\vlen / 4)
> +        .elseif \w == 128 && \vlen == 128
> +        .rept 3
> +        addi              a2, a2, (\vlen / 2)
> +        addi              a3, a3, (\vlen / 2)
> +        addi              a0, a0, (\vlen / 4)
> +        w_avg_nx1         \w, \vlen
> +        .endr
> +        addi              a2, a2, -(\vlen / 2) * 3
> +        addi              a3, a3, -(\vlen / 2) * 3
> +        addi              a0, a0, -(\vlen / 4) * 3
> +        .endif
> +
> +        addi              a2, a2, 128*2
> +        addi              a3, a3, 128*2
> +        add               a0, a0, a1
> +        .endr
> +.endif
> +.endm
> +#endif
> +
> +.macro func_avg name vlen
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> +.irp w,2,4,8,16,32,64,128
> +        li                t3, \w
> +        bne               a4, t3, \name\vlen\()end\w
> +.irp h,2,4,8,16,32,64,128
> +        li                t4, \h
> +        bne               a5, t4, \name\vlen\()end\w\h
> +        \name             \w \h \vlen
> +        ret
> +\name\vlen\()end\w\h:
> +.endr
> +\name\vlen\()end\w:
> +.endr
> +endfunc
> +.endm
> +
> +func_avg avg 256
> +func_avg avg 128
> +#if (__riscv_xlen == 64)
> +func_avg w_avg 256
> +func_avg w_avg 128
> +#endif
> diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
> new file mode 100644
> index 0000000000..d26b4c1c4a
> --- /dev/null
> +++ b/libavcodec/riscv/vvcdsp_init.c
> @@ -0,0 +1,76 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd,  opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt)                                                \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,               \
> +    const int16_t *src0, const int16_t *src1, int width, int height);          \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,             \
> +    const int16_t *src0, const int16_t *src1, int width, int height,           \
> +    int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +#define AVG_INIT(bd, opt) do {                                       \
> +    c->inter.avg    = bf(ff_vvc_avg, bd, opt);                       \
> +    c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);                     \
> +} while (0)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> +    const int flags = av_get_cpu_flags();
> +
> +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +        ff_rv_vlen_least(256)) {
> +        switch (bd) {
> +            case 8:
> +                c->inter.avg    = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
> +# endif
> +                break;
> +            default:
> +                break;
> +        }
> +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +               ff_rv_vlen_least(128)) {
> +        switch (bd) {
> +            case 8:
> +                c->inter.avg    = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
> +# endif
> +                break;
> +            default:
> +                break;
> +        }
> +    }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 41e830a98a..c55a37d255 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
>          break;
>      }
>
> -#if ARCH_X86
> +#if ARCH_RISCV
> +    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> +#elif ARCH_X86
>      ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
>  #endif
>  }
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 9810ac314c..dcb978549f 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
>
>  void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
>
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
>  void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
>
>  #endif /* AVCODEC_VVC_DSP_H */
> --
> 2.45.1
>
Rémi Denis-Courmont May 21, 2024, 4:03 p.m. UTC | #2
On Tuesday, May 21, 2024 at 10:37:51 EEST, uk7b@foxmail.com wrote:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> ---
>  libavcodec/riscv/Makefile      |   2 +
>  libavcodec/riscv/vvc_mc_rvv.S  | 312 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vvcdsp_init.c |  76 ++++++++
>  libavcodec/vvc/dsp.c           |   4 +-
>  libavcodec/vvc/dsp.h           |   1 +
>  5 files changed, 394 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
>  create mode 100644 libavcodec/riscv/vvcdsp_init.c
> 
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 27b268ae39..6297664fc9 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..26a6afba1f
> --- /dev/null
> +++ b/libavcodec/riscv/vvc_mc_rvv.S
> @@ -0,0 +1,312 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w vlen is_w
> +        .if \w <= 2
> +                vsetivli        zero, \w, e8, mf8, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +                vsetivli        zero, \w, e8, mf4, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +                vsetivli        zero, \w, e8, mf8, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +                vsetivli        zero, \w, e8, mf2, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +                vsetivli        zero, \w, e8, mf4, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +                vsetivli        zero, \w, e8, m1, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +                vsetivli        zero, \w, e8, mf2, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +                li t0, \w
> +                vsetvli         zero, t0, e8, m1, ta, ma
> +        .elseif \w <= (\vlen / 4) || \is_w
> +                li t0, 64
> +                vsetvli         zero, t0, e8, m2, ta, ma
> +        .else
> +                li t0, \w
> +                vsetvli         zero, t0, e8, m4, ta, ma
> +        .endif
> +.endm
> +
> +.macro vsetvlstatic16 w vlen is_w
> +        .if \w <= 2
> +                vsetivli        zero, \w, e16, mf4, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +                vsetivli        zero, \w, e16, mf2, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +                vsetivli        zero, \w, e16, mf4, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +                vsetivli        zero, \w, e16, m1, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +                vsetivli        zero, \w, e16, mf2, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +                vsetivli        zero, \w, e16, m2, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +                vsetivli        zero, \w, e16, m1, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +                li t0, \w
> +                vsetvli         zero, t0, e16, m2, ta, ma
> +        .elseif \w <= (\vlen / 4) || \is_w
> +                li t0, 64
> +                vsetvli         zero, t0, e16, m4, ta, ma
> +        .else
> +                li t0, \w
> +                vsetvli         zero, t0, e16, m8, ta, ma
> +        .endif
> +.endm
> +
> +.macro vsetvlstatic32 w vlen
> +        .if \w <= 2
> +                vsetivli        zero, \w, e32, mf2, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +                vsetivli        zero, \w, e32, m1, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +                vsetivli        zero, \w, e32, mf2, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +                vsetivli        zero, \w, e32, m2, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +                vsetivli        zero, \w, e32, m1, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +                vsetivli        zero, \w, e32, m4, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +                vsetivli        zero, \w, e32, m2, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +                li t0, \w
> +                vsetvli         zero, t0, e32, m4, ta, ma
> +        .else
> +                li t0, \w
> +                vsetvli         zero, t0, e32, m8, ta, ma
> +        .endif
> +.endm
> +
> +.macro avg_nx1 w vlen
> +        vsetvlstatic16    \w, \vlen, 0
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vadd.vv           v8, v8, v0
> +        vmax.vx           v8, v8, zero
> +        vsetvlstatic8     \w, \vlen, 0
> +        vnclipu.wi        v8, v8, 7
> +        vse8.v            v8, (a0)
> +.endm
> +
> +.macro avg w h vlen
> +        csrw              vxrm, zero
> +
> +.if \w <= (\vlen / 4) && \h >= 4
> +.rept (\h / 4)
> +        vsetvlstatic16    \w, \vlen, 0
> +        addi              t0, a2, 128*2
> +        addi              t1, a3, 128*2
> +        addi              t3, a2, 128*2*2
> +        addi              t4, a3, 128*2*2
> +        addi              a7, a3, 128*2*3
> +        addi              t6, a2, 128*2*3
> +        add               t2, a0, a1
> +        sh1add            t5, a1, a0
> +        add               a6, t5, a1
> +        vle16.v           v0, (a2)
> +        vle16.v           v4, (a3)
> +        vle16.v           v8, (t0)
> +        vle16.v           v12, (t1)
> +        vle16.v           v16, (t3)
> +        vle16.v           v20, (t4)
> +        vle16.v           v24, (t6)
> +        vle16.v           v28, (a7)

I would expect that you can get better performance by interleaving scalar and 
vector stuff, and possibly also vector loads and vector arithmetic.
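
For illustration only, one possible reordering of the instructions already in
the patch that interleaves the scalar address arithmetic with the vector loads
(untested; whether it helps depends on the core's issue width and latencies):

        vsetvlstatic16    \w, \vlen, 0
        vle16.v           v0, (a2)
        addi              t0, a2, 128*2
        vle16.v           v4, (a3)
        addi              t1, a3, 128*2
        vle16.v           v8, (t0)
        addi              t3, a2, 128*2*2
        vle16.v           v12, (t1)
        addi              t4, a3, 128*2*2
        vle16.v           v16, (t3)
        // ... and the vadd.vv/vmax.vx arithmetic can likewise be started
        // while the remaining rows are still being loaded.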

> +        vadd.vv           v4, v4, v0
> +        vadd.vv           v12, v12, v8
> +        vadd.vv           v20, v20, v16
> +        vadd.vv           v28, v28, v24
> +        vmax.vx           v4, v4, zero
> +        vmax.vx           v12, v12, zero
> +        vmax.vx           v20, v20, zero
> +        vmax.vx           v28, v28, zero
> +        vsetvlstatic8     \w, \vlen, 0
> +        vnclipu.wi        v4, v4, 7
> +        vnclipu.wi        v12, v12, 7
> +        vnclipu.wi        v20, v20, 7
> +        vnclipu.wi        v28, v28, 7
> +        vse8.v            v4, (a0)
> +        vse8.v            v12, (t2)
> +        vse8.v            v20, (t5)
> +        vse8.v            v28, (a6)
> +        addi              a2, a2, 128*8
> +        addi              a3, a3, 128*8
> +        sh2add            a0, a1, a0
> +.endr
> +
> +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> +.rept (\h / 2)
> +        vsetvlstatic16 \w, \vlen, 0
> +        addi              t0, a2, 128*2
> +        addi              t1, a3, 128*2
> +        add               t2, a0, a1
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vle16.v           v16, (t0)
> +        vle16.v           v24, (t1)
> +        vadd.vv           v8, v8, v0
> +        vadd.vv           v24, v24, v16
> +        vmax.vx           v8, v8, zero
> +        vmax.vx           v24, v24, zero
> +        vsetvlstatic8     \w, \vlen, 0
> +        vnclipu.wi        v8, v8, 7
> +        vnclipu.wi        v24, v24, 7
> +        vse8.v            v8, (a0)
> +        vse8.v            v24, (t2)
> +        addi              a2, a2, 128*4
> +        addi              a3, a3, 128*4
> +        sh1add            a0, a1, a0
> +.endr
> +
> +.else
> +.rept \h
> +        avg_nx1           \w, \vlen
> +        .if \w == 128 && \vlen == 128
> +        addi              a2, a2, 64*2
> +        addi              a3, a3, 64*2
> +        addi              a0, a0, 64
> +        avg_nx1           \w, \vlen
> +        addi              a2, a2, -64*2
> +        addi              a3, a3, -64*2
> +        addi              a0, a0, -64
> +        .endif
> +        addi              a2, a2, 128*2
> +        addi              a3, a3, 128*2
> +        add               a0, a0, a1
> +.endr
> +.endif
> +.endm
> +
> +.macro w_avg_nx1 w vlen
> +        vsetvlstatic16    \w, \vlen, 1
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vwmul.vx          v16, v0, a7
> +        vwmacc.vx         v16, t3, v8
> +        vsetvlstatic32    \w, \vlen
> +        vadd.vx           v16, v16, t4
> +        vsetvlstatic16    \w, \vlen, 1
> +        vnsrl.wx          v16, v16, t6
> +        vmax.vx           v16, v16, zero
> +        vsetvlstatic8     \w, \vlen, 1
> +        vnclipu.wi        v16, v16, 0
> +        vse8.v            v16, (a0)
> +.endm
> +
> +#if (__riscv_xlen == 64)
> +.macro w_avg w h vlen
> +        csrw              vxrm, zero
> +        addi              t6, a6, 7
> +        ld                t3, (sp)
> +        ld                t4, 8(sp)
> +        ld                t5, 16(sp)
> +        add               t4, t4, t5
> +        addi              t4, t4, 1       // o0 + o1 + 1
> +        addi              t5, t6, -1      // shift - 1
> +        sll               t4, t4, t5
> +
> +.if \w <= (\vlen / 8)
> +        .rept (\h / 2)
> +        vsetvlstatic16    \w, \vlen, 1
> +        addi              t0, a2, 128*2
> +        addi              t1, a3, 128*2
> +        add               t2, a0, a1
> +        vle16.v           v0, (a2)
> +        vle16.v           v8, (a3)
> +        vle16.v           v20, (t0)
> +        vle16.v           v24, (t1)
> +        vwmul.vx          v16, v0, a7
> +        vwmul.vx          v28, v20, a7
> +        vwmacc.vx         v16, t3, v8
> +        vwmacc.vx         v28, t3, v24
> +        vsetvlstatic32    \w, \vlen
> +        vadd.vx           v16, v16, t4
> +        vadd.vx           v28, v28, t4
> +        vsetvlstatic16    \w, \vlen, 1
> +        vnsrl.wx          v16, v16, t6
> +        vnsrl.wx          v28, v28, t6
> +        vmax.vx           v16, v16, zero
> +        vmax.vx           v28, v28, zero
> +        vsetvlstatic8     \w, \vlen, 1
> +        vnclipu.wi        v16, v16, 0
> +        vnclipu.wi        v28, v28, 0
> +        vse8.v            v16, (a0)
> +        vse8.v            v28, (t2)
> +        addi              a2, a2, 128*4
> +        addi              a3, a3, 128*4
> +        sh1add            a0, a1, a0
> +        .endr
> +.else
> +        .rept \h
> +        w_avg_nx1         \w, \vlen
> +        .if \w == (\vlen / 2)
> +        addi              a2, a2, (\vlen / 2)
> +        addi              a3, a3, (\vlen / 2)
> +        addi              a0, a0, (\vlen / 4)
> +        w_avg_nx1         \w, \vlen
> +        addi              a2, a2, -(\vlen / 2)
> +        addi              a3, a3, -(\vlen / 2)
> +        addi              a0, a0, -(\vlen / 4)
> +        .elseif \w == 128 && \vlen == 128
> +        .rept 3
> +        addi              a2, a2, (\vlen / 2)
> +        addi              a3, a3, (\vlen / 2)
> +        addi              a0, a0, (\vlen / 4)
> +        w_avg_nx1         \w, \vlen
> +        .endr
> +        addi              a2, a2, -(\vlen / 2) * 3
> +        addi              a3, a3, -(\vlen / 2) * 3
> +        addi              a0, a0, -(\vlen / 4) * 3
> +        .endif
> +
> +        addi              a2, a2, 128*2
> +        addi              a3, a3, 128*2
> +        add               a0, a0, a1
> +        .endr
> +.endif
> +.endm
> +#endif
> +
> +.macro func_avg name vlen
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> +.irp w,2,4,8,16,32,64,128
> +        li                t3, \w
> +        bne               a4, t3, \name\vlen\()end\w
> +.irp h,2,4,8,16,32,64,128
> +        li                t4, \h
> +        bne               a5, t4, \name\vlen\()end\w\h
> +        \name             \w \h \vlen
> +        ret
> +\name\vlen\()end\w\h:
> +.endr
> +\name\vlen\()end\w:

These labels lead to nowhere? If you actually mean to implicitly fall through 
to the next function, you can use the function name directly rather than add 
odd labels.

> +.endr
> +endfunc
> +.endm
> +
> +func_avg avg 256
> +func_avg avg 128
> +#if (__riscv_xlen == 64)
> +func_avg w_avg 256
> +func_avg w_avg 128
> +#endif
> diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
> new file mode 100644
> index 0000000000..d26b4c1c4a
> --- /dev/null
> +++ b/libavcodec/riscv/vvcdsp_init.c
> @@ -0,0 +1,76 @@
> +/*
> + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd,  opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt)                                                                      \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                     \
> +    const int16_t *src0, const int16_t *src1, int width, int height);                                \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                   \
> +    const int16_t *src0, const int16_t *src1, int width, int height,                                 \
> +    int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +#define AVG_INIT(bd, opt) do {                                       \
> +    c->inter.avg    = bf(ff_vvc_avg, bd, opt);                       \
> +    c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);                     \
> +} while (0)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> +    const int flags = av_get_cpu_flags();
> +
> +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +        ff_rv_vlen_least(256)) {
> +        switch (bd) {
> +            case 8:
> +                c->inter.avg    = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
> +# endif
> +                break;
> +            default:
> +                break;
> +        }
> +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +               ff_rv_vlen_least(128)) {
> +        switch (bd) {
> +            case 8:
> +                c->inter.avg    = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
> +# endif
> +                break;
> +            default:
> +                break;
> +        }
> +    }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 41e830a98a..c55a37d255 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
>          break;
>      }
> 
> -#if ARCH_X86
> +#if ARCH_RISCV
> +    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> +#elif ARCH_X86
>      ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
>  #endif
>  }
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 9810ac314c..dcb978549f 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
> 
>  void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
> 
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
>  void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
> 
>  #endif /* AVCODEC_VVC_DSP_H */
flow gg May 21, 2024, 5:24 p.m. UTC | #3
> I would expect that you can get better performance by interleaving scalar and
> vector stuff, and possibly also vector loads and vector arithmetic.

Okay, I will try.
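
For example, a rough sketch of that kind of scheduling on the avg inner loop
(the ordering is purely illustrative; register usage is taken from the patch):

        vle16.v           v0, (a2)
        addi              t0, a2, 128*2
        vle16.v           v4, (a3)
        addi              t1, a3, 128*2
        vle16.v           v8, (t0)
        add               t2, a0, a1
        vle16.v           v12, (t1)
        vadd.vv           v4, v4, v0
        vadd.vv           v12, v12, v8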

> These labels lead to nowhere? If you actually mean to implicitly fall through
> to the next function, you can use the function name directly rather than add
> odd labels.

These labels are used to turn the variable width/height parameters into
compile-time constants for better performance and to move on to the next .irp
iteration. Some of the names look odd because label names cannot be duplicated.
There is only one function here, so execution should simply continue within
that same function after passing through these labels?
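
For reference, the expansion of the .irp dispatch for one width looks roughly
like this (simplified to a single height check; the real labels are e.g.
avg128end8 and avg128end84):

        li                t3, 8
        bne               a4, t3, avg128end8      # width != 8: try the next width
        li                t4, 4
        bne               a5, t4, avg128end84     # height != 4: try the next height
        # body of the avg macro specialized for w=8, h=4, vlen=128
        ret
avg128end84:
        # checks for the remaining heights of width 8 expand here
avg128end8:
        # checks for the next width expand here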

On Wed, May 22, 2024 at 00:04, Rémi Denis-Courmont <remi@remlab.net> wrote:

> On Tuesday, 21 May 2024 at 10:37:51 EEST, uk7b@foxmail.com wrote:
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> > ---
> >  libavcodec/riscv/Makefile      |   2 +
> >  libavcodec/riscv/vvc_mc_rvv.S  | 312 +++++++++++++++++++++++++++++++++
> >  libavcodec/riscv/vvcdsp_init.c |  76 ++++++++
> >  libavcodec/vvc/dsp.c           |   4 +-
> >  libavcodec/vvc/dsp.h           |   1 +
> >  5 files changed, 394 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
> >  create mode 100644 libavcodec/riscv/vvcdsp_init.c
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 27b268ae39..6297664fc9 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> >  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> >  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> >  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> > diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
> > new file mode 100644
> > index 0000000000..26a6afba1f
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc_mc_rvv.S
> > @@ -0,0 +1,312 @@
> > +/*
> > + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 w vlen is_w
> > +        .if \w <= 2
> > +                vsetivli        zero, \w, e8, mf8, ta, ma
> > +        .elseif \w <= 4 && \vlen == 128
> > +                vsetivli        zero, \w, e8, mf4, ta, ma
> > +        .elseif \w <= 4 && \vlen >= 256
> > +                vsetivli        zero, \w, e8, mf8, ta, ma
> > +        .elseif \w <= 8 && \vlen == 128
> > +                vsetivli        zero, \w, e8, mf2, ta, ma
> > +        .elseif \w <= 8 && \vlen >= 256
> > +                vsetivli        zero, \w, e8, mf4, ta, ma
> > +        .elseif \w <= 16 && \vlen == 128
> > +                vsetivli        zero, \w, e8, m1, ta, ma
> > +        .elseif \w <= 16 && \vlen >= 256
> > +                vsetivli        zero, \w, e8, mf2, ta, ma
> > +        .elseif \w <= 32 && \vlen >= 256
> > +                li t0, \w
> > +                vsetvli         zero, t0, e8, m1, ta, ma
> > +        .elseif \w <= (\vlen / 4) || \is_w
> > +                li t0, 64
> > +                vsetvli         zero, t0, e8, m2, ta, ma
> > +        .else
> > +                li t0, \w
> > +                vsetvli         zero, t0, e8, m4, ta, ma
> > +        .endif
> > +.endm
> > +
> > +.macro vsetvlstatic16 w vlen is_w
> > +        .if \w <= 2
> > +                vsetivli        zero, \w, e16, mf4, ta, ma
> > +        .elseif \w <= 4 && \vlen == 128
> > +                vsetivli        zero, \w, e16, mf2, ta, ma
> > +        .elseif \w <= 4 && \vlen >= 256
> > +                vsetivli        zero, \w, e16, mf4, ta, ma
> > +        .elseif \w <= 8 && \vlen == 128
> > +                vsetivli        zero, \w, e16, m1, ta, ma
> > +        .elseif \w <= 8 && \vlen >= 256
> > +                vsetivli        zero, \w, e16, mf2, ta, ma
> > +        .elseif \w <= 16 && \vlen == 128
> > +                vsetivli        zero, \w, e16, m2, ta, ma
> > +        .elseif \w <= 16 && \vlen >= 256
> > +                vsetivli        zero, \w, e16, m1, ta, ma
> > +        .elseif \w <= 32 && \vlen >= 256
> > +                li t0, \w
> > +                vsetvli         zero, t0, e16, m2, ta, ma
> > +        .elseif \w <= (\vlen / 4) || \is_w
> > +                li t0, 64
> > +                vsetvli         zero, t0, e16, m4, ta, ma
> > +        .else
> > +                li t0, \w
> > +                vsetvli         zero, t0, e16, m8, ta, ma
> > +        .endif
> > +.endm
> > +
> > +.macro vsetvlstatic32 w vlen
> > +        .if \w <= 2
> > +                vsetivli        zero, \w, e32, mf2, ta, ma
> > +        .elseif \w <= 4 && \vlen == 128
> > +                vsetivli        zero, \w, e32, m1, ta, ma
> > +        .elseif \w <= 4 && \vlen >= 256
> > +                vsetivli        zero, \w, e32, mf2, ta, ma
> > +        .elseif \w <= 8 && \vlen == 128
> > +                vsetivli        zero, \w, e32, m2, ta, ma
> > +        .elseif \w <= 8 && \vlen >= 256
> > +                vsetivli        zero, \w, e32, m1, ta, ma
> > +        .elseif \w <= 16 && \vlen == 128
> > +                vsetivli        zero, \w, e32, m4, ta, ma
> > +        .elseif \w <= 16 && \vlen >= 256
> > +                vsetivli        zero, \w, e32, m2, ta, ma
> > +        .elseif \w <= 32 && \vlen >= 256
> > +                li t0, \w
> > +                vsetvli         zero, t0, e32, m4, ta, ma
> > +        .else
> > +                li t0, \w
> > +                vsetvli         zero, t0, e32, m8, ta, ma
> > +        .endif
> > +.endm
> > +
> > +.macro avg_nx1 w vlen
> > +        vsetvlstatic16    \w, \vlen, 0
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v8, (a3)
> > +        vadd.vv           v8, v8, v0
> > +        vmax.vx           v8, v8, zero
> > +        vsetvlstatic8     \w, \vlen, 0
> > +        vnclipu.wi        v8, v8, 7
> > +        vse8.v            v8, (a0)
> > +.endm
> > +
> > +.macro avg w h vlen
> > +        csrw              vxrm, zero
> > +
> > +.if \w <= (\vlen / 4) && \h >= 4
> > +.rept (\h / 4)
> > +        vsetvlstatic16    \w, \vlen, 0
> > +        addi              t0, a2, 128*2
> > +        addi              t1, a3, 128*2
> > +        addi              t3, a2, 128*2*2
> > +        addi              t4, a3, 128*2*2
> > +        addi              a7, a3, 128*2*3
> > +        addi              t6, a2, 128*2*3
> > +        add               t2, a0, a1
> > +        sh1add            t5, a1, a0
> > +        add               a6, t5, a1
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v4, (a3)
> > +        vle16.v           v8, (t0)
> > +        vle16.v           v12, (t1)
> > +        vle16.v           v16, (t3)
> > +        vle16.v           v20, (t4)
> > +        vle16.v           v24, (t6)
> > +        vle16.v           v28, (a7)
>
> I would expect that you can get better performance by interleaving scalar and
> vector stuff, and possibly also vector loads and vector arithmetic.
>
> > +        vadd.vv           v4, v4, v0
> > +        vadd.vv           v12, v12, v8
> > +        vadd.vv           v20, v20, v16
> > +        vadd.vv           v28, v28, v24
> > +        vmax.vx           v4, v4, zero
> > +        vmax.vx           v12, v12, zero
> > +        vmax.vx           v20, v20, zero
> > +        vmax.vx           v28, v28, zero
> > +        vsetvlstatic8     \w, \vlen, 0
> > +        vnclipu.wi        v4, v4, 7
> > +        vnclipu.wi        v12, v12, 7
> > +        vnclipu.wi        v20, v20, 7
> > +        vnclipu.wi        v28, v28, 7
> > +        vse8.v            v4, (a0)
> > +        vse8.v            v12, (t2)
> > +        vse8.v            v20, (t5)
> > +        vse8.v            v28, (a6)
> > +        addi              a2, a2, 128*8
> > +        addi              a3, a3, 128*8
> > +        sh2add            a0, a1, a0
> > +.endr
> > +
> > +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> > +.rept (\h / 2)
> > +        vsetvlstatic16 \w, \vlen, 0
> > +        addi              t0, a2, 128*2
> > +        addi              t1, a3, 128*2
> > +        add               t2, a0, a1
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v8, (a3)
> > +        vle16.v           v16, (t0)
> > +        vle16.v           v24, (t1)
> > +        vadd.vv           v8, v8, v0
> > +        vadd.vv           v24, v24, v16
> > +        vmax.vx           v8, v8, zero
> > +        vmax.vx           v24, v24, zero
> > +        vsetvlstatic8     \w, \vlen, 0
> > +        vnclipu.wi        v8, v8, 7
> > +        vnclipu.wi        v24, v24, 7
> > +        vse8.v            v8, (a0)
> > +        vse8.v            v24, (t2)
> > +        addi              a2, a2, 128*4
> > +        addi              a3, a3, 128*4
> > +        sh1add            a0, a1, a0
> > +.endr
> > +
> > +.else
> > +.rept \h
> > +        avg_nx1           \w, \vlen
> > +        .if \w == 128 && \vlen == 128
> > +        addi              a2, a2, 64*2
> > +        addi              a3, a3, 64*2
> > +        addi              a0, a0, 64
> > +        avg_nx1           \w, \vlen
> > +        addi              a2, a2, -64*2
> > +        addi              a3, a3, -64*2
> > +        addi              a0, a0, -64
> > +        .endif
> > +        addi              a2, a2, 128*2
> > +        addi              a3, a3, 128*2
> > +        add               a0, a0, a1
> > +.endr
> > +.endif
> > +.endm
> > +
> > +.macro w_avg_nx1 w vlen
> > +        vsetvlstatic16    \w, \vlen, 1
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v8, (a3)
> > +        vwmul.vx          v16, v0, a7
> > +        vwmacc.vx         v16, t3, v8
> > +        vsetvlstatic32    \w, \vlen
> > +        vadd.vx           v16, v16, t4
> > +        vsetvlstatic16    \w, \vlen, 1
> > +        vnsrl.wx          v16, v16, t6
> > +        vmax.vx           v16, v16, zero
> > +        vsetvlstatic8     \w, \vlen, 1
> > +        vnclipu.wi        v16, v16, 0
> > +        vse8.v            v16, (a0)
> > +.endm
> > +
> > +#if (__riscv_xlen == 64)
> > +.macro w_avg w h vlen
> > +        csrw              vxrm, zero
> > +        addi              t6, a6, 7
> > +        ld                t3, (sp)
> > +        ld                t4, 8(sp)
> > +        ld                t5, 16(sp)
> > +        add               t4, t4, t5
> > +        addi              t4, t4, 1       // o0 + o1 + 1
> > +        addi              t5, t6, -1      // shift - 1
> > +        sll               t4, t4, t5
> > +
> > +.if \w <= (\vlen / 8)
> > +        .rept (\h / 2)
> > +        vsetvlstatic16    \w, \vlen, 1
> > +        addi              t0, a2, 128*2
> > +        addi              t1, a3, 128*2
> > +        add               t2, a0, a1
> > +        vle16.v           v0, (a2)
> > +        vle16.v           v8, (a3)
> > +        vle16.v           v20, (t0)
> > +        vle16.v           v24, (t1)
> > +        vwmul.vx          v16, v0, a7
> > +        vwmul.vx          v28, v20, a7
> > +        vwmacc.vx         v16, t3, v8
> > +        vwmacc.vx         v28, t3, v24
> > +        vsetvlstatic32    \w, \vlen
> > +        vadd.vx           v16, v16, t4
> > +        vadd.vx           v28, v28, t4
> > +        vsetvlstatic16    \w, \vlen, 1
> > +        vnsrl.wx          v16, v16, t6
> > +        vnsrl.wx          v28, v28, t6
> > +        vmax.vx           v16, v16, zero
> > +        vmax.vx           v28, v28, zero
> > +        vsetvlstatic8     \w, \vlen, 1
> > +        vnclipu.wi        v16, v16, 0
> > +        vnclipu.wi        v28, v28, 0
> > +        vse8.v            v16, (a0)
> > +        vse8.v            v28, (t2)
> > +        addi              a2, a2, 128*4
> > +        addi              a3, a3, 128*4
> > +        sh1add            a0, a1, a0
> > +        .endr
> > +.else
> > +        .rept \h
> > +        w_avg_nx1         \w, \vlen
> > +        .if \w == (\vlen / 2)
> > +        addi              a2, a2, (\vlen / 2)
> > +        addi              a3, a3, (\vlen / 2)
> > +        addi              a0, a0, (\vlen / 4)
> > +        w_avg_nx1         \w, \vlen
> > +        addi              a2, a2, -(\vlen / 2)
> > +        addi              a3, a3, -(\vlen / 2)
> > +        addi              a0, a0, -(\vlen / 4)
> > +        .elseif \w == 128 && \vlen == 128
> > +        .rept 3
> > +        addi              a2, a2, (\vlen / 2)
> > +        addi              a3, a3, (\vlen / 2)
> > +        addi              a0, a0, (\vlen / 4)
> > +        w_avg_nx1         \w, \vlen
> > +        .endr
> > +        addi              a2, a2, -(\vlen / 2) * 3
> > +        addi              a3, a3, -(\vlen / 2) * 3
> > +        addi              a0, a0, -(\vlen / 4) * 3
> > +        .endif
> > +
> > +        addi              a2, a2, 128*2
> > +        addi              a3, a3, 128*2
> > +        add               a0, a0, a1
> > +        .endr
> > +.endif
> > +.endm
> > +#endif
> > +
> > +.macro func_avg name vlen
> > +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> > +.irp w,2,4,8,16,32,64,128
> > +        li                t3, \w
> > +        bne               a4, t3, \name\vlen\()end\w
> > +.irp h,2,4,8,16,32,64,128
> > +        li                t4, \h
> > +        bne               a5, t4, \name\vlen\()end\w\h
> > +        \name             \w \h \vlen
> > +        ret
> > +\name\vlen\()end\w\h:
> > +.endr
> > +\name\vlen\()end\w:
>
> These labels lead to nowhere? If you actually mean to implicitly fall through
> to the next function, you can use the function name directly rather than add
> odd labels.
>
> > +.endr
> > +endfunc
> > +.endm
> > +
> > +func_avg avg 256
> > +func_avg avg 128
> > +#if (__riscv_xlen == 64)
> > +func_avg w_avg 256
> > +func_avg w_avg 128
> > +#endif
> > diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
> > new file mode 100644
> > index 0000000000..d26b4c1c4a
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvcdsp_init.c
> > @@ -0,0 +1,76 @@
> > +/*
> > + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> > + */
> > +
> > +#include "config.h"
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavutil/riscv/cpu.h"
> > +#include "libavcodec/vvc/dsp.h"
> > +
> > +#define bf(fn, bd,  opt) fn##_##bd##_##opt
> > +
> > +#define AVG_PROTOTYPES(bd, opt)                                                                      \
> > +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                     \
> > +    const int16_t *src0, const int16_t *src1, int width, int height);                                \
> > +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                   \
> > +    const int16_t *src0, const int16_t *src1, int width, int height,                                 \
> > +    int denom, int w0, int w1, int o0, int o1);
> > +
> > +AVG_PROTOTYPES(8, rvv_128)
> > +AVG_PROTOTYPES(8, rvv_256)
> > +
> > +#define AVG_INIT(bd, opt) do {                                       \
> > +    c->inter.avg    = bf(ff_vvc_avg, bd, opt);                       \
> > +    c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);                     \
> > +} while (0)
> > +
> > +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> > +{
> > +#if HAVE_RVV
> > +    const int flags = av_get_cpu_flags();
> > +
> > +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> > +        ff_rv_vlen_least(256)) {
> > +        switch (bd) {
> > +            case 8:
> > +                c->inter.avg    = ff_vvc_avg_8_rvv_256;
> > +# if (__riscv_xlen == 64)
> > +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
> > +# endif
> > +                break;
> > +            default:
> > +                break;
> > +        }
> > +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> > +               ff_rv_vlen_least(128)) {
> > +        switch (bd) {
> > +            case 8:
> > +                c->inter.avg    = ff_vvc_avg_8_rvv_128;
> > +# if (__riscv_xlen == 64)
> > +                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
> > +# endif
> > +                break;
> > +            default:
> > +                break;
> > +        }
> > +    }
> > +#endif
> > +}
> > diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> > index 41e830a98a..c55a37d255 100644
> > --- a/libavcodec/vvc/dsp.c
> > +++ b/libavcodec/vvc/dsp.c
> > @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
> >          break;
> >      }
> >
> > -#if ARCH_X86
> > +#if ARCH_RISCV
> > +    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> > +#elif ARCH_X86
> >      ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
> >  #endif
> >  }
> > diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> > index 9810ac314c..dcb978549f 100644
> > --- a/libavcodec/vvc/dsp.h
> > +++ b/libavcodec/vvc/dsp.h
> > @@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
> >
> >  void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
> >
> > +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
> >  void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
> >
> >  #endif /* AVCODEC_VVC_DSP_H */
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>

Patch

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 27b268ae39..6297664fc9 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -68,3 +68,5 @@  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
 RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
+OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
new file mode 100644
index 0000000000..26a6afba1f
--- /dev/null
+++ b/libavcodec/riscv/vvc_mc_rvv.S
@@ -0,0 +1,312 @@ 
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro vsetvlstatic8 w vlen is_w
+        .if \w <= 2
+                vsetivli        zero, \w, e8, mf8, ta, ma
+        .elseif \w <= 4 && \vlen == 128
+                vsetivli        zero, \w, e8, mf4, ta, ma
+        .elseif \w <= 4 && \vlen >= 256
+                vsetivli        zero, \w, e8, mf8, ta, ma
+        .elseif \w <= 8 && \vlen == 128
+                vsetivli        zero, \w, e8, mf2, ta, ma
+        .elseif \w <= 8 && \vlen >= 256
+                vsetivli        zero, \w, e8, mf4, ta, ma
+        .elseif \w <= 16 && \vlen == 128
+                vsetivli        zero, \w, e8, m1, ta, ma
+        .elseif \w <= 16 && \vlen >= 256
+                vsetivli        zero, \w, e8, mf2, ta, ma
+        .elseif \w <= 32 && \vlen >= 256
+                li t0, \w
+                vsetvli         zero, t0, e8, m1, ta, ma
+        .elseif \w <= (\vlen / 4) || \is_w
+                li t0, 64
+                vsetvli         zero, t0, e8, m2, ta, ma
+        .else
+                li t0, \w
+                vsetvli         zero, t0, e8, m4, ta, ma
+        .endif
+.endm
+
+.macro vsetvlstatic16 w vlen is_w
+        .if \w <= 2
+                vsetivli        zero, \w, e16, mf4, ta, ma
+        .elseif \w <= 4 && \vlen == 128
+                vsetivli        zero, \w, e16, mf2, ta, ma
+        .elseif \w <= 4 && \vlen >= 256
+                vsetivli        zero, \w, e16, mf4, ta, ma
+        .elseif \w <= 8 && \vlen == 128
+                vsetivli        zero, \w, e16, m1, ta, ma
+        .elseif \w <= 8 && \vlen >= 256
+                vsetivli        zero, \w, e16, mf2, ta, ma
+        .elseif \w <= 16 && \vlen == 128
+                vsetivli        zero, \w, e16, m2, ta, ma
+        .elseif \w <= 16 && \vlen >= 256
+                vsetivli        zero, \w, e16, m1, ta, ma
+        .elseif \w <= 32 && \vlen >= 256
+                li t0, \w
+                vsetvli         zero, t0, e16, m2, ta, ma
+        .elseif \w <= (\vlen / 4) || \is_w
+                li t0, 64
+                vsetvli         zero, t0, e16, m4, ta, ma
+        .else
+                li t0, \w
+                vsetvli         zero, t0, e16, m8, ta, ma
+        .endif
+.endm
+
+.macro vsetvlstatic32 w vlen
+        .if \w <= 2
+                vsetivli        zero, \w, e32, mf2, ta, ma
+        .elseif \w <= 4 && \vlen == 128
+                vsetivli        zero, \w, e32, m1, ta, ma
+        .elseif \w <= 4 && \vlen >= 256
+                vsetivli        zero, \w, e32, mf2, ta, ma
+        .elseif \w <= 8 && \vlen == 128
+                vsetivli        zero, \w, e32, m2, ta, ma
+        .elseif \w <= 8 && \vlen >= 256
+                vsetivli        zero, \w, e32, m1, ta, ma
+        .elseif \w <= 16 && \vlen == 128
+                vsetivli        zero, \w, e32, m4, ta, ma
+        .elseif \w <= 16 && \vlen >= 256
+                vsetivli        zero, \w, e32, m2, ta, ma
+        .elseif \w <= 32 && \vlen >= 256
+                li t0, \w
+                vsetvli         zero, t0, e32, m4, ta, ma
+        .else
+                li t0, \w
+                vsetvli         zero, t0, e32, m8, ta, ma
+        .endif
+.endm
+
+.macro avg_nx1 w vlen
+        vsetvlstatic16    \w, \vlen, 0
+        vle16.v           v0, (a2)
+        vle16.v           v8, (a3)
+        vadd.vv           v8, v8, v0
+        vmax.vx           v8, v8, zero
+        vsetvlstatic8     \w, \vlen, 0
+        vnclipu.wi        v8, v8, 7
+        vse8.v            v8, (a0)
+.endm
+
+.macro avg w h vlen
+        csrw              vxrm, zero
+
+.if \w <= (\vlen / 4) && \h >= 4
+.rept (\h / 4)
+        vsetvlstatic16    \w, \vlen, 0
+        addi              t0, a2, 128*2
+        addi              t1, a3, 128*2
+        addi              t3, a2, 128*2*2
+        addi              t4, a3, 128*2*2
+        addi              a7, a3, 128*2*3
+        addi              t6, a2, 128*2*3
+        add               t2, a0, a1
+        sh1add            t5, a1, a0
+        add               a6, t5, a1
+        vle16.v           v0, (a2)
+        vle16.v           v4, (a3)
+        vle16.v           v8, (t0)
+        vle16.v           v12, (t1)
+        vle16.v           v16, (t3)
+        vle16.v           v20, (t4)
+        vle16.v           v24, (t6)
+        vle16.v           v28, (a7)
+        vadd.vv           v4, v4, v0
+        vadd.vv           v12, v12, v8
+        vadd.vv           v20, v20, v16
+        vadd.vv           v28, v28, v24
+        vmax.vx           v4, v4, zero
+        vmax.vx           v12, v12, zero
+        vmax.vx           v20, v20, zero
+        vmax.vx           v28, v28, zero
+        vsetvlstatic8     \w, \vlen, 0
+        vnclipu.wi        v4, v4, 7
+        vnclipu.wi        v12, v12, 7
+        vnclipu.wi        v20, v20, 7
+        vnclipu.wi        v28, v28, 7
+        vse8.v            v4, (a0)
+        vse8.v            v12, (t2)
+        vse8.v            v20, (t5)
+        vse8.v            v28, (a6)
+        addi              a2, a2, 128*8
+        addi              a3, a3, 128*8
+        sh2add            a0, a1, a0
+.endr
+
+.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
+.rept (\h / 2)
+        vsetvlstatic16 \w, \vlen, 0
+        addi              t0, a2, 128*2
+        addi              t1, a3, 128*2
+        add               t2, a0, a1
+        vle16.v           v0, (a2)
+        vle16.v           v8, (a3)
+        vle16.v           v16, (t0)
+        vle16.v           v24, (t1)
+        vadd.vv           v8, v8, v0
+        vadd.vv           v24, v24, v16
+        vmax.vx           v8, v8, zero
+        vmax.vx           v24, v24, zero
+        vsetvlstatic8     \w, \vlen, 0
+        vnclipu.wi        v8, v8, 7
+        vnclipu.wi        v24, v24, 7
+        vse8.v            v8, (a0)
+        vse8.v            v24, (t2)
+        addi              a2, a2, 128*4
+        addi              a3, a3, 128*4
+        sh1add            a0, a1, a0
+.endr
+
+.else
+.rept \h
+        avg_nx1           \w, \vlen
+        .if \w == 128 && \vlen == 128
+        addi              a2, a2, 64*2
+        addi              a3, a3, 64*2
+        addi              a0, a0, 64
+        avg_nx1           \w, \vlen
+        addi              a2, a2, -64*2
+        addi              a3, a3, -64*2
+        addi              a0, a0, -64
+        .endif
+        addi              a2, a2, 128*2
+        addi              a3, a3, 128*2
+        add               a0, a0, a1
+.endr
+.endif
+.endm
+
+.macro w_avg_nx1 w vlen
+        vsetvlstatic16    \w, \vlen, 1
+        vle16.v           v0, (a2)
+        vle16.v           v8, (a3)
+        vwmul.vx          v16, v0, a7
+        vwmacc.vx         v16, t3, v8
+        vsetvlstatic32    \w, \vlen
+        vadd.vx           v16, v16, t4
+        vsetvlstatic16    \w, \vlen, 1
+        vnsrl.wx          v16, v16, t6
+        vmax.vx           v16, v16, zero
+        vsetvlstatic8     \w, \vlen, 1
+        vnclipu.wi        v16, v16, 0
+        vse8.v            v16, (a0)
+.endm
+
+#if (__riscv_xlen == 64)
+.macro w_avg w h vlen
+        csrw              vxrm, zero
+        addi              t6, a6, 7
+        ld                t3, (sp)
+        ld                t4, 8(sp)
+        ld                t5, 16(sp)
+        add               t4, t4, t5
+        addi              t4, t4, 1       // o0 + o1 + 1
+        addi              t5, t6, -1      // shift - 1
+        sll               t4, t4, t5
+
+.if \w <= (\vlen / 8)
+        .rept (\h / 2)
+        vsetvlstatic16    \w, \vlen, 1
+        addi              t0, a2, 128*2
+        addi              t1, a3, 128*2
+        add               t2, a0, a1
+        vle16.v           v0, (a2)
+        vle16.v           v8, (a3)
+        vle16.v           v20, (t0)
+        vle16.v           v24, (t1)
+        vwmul.vx          v16, v0, a7
+        vwmul.vx          v28, v20, a7
+        vwmacc.vx         v16, t3, v8
+        vwmacc.vx         v28, t3, v24
+        vsetvlstatic32    \w, \vlen
+        vadd.vx           v16, v16, t4
+        vadd.vx           v28, v28, t4
+        vsetvlstatic16    \w, \vlen, 1
+        vnsrl.wx          v16, v16, t6
+        vnsrl.wx          v28, v28, t6
+        vmax.vx           v16, v16, zero
+        vmax.vx           v28, v28, zero
+        vsetvlstatic8     \w, \vlen, 1
+        vnclipu.wi        v16, v16, 0
+        vnclipu.wi        v28, v28, 0
+        vse8.v            v16, (a0)
+        vse8.v            v28, (t2)
+        addi              a2, a2, 128*4
+        addi              a3, a3, 128*4
+        sh1add            a0, a1, a0
+        .endr
+.else
+        .rept \h
+        w_avg_nx1         \w, \vlen
+        .if \w == (\vlen / 2)
+        addi              a2, a2, (\vlen / 2)
+        addi              a3, a3, (\vlen / 2)
+        addi              a0, a0, (\vlen / 4)
+        w_avg_nx1         \w, \vlen
+        addi              a2, a2, -(\vlen / 2)
+        addi              a3, a3, -(\vlen / 2)
+        addi              a0, a0, -(\vlen / 4)
+        .elseif \w == 128 && \vlen == 128
+        .rept 3
+        addi              a2, a2, (\vlen / 2)
+        addi              a3, a3, (\vlen / 2)
+        addi              a0, a0, (\vlen / 4)
+        w_avg_nx1         \w, \vlen
+        .endr
+        addi              a2, a2, -(\vlen / 2) * 3
+        addi              a3, a3, -(\vlen / 2) * 3
+        addi              a0, a0, -(\vlen / 4) * 3
+        .endif
+
+        addi              a2, a2, 128*2
+        addi              a3, a3, 128*2
+        add               a0, a0, a1
+        .endr
+.endif
+.endm
+#endif
+
+.macro func_avg name vlen
+func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
+.irp w,2,4,8,16,32,64,128
+        li                t3, \w
+        bne               a4, t3, \name\vlen\()end\w
+.irp h,2,4,8,16,32,64,128
+        li                t4, \h
+        bne               a5, t4, \name\vlen\()end\w\h
+        \name             \w \h \vlen
+        ret
+\name\vlen\()end\w\h:
+.endr
+\name\vlen\()end\w:
+.endr
+endfunc
+.endm
+
+func_avg avg 256
+func_avg avg 128
+#if (__riscv_xlen == 64)
+func_avg w_avg 256
+func_avg w_avg 128
+#endif
diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
new file mode 100644
index 0000000000..d26b4c1c4a
--- /dev/null
+++ b/libavcodec/riscv/vvcdsp_init.c
@@ -0,0 +1,76 @@ 
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vvc/dsp.h"
+
+#define bf(fn, bd,  opt) fn##_##bd##_##opt
+
+#define AVG_PROTOTYPES(bd, opt)                                                                      \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                     \
+    const int16_t *src0, const int16_t *src1, int width, int height);                                \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                                   \
+    const int16_t *src0, const int16_t *src1, int width, int height,                                 \
+    int denom, int w0, int w1, int o0, int o1);
+
+AVG_PROTOTYPES(8, rvv_128)
+AVG_PROTOTYPES(8, rvv_256)
+
+#define AVG_INIT(bd, opt) do {                                       \
+    c->inter.avg    = bf(ff_vvc_avg, bd, opt);                       \
+    c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);                     \
+} while (0)
+
+void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
+{
+#if HAVE_RVV
+    const int flags = av_get_cpu_flags();
+
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+        ff_rv_vlen_least(256)) {
+        switch (bd) {
+            case 8:
+                c->inter.avg    = ff_vvc_avg_8_rvv_256;
+# if (__riscv_xlen == 64)
+                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
+# endif
+                break;
+            default:
+                break;
+        }
+    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
+               ff_rv_vlen_least(128)) {
+        switch (bd) {
+            case 8:
+                c->inter.avg    = ff_vvc_avg_8_rvv_128;
+# if (__riscv_xlen == 64)
+                c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
+# endif
+                break;
+            default:
+                break;
+        }
+    }
+#endif
+}
diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
index 41e830a98a..c55a37d255 100644
--- a/libavcodec/vvc/dsp.c
+++ b/libavcodec/vvc/dsp.c
@@ -121,7 +121,9 @@  void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
         break;
     }
 
-#if ARCH_X86
+#if ARCH_RISCV
+    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
+#elif ARCH_X86
     ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
 #endif
 }
diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
index 9810ac314c..dcb978549f 100644
--- a/libavcodec/vvc/dsp.h
+++ b/libavcodec/vvc/dsp.h
@@ -167,6 +167,7 @@  typedef struct VVCDSPContext {
 
 void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
 
+void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
 void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
 
 #endif /* AVCODEC_VVC_DSP_H */