diff mbox series

[FFmpeg-devel,3/5] lavc/vvc_mc: R-V V put_uni_pixels

Message ID tencent_FDD6731858AF61F23CD1AA2D168D4373C508@qq.com
State New
Headers show
Series None | expand

Commit Message

uk7b@foxmail.com Oct. 28, 2024, 5:08 p.m. UTC
From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                      k230               banana_f3
put_uni_pixels_chroma_8_4x4_c:                         128.3 ( 1.00x)    90.5 ( 1.00x)
put_uni_pixels_chroma_8_4x4_rvv_i32:                    17.6 ( 7.30x)    17.4 ( 5.18x)
put_uni_pixels_chroma_8_8x8_c:                         295.1 ( 1.00x)    163.2 ( 1.00x)
put_uni_pixels_chroma_8_8x8_rvv_i32:                    35.8 ( 8.24x)    27.9 ( 5.84x)
put_uni_pixels_chroma_8_16x16_c:                       619.3 ( 1.00x)    267.4 ( 1.00x)
put_uni_pixels_chroma_8_16x16_rvv_i32:                  72.8 ( 8.50x)    48.7 ( 5.49x)
put_uni_pixels_chroma_8_32x32_c:                      1433.8 ( 1.00x)    538.2 ( 1.00x)
put_uni_pixels_chroma_8_32x32_rvv_i32:                 230.3 ( 6.23x)    236.2 ( 2.28x)
put_uni_pixels_chroma_8_64x64_c:                      3517.3 ( 1.00x)    1455.0 ( 1.00x)
put_uni_pixels_chroma_8_64x64_rvv_i32:                 813.6 ( 4.32x)    590.2 ( 2.47x)
put_uni_pixels_chroma_8_128x128_c:                   10174.6 ( 1.00x)    5798.7 ( 1.00x)
put_uni_pixels_chroma_8_128x128_rvv_i32:              2989.3 ( 3.40x)    2371.4 ( 2.45x)
put_uni_pixels_luma_8_4x4_c:                           128.6 ( 1.00x)    90.5 ( 1.00x)
put_uni_pixels_luma_8_4x4_rvv_i32:                      17.3 ( 7.42x)    17.4 ( 5.18x)
put_uni_pixels_luma_8_8x8_c:                           295.1 ( 1.00x)    142.4 ( 1.00x)
put_uni_pixels_luma_8_8x8_rvv_i32:                      26.6 (11.10x)    27.9 ( 5.10x)
put_uni_pixels_luma_8_16x16_c:                         600.6 ( 1.00x)    277.7 ( 1.00x)
put_uni_pixels_luma_8_16x16_rvv_i32:                    82.1 ( 7.32x)    48.7 ( 5.70x)
put_uni_pixels_luma_8_32x32_c:                        1406.1 ( 1.00x)    528.0 ( 1.00x)
put_uni_pixels_luma_8_32x32_rvv_i32:                   230.3 ( 6.10x)    131.9 ( 4.00x)
put_uni_pixels_luma_8_64x64_c:                        4600.6 ( 1.00x)    1309.2 ( 1.00x)
put_uni_pixels_luma_8_64x64_rvv_i32:                  1073.1 ( 4.29x)    382.2 ( 3.43x)
put_uni_pixels_luma_8_128x128_c:                     11350.3 ( 1.00x)    3506.9 ( 1.00x)
put_uni_pixels_luma_8_128x128_rvv_i32:                3119.1 ( 3.64x)    2017.5 ( 1.74x)
---
 libavcodec/riscv/h26x/h2656_inter_rvv.S | 53 +++++++++++++++++++++++++
 libavcodec/riscv/h26x/h2656dsp.h        | 33 +++++++++++++++
 libavcodec/riscv/vvc/Makefile           |  3 +-
 libavcodec/riscv/vvc/vvcdsp_init.c      |  5 +++
 4 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/h26x/h2656_inter_rvv.S
 create mode 100644 libavcodec/riscv/h26x/h2656dsp.h

Comments

flow gg Oct. 28, 2024, 5:12 p.m. UTC | #1
> Up to 64-bit rows, you can use strided loads and stores here.

Due to the SRC_OFFSET in testing, only e8 and e16 can be loaded; e32 cannot
be loaded (Bus error).
Since the width ranges from 4 to 128, it seems that strided loads may not
be possible.

> Though for memory copying, unaligned scalar accesses might be just as
> fast.

> Or perhaps not if the vectors are not aligned but vectors should not be
> necessary here. This is especially true on the BPi whose memory bus is
> rather slow, so even scalar copy can saturate it.

I agree in theory, but since the test results seem to show some effect,
it would be great if we could improve the testing to confirm it actually
has no effect...

<uk7b@foxmail.com> 于2024年10月29日周二 01:08写道:

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
>                                                       k230
>  banana_f3
> put_uni_pixels_chroma_8_4x4_c:                         128.3 ( 1.00x)
> 90.5 ( 1.00x)
> put_uni_pixels_chroma_8_4x4_rvv_i32:                    17.6 ( 7.30x)
> 17.4 ( 5.18x)
> put_uni_pixels_chroma_8_8x8_c:                         295.1 ( 1.00x)
> 163.2 ( 1.00x)
> put_uni_pixels_chroma_8_8x8_rvv_i32:                    35.8 ( 8.24x)
> 27.9 ( 5.84x)
> put_uni_pixels_chroma_8_16x16_c:                       619.3 ( 1.00x)
> 267.4 ( 1.00x)
> put_uni_pixels_chroma_8_16x16_rvv_i32:                  72.8 ( 8.50x)
> 48.7 ( 5.49x)
> put_uni_pixels_chroma_8_32x32_c:                      1433.8 ( 1.00x)
> 538.2 ( 1.00x)
> put_uni_pixels_chroma_8_32x32_rvv_i32:                 230.3 ( 6.23x)
> 236.2 ( 2.28x)
> put_uni_pixels_chroma_8_64x64_c:                      3517.3 ( 1.00x)
> 1455.0 ( 1.00x)
> put_uni_pixels_chroma_8_64x64_rvv_i32:                 813.6 ( 4.32x)
> 590.2 ( 2.47x)
> put_uni_pixels_chroma_8_128x128_c:                   10174.6 ( 1.00x)
> 5798.7 ( 1.00x)
> put_uni_pixels_chroma_8_128x128_rvv_i32:              2989.3 ( 3.40x)
> 2371.4 ( 2.45x)
> put_uni_pixels_luma_8_4x4_c:                           128.6 ( 1.00x)
> 90.5 ( 1.00x)
> put_uni_pixels_luma_8_4x4_rvv_i32:                      17.3 ( 7.42x)
> 17.4 ( 5.18x)
> put_uni_pixels_luma_8_8x8_c:                           295.1 ( 1.00x)
> 142.4 ( 1.00x)
> put_uni_pixels_luma_8_8x8_rvv_i32:                      26.6 (11.10x)
> 27.9 ( 5.10x)
> put_uni_pixels_luma_8_16x16_c:                         600.6 ( 1.00x)
> 277.7 ( 1.00x)
> put_uni_pixels_luma_8_16x16_rvv_i32:                    82.1 ( 7.32x)
> 48.7 ( 5.70x)
> put_uni_pixels_luma_8_32x32_c:                        1406.1 ( 1.00x)
> 528.0 ( 1.00x)
> put_uni_pixels_luma_8_32x32_rvv_i32:                   230.3 ( 6.10x)
> 131.9 ( 4.00x)
> put_uni_pixels_luma_8_64x64_c:                        4600.6 ( 1.00x)
> 1309.2 ( 1.00x)
> put_uni_pixels_luma_8_64x64_rvv_i32:                  1073.1 ( 4.29x)
> 382.2 ( 3.43x)
> put_uni_pixels_luma_8_128x128_c:                     11350.3 ( 1.00x)
> 3506.9 ( 1.00x)
> put_uni_pixels_luma_8_128x128_rvv_i32:                3119.1 ( 3.64x)
> 2017.5 ( 1.74x)
> ---
>  libavcodec/riscv/h26x/h2656_inter_rvv.S | 53 +++++++++++++++++++++++++
>  libavcodec/riscv/h26x/h2656dsp.h        | 33 +++++++++++++++
>  libavcodec/riscv/vvc/Makefile           |  3 +-
>  libavcodec/riscv/vvc/vvcdsp_init.c      |  5 +++
>  4 files changed, 93 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/h26x/h2656_inter_rvv.S
>  create mode 100644 libavcodec/riscv/h26x/h2656dsp.h
>
> diff --git a/libavcodec/riscv/h26x/h2656_inter_rvv.S
> b/libavcodec/riscv/h26x/h2656_inter_rvv.S
> new file mode 100644
> index 0000000000..6692e33acf
> --- /dev/null
> +++ b/libavcodec/riscv/h26x/h2656_inter_rvv.S
> @@ -0,0 +1,53 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavcodec/riscv/h26x/asm.S"
> +
> +.macro put_uni_pixels w, vlen, id
> +\id\w\vlen:
> +.if \w == 128 && \vlen == 128
> +        li                t0, \w
> +        vsetvli           zero, t0, e8, m8, ta, ma
> +.else
> +        vsetvlstatic8     \w, \vlen
> +.endif
> +1:
> +        vle8.v            v0, (a2)
> +        addi              a4, a4, -1
> +        vse8.v            v0, (a0)
> +        add               a2, a2, a3
> +        add               a0, a0, a1
> +        bnez              a4, 1b
> +        ret
> +.endm
> +
> +.macro func_put_uni_pixels vlen
> +func ff_h2656_put_uni_pixels_8_rvv_\vlen\(), zve32x, zbb, zba
> +        lpad    0
> +        POW2_JMP_TABLE    4, \vlen
> +        POW2_J            \vlen, 4, a7
> +        .irp w,2,4,8,16,32,64,128
> +        put_uni_pixels    \w, \vlen, 4
> +        .endr
> +endfunc
> +.endm
> +
> +func_put_uni_pixels 256
> +func_put_uni_pixels 128
> diff --git a/libavcodec/riscv/h26x/h2656dsp.h
> b/libavcodec/riscv/h26x/h2656dsp.h
> new file mode 100644
> index 0000000000..41ba6bc331
> --- /dev/null
> +++ b/libavcodec/riscv/h26x/h2656dsp.h
> @@ -0,0 +1,33 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_H26X_H2656DSP_H
> +#define AVCODEC_RISCV_H26X_H2656DSP_H
> +
> +#define H2656_PEL_PROTOTYPE(name, D, opt) \
> +void ff_h2656_put_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst,
> ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int
> height, const int8_t *hf, const int8_t *vf, int width)     \
> +
> +#define H2656_MC_8TAP_PROTOTYPES(fname, bitd, opt)    \
> +    H2656_PEL_PROTOTYPE(fname,  bitd, opt);        \
> +
> +H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, rvv_256);
> +H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, rvv_128);
> +
> +#endif
> diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
> index 582b051579..ec116aebc1 100644
> --- a/libavcodec/riscv/vvc/Makefile
> +++ b/libavcodec/riscv/vvc/Makefile
> @@ -1,2 +1,3 @@
>  OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
> -RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o \
> +                                  riscv/h26x/h2656_inter_rvv.o
> diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c
> b/libavcodec/riscv/vvc/vvcdsp_init.c
> index bee892cb7c..9dea70f392 100644
> --- a/libavcodec/riscv/vvc/vvcdsp_init.c
> +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> @@ -25,6 +25,7 @@
>  #include "libavutil/riscv/cpu.h"
>  #include "libavcodec/vvc/dsp.h"
>  #include "libavcodec/vvc/dec.h"
> +#include "libavcodec/riscv/h26x/h2656dsp.h"
>
>  #define bf(fn, bd,  opt) fn##_##bd##_##opt
>
> @@ -72,8 +73,12 @@ PUT_PIXELS_PROTOTYPES2(8, rvv_256)
>              c->inter.dst[C][w][idx1][idx2] = a;
>          \
>      } while (0)
>          \
>
> +#define DIR_FUNCS(d, C, opt)
>         \
> +        PEL_FUNC(put_##d, C, 0, 0, ff_h2656_put_##d##_pixels_8_##opt);
>         \
> +
>  #define FUNCS(C, opt)
>          \
>          PEL_FUNC(put, C, 0, 0, ff_vvc_put_pixels_8_##opt);
>         \
> +        DIR_FUNCS(uni, C, opt);
>          \
>
>  void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
>  {
> --
> 2.47.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont Nov. 9, 2024, 3:33 p.m. UTC | #2
Le maanantaina 28. lokakuuta 2024, 19.08.24 EET uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
>                                                       k230              
> banana_f3 put_uni_pixels_chroma_8_4x4_c:                         128.3 (
> 1.00x)    90.5 ( 1.00x) put_uni_pixels_chroma_8_4x4_rvv_i32:               
>     17.6 ( 7.30x)    17.4 ( 5.18x) put_uni_pixels_chroma_8_8x8_c:          
>               295.1 ( 1.00x)    163.2 ( 1.00x)
> put_uni_pixels_chroma_8_8x8_rvv_i32:                    35.8 ( 8.24x)   
> 27.9 ( 5.84x) put_uni_pixels_chroma_8_16x16_c:                       619.3
> ( 1.00x)    267.4 ( 1.00x) put_uni_pixels_chroma_8_16x16_rvv_i32:          
>        72.8 ( 8.50x)    48.7 ( 5.49x) put_uni_pixels_chroma_8_32x32_c:     
>                 1433.8 ( 1.00x)    538.2 ( 1.00x)
> put_uni_pixels_chroma_8_32x32_rvv_i32:                 230.3 ( 6.23x)   
> 236.2 ( 2.28x) put_uni_pixels_chroma_8_64x64_c:                      3517.3
> ( 1.00x)    1455.0 ( 1.00x) put_uni_pixels_chroma_8_64x64_rvv_i32:         
>        813.6 ( 4.32x)    590.2 ( 2.47x) put_uni_pixels_chroma_8_128x128_c: 
>                  10174.6 ( 1.00x)    5798.7 ( 1.00x)
> put_uni_pixels_chroma_8_128x128_rvv_i32:              2989.3 ( 3.40x)   
> 2371.4 ( 2.45x) put_uni_pixels_luma_8_4x4_c:                          
> 128.6 ( 1.00x)    90.5 ( 1.00x) put_uni_pixels_luma_8_4x4_rvv_i32:         
>             17.3 ( 7.42x)    17.4 ( 5.18x) put_uni_pixels_luma_8_8x8_c:    
>                       295.1 ( 1.00x)    142.4 ( 1.00x)
> put_uni_pixels_luma_8_8x8_rvv_i32:                      26.6 (11.10x)   
> 27.9 ( 5.10x) put_uni_pixels_luma_8_16x16_c:                         600.6
> ( 1.00x)    277.7 ( 1.00x) put_uni_pixels_luma_8_16x16_rvv_i32:            
>        82.1 ( 7.32x)    48.7 ( 5.70x) put_uni_pixels_luma_8_32x32_c:       
>                 1406.1 ( 1.00x)    528.0 ( 1.00x)
> put_uni_pixels_luma_8_32x32_rvv_i32:                   230.3 ( 6.10x)   
> 131.9 ( 4.00x) put_uni_pixels_luma_8_64x64_c:                        4600.6
> ( 1.00x)    1309.2 ( 1.00x) put_uni_pixels_luma_8_64x64_rvv_i32:           
>       1073.1 ( 4.29x)    382.2 ( 3.43x) put_uni_pixels_luma_8_128x128_c:   
>                  11350.3 ( 1.00x)    3506.9 ( 1.00x)
> put_uni_pixels_luma_8_128x128_rvv_i32:                3119.1 ( 3.64x)   
> 2017.5 ( 1.74x) ---
>  libavcodec/riscv/h26x/h2656_inter_rvv.S | 53 +++++++++++++++++++++++++
>  libavcodec/riscv/h26x/h2656dsp.h        | 33 +++++++++++++++
>  libavcodec/riscv/vvc/Makefile           |  3 +-
>  libavcodec/riscv/vvc/vvcdsp_init.c      |  5 +++
>  4 files changed, 93 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/h26x/h2656_inter_rvv.S
>  create mode 100644 libavcodec/riscv/h26x/h2656dsp.h
> 
> diff --git a/libavcodec/riscv/h26x/h2656_inter_rvv.S
> b/libavcodec/riscv/h26x/h2656_inter_rvv.S new file mode 100644
> index 0000000000..6692e33acf
> --- /dev/null
> +++ b/libavcodec/riscv/h26x/h2656_inter_rvv.S
> @@ -0,0 +1,53 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS). + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA + */
> +
> +#include "libavcodec/riscv/h26x/asm.S"
> +
> +.macro put_uni_pixels w, vlen, id
> +\id\w\vlen:
> +.if \w == 128 && \vlen == 128
> +        li                t0, \w
> +        vsetvli           zero, t0, e8, m8, ta, ma
> +.else
> +        vsetvlstatic8     \w, \vlen
> +.endif
> +1:
> +        vle8.v            v0, (a2)
> +        addi              a4, a4, -1
> +        vse8.v            v0, (a0)
> +        add               a2, a2, a3
> +        add               a0, a0, a1
> +        bnez              a4, 1b
> +        ret
> +.endm

Is this going to be reused anywhere? It seems the macro is only used once at the moment.

Also is there a reason to use RVV here instead of just unaligned RVI?

> +
> +.macro func_put_uni_pixels vlen
> +func ff_h2656_put_uni_pixels_8_rvv_\vlen\(), zve32x, zbb, zba
> +        lpad    0
> +        POW2_JMP_TABLE    4, \vlen
> +        POW2_J            \vlen, 4, a7
> +        .irp w,2,4,8,16,32,64,128
> +        put_uni_pixels    \w, \vlen, 4
> +        .endr
> +endfunc
> +.endm
> +
> +func_put_uni_pixels 256
> +func_put_uni_pixels 128
> diff --git a/libavcodec/riscv/h26x/h2656dsp.h
> b/libavcodec/riscv/h26x/h2656dsp.h new file mode 100644
> index 0000000000..41ba6bc331
> --- /dev/null
> +++ b/libavcodec/riscv/h26x/h2656dsp.h
> @@ -0,0 +1,33 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS). + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA + */
> +
> +#ifndef AVCODEC_RISCV_H26X_H2656DSP_H
> +#define AVCODEC_RISCV_H26X_H2656DSP_H
> +
> +#define H2656_PEL_PROTOTYPE(name, D, opt) \
> +void ff_h2656_put_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t
> _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const
> int8_t *hf, const int8_t *vf, int width)     \ +
> +#define H2656_MC_8TAP_PROTOTYPES(fname, bitd, opt)    \
> +    H2656_PEL_PROTOTYPE(fname,  bitd, opt);        \
> +
> +H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, rvv_256);
> +H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, rvv_128);
> +
> +#endif
> diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
> index 582b051579..ec116aebc1 100644
> --- a/libavcodec/riscv/vvc/Makefile
> +++ b/libavcodec/riscv/vvc/Makefile
> @@ -1,2 +1,3 @@
>  OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
> -RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o \
> +                                  riscv/h26x/h2656_inter_rvv.o
> diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c
> b/libavcodec/riscv/vvc/vvcdsp_init.c index bee892cb7c..9dea70f392 100644
> --- a/libavcodec/riscv/vvc/vvcdsp_init.c
> +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> @@ -25,6 +25,7 @@
>  #include "libavutil/riscv/cpu.h"
>  #include "libavcodec/vvc/dsp.h"
>  #include "libavcodec/vvc/dec.h"
> +#include "libavcodec/riscv/h26x/h2656dsp.h"
> 
>  #define bf(fn, bd,  opt) fn##_##bd##_##opt
> 
> @@ -72,8 +73,12 @@ PUT_PIXELS_PROTOTYPES2(8, rvv_256)
>              c->inter.dst[C][w][idx1][idx2] = a;                            
>       \ } while (0)                                                        
>           \
> 
> +#define DIR_FUNCS(d, C, opt)                                               
>       \ +        PEL_FUNC(put_##d, C, 0, 0,
> ff_h2656_put_##d##_pixels_8_##opt);            \ +
>  #define FUNCS(C, opt)                                                      
>       \ PEL_FUNC(put, C, 0, 0, ff_vvc_put_pixels_8_##opt);                 
>       \ +        DIR_FUNCS(uni, C, opt);                                   
>                \
> 
>  void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
>  {
flow gg Nov. 10, 2024, 11:36 a.m. UTC | #3
> Is this going to be reused anywhere? it seems the macro is only used once
> atm.

The next patch will use this macro ([PATCH 4/5] lavc/hevc: R-V V pel_uni(pow2)).

> Also is there a reason to use RVV here instead of just unaligned RVI?

Yes, RVI is enough; I deleted it and resent it.

Rémi Denis-Courmont <remi@remlab.net> 于2024年11月9日周六 23:33写道:

> Le maanantaina 28. lokakuuta 2024, 19.08.24 EET uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> >                                                       k230
> > banana_f3 put_uni_pixels_chroma_8_4x4_c:                         128.3 (
> > 1.00x)    90.5 ( 1.00x) put_uni_pixels_chroma_8_4x4_rvv_i32:
>
> >     17.6 ( 7.30x)    17.4 ( 5.18x) put_uni_pixels_chroma_8_8x8_c:
>
> >               295.1 ( 1.00x)    163.2 ( 1.00x)
> > put_uni_pixels_chroma_8_8x8_rvv_i32:                    35.8 ( 8.24x)
> > 27.9 ( 5.84x) put_uni_pixels_chroma_8_16x16_c:
>  619.3
> > ( 1.00x)    267.4 ( 1.00x) put_uni_pixels_chroma_8_16x16_rvv_i32:
>
> >        72.8 ( 8.50x)    48.7 ( 5.49x) put_uni_pixels_chroma_8_32x32_c:
>
> >                 1433.8 ( 1.00x)    538.2 ( 1.00x)
> > put_uni_pixels_chroma_8_32x32_rvv_i32:                 230.3 ( 6.23x)
> > 236.2 ( 2.28x) put_uni_pixels_chroma_8_64x64_c:
> 3517.3
> > ( 1.00x)    1455.0 ( 1.00x) put_uni_pixels_chroma_8_64x64_rvv_i32:
>
> >        813.6 ( 4.32x)    590.2 ( 2.47x)
> put_uni_pixels_chroma_8_128x128_c:
> >                  10174.6 ( 1.00x)    5798.7 ( 1.00x)
> > put_uni_pixels_chroma_8_128x128_rvv_i32:              2989.3 ( 3.40x)
> > 2371.4 ( 2.45x) put_uni_pixels_luma_8_4x4_c:
> > 128.6 ( 1.00x)    90.5 ( 1.00x) put_uni_pixels_luma_8_4x4_rvv_i32:
>
> >             17.3 ( 7.42x)    17.4 ( 5.18x) put_uni_pixels_luma_8_8x8_c:
>
> >                       295.1 ( 1.00x)    142.4 ( 1.00x)
> > put_uni_pixels_luma_8_8x8_rvv_i32:                      26.6 (11.10x)
> > 27.9 ( 5.10x) put_uni_pixels_luma_8_16x16_c:
>  600.6
> > ( 1.00x)    277.7 ( 1.00x) put_uni_pixels_luma_8_16x16_rvv_i32:
>
> >        82.1 ( 7.32x)    48.7 ( 5.70x) put_uni_pixels_luma_8_32x32_c:
>
> >                 1406.1 ( 1.00x)    528.0 ( 1.00x)
> > put_uni_pixels_luma_8_32x32_rvv_i32:                   230.3 ( 6.10x)
> > 131.9 ( 4.00x) put_uni_pixels_luma_8_64x64_c:
> 4600.6
> > ( 1.00x)    1309.2 ( 1.00x) put_uni_pixels_luma_8_64x64_rvv_i32:
>
> >       1073.1 ( 4.29x)    382.2 ( 3.43x)
> put_uni_pixels_luma_8_128x128_c:
> >                  11350.3 ( 1.00x)    3506.9 ( 1.00x)
> > put_uni_pixels_luma_8_128x128_rvv_i32:                3119.1 ( 3.64x)
> > 2017.5 ( 1.74x) ---
> >  libavcodec/riscv/h26x/h2656_inter_rvv.S | 53 +++++++++++++++++++++++++
> >  libavcodec/riscv/h26x/h2656dsp.h        | 33 +++++++++++++++
> >  libavcodec/riscv/vvc/Makefile           |  3 +-
> >  libavcodec/riscv/vvc/vvcdsp_init.c      |  5 +++
> >  4 files changed, 93 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/h26x/h2656_inter_rvv.S
> >  create mode 100644 libavcodec/riscv/h26x/h2656dsp.h
> >
> > diff --git a/libavcodec/riscv/h26x/h2656_inter_rvv.S
> > b/libavcodec/riscv/h26x/h2656_inter_rvv.S new file mode 100644
> > index 0000000000..6692e33acf
> > --- /dev/null
> > +++ b/libavcodec/riscv/h26x/h2656_inter_rvv.S
> > @@ -0,0 +1,53 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavcodec/riscv/h26x/asm.S"
> > +
> > +.macro put_uni_pixels w, vlen, id
> > +\id\w\vlen:
> > +.if \w == 128 && \vlen == 128
> > +        li                t0, \w
> > +        vsetvli           zero, t0, e8, m8, ta, ma
> > +.else
> > +        vsetvlstatic8     \w, \vlen
> > +.endif
> > +1:
> > +        vle8.v            v0, (a2)
> > +        addi              a4, a4, -1
> > +        vse8.v            v0, (a0)
> > +        add               a2, a2, a3
> > +        add               a0, a0, a1
> > +        bnez              a4, 1b
> > +        ret
> > +.endm
>
> Is this going to be reused anywhere? it seems the macro is only used once
> atm.
>
> Also is there a reason to use RVV here instead of just unaligned RVI?
>
> > +
> > +.macro func_put_uni_pixels vlen
> > +func ff_h2656_put_uni_pixels_8_rvv_\vlen\(), zve32x, zbb, zba
> > +        lpad    0
> > +        POW2_JMP_TABLE    4, \vlen
> > +        POW2_J            \vlen, 4, a7
> > +        .irp w,2,4,8,16,32,64,128
> > +        put_uni_pixels    \w, \vlen, 4
> > +        .endr
> > +endfunc
> > +.endm
> > +
> > +func_put_uni_pixels 256
> > +func_put_uni_pixels 128
> > diff --git a/libavcodec/riscv/h26x/h2656dsp.h
> > b/libavcodec/riscv/h26x/h2656dsp.h new file mode 100644
> > index 0000000000..41ba6bc331
> > --- /dev/null
> > +++ b/libavcodec/riscv/h26x/h2656dsp.h
> > @@ -0,0 +1,33 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#ifndef AVCODEC_RISCV_H26X_H2656DSP_H
> > +#define AVCODEC_RISCV_H26X_H2656DSP_H
> > +
> > +#define H2656_PEL_PROTOTYPE(name, D, opt) \
> > +void ff_h2656_put_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst,
> ptrdiff_t
> > _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const
> > int8_t *hf, const int8_t *vf, int width)     \ +
> > +#define H2656_MC_8TAP_PROTOTYPES(fname, bitd, opt)    \
> > +    H2656_PEL_PROTOTYPE(fname,  bitd, opt);        \
> > +
> > +H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, rvv_256);
> > +H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, rvv_128);
> > +
> > +#endif
> > diff --git a/libavcodec/riscv/vvc/Makefile
> b/libavcodec/riscv/vvc/Makefile
> > index 582b051579..ec116aebc1 100644
> > --- a/libavcodec/riscv/vvc/Makefile
> > +++ b/libavcodec/riscv/vvc/Makefile
> > @@ -1,2 +1,3 @@
> >  OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
> > -RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
> > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o \
> > +                                  riscv/h26x/h2656_inter_rvv.o
> > diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c
> > b/libavcodec/riscv/vvc/vvcdsp_init.c index bee892cb7c..9dea70f392 100644
> > --- a/libavcodec/riscv/vvc/vvcdsp_init.c
> > +++ b/libavcodec/riscv/vvc/vvcdsp_init.c
> > @@ -25,6 +25,7 @@
> >  #include "libavutil/riscv/cpu.h"
> >  #include "libavcodec/vvc/dsp.h"
> >  #include "libavcodec/vvc/dec.h"
> > +#include "libavcodec/riscv/h26x/h2656dsp.h"
> >
> >  #define bf(fn, bd,  opt) fn##_##bd##_##opt
> >
> > @@ -72,8 +73,12 @@ PUT_PIXELS_PROTOTYPES2(8, rvv_256)
> >              c->inter.dst[C][w][idx1][idx2] = a;
>
> >       \ } while (0)
>
> >           \
> >
> > +#define DIR_FUNCS(d, C, opt)
>
> >       \ +        PEL_FUNC(put_##d, C, 0, 0,
> > ff_h2656_put_##d##_pixels_8_##opt);            \ +
> >  #define FUNCS(C, opt)
>
> >       \ PEL_FUNC(put, C, 0, 0, ff_vvc_put_pixels_8_##opt);
>
> >       \ +        DIR_FUNCS(uni, C, opt);
>
> >                \
> >
> >  void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> >  {
>
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
diff mbox series

Patch

diff --git a/libavcodec/riscv/h26x/h2656_inter_rvv.S b/libavcodec/riscv/h26x/h2656_inter_rvv.S
new file mode 100644
index 0000000000..6692e33acf
--- /dev/null
+++ b/libavcodec/riscv/h26x/h2656_inter_rvv.S
@@ -0,0 +1,53 @@ 
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/riscv/h26x/asm.S"
+
+.macro put_uni_pixels w, vlen, id
+\id\w\vlen:
+.if \w == 128 && \vlen == 128
+        li                t0, \w
+        vsetvli           zero, t0, e8, m8, ta, ma
+.else
+        vsetvlstatic8     \w, \vlen
+.endif
+1:
+        vle8.v            v0, (a2)
+        addi              a4, a4, -1
+        vse8.v            v0, (a0)
+        add               a2, a2, a3
+        add               a0, a0, a1
+        bnez              a4, 1b
+        ret
+.endm
+
+.macro func_put_uni_pixels vlen
+func ff_h2656_put_uni_pixels_8_rvv_\vlen\(), zve32x, zbb, zba
+        lpad    0
+        POW2_JMP_TABLE    4, \vlen
+        POW2_J            \vlen, 4, a7
+        .irp w,2,4,8,16,32,64,128
+        put_uni_pixels    \w, \vlen, 4
+        .endr
+endfunc
+.endm
+
+func_put_uni_pixels 256
+func_put_uni_pixels 128
diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
new file mode 100644
index 0000000000..41ba6bc331
--- /dev/null
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -0,0 +1,33 @@ 
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H26X_H2656DSP_H
+#define AVCODEC_RISCV_H26X_H2656DSP_H
+
+#define H2656_PEL_PROTOTYPE(name, D, opt) \
+void ff_h2656_put_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width)     \
+
+#define H2656_MC_8TAP_PROTOTYPES(fname, bitd, opt)    \
+    H2656_PEL_PROTOTYPE(fname,  bitd, opt);        \
+
+H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, rvv_256);
+H2656_MC_8TAP_PROTOTYPES(pixels  ,  8, rvv_128);
+
+#endif
diff --git a/libavcodec/riscv/vvc/Makefile b/libavcodec/riscv/vvc/Makefile
index 582b051579..ec116aebc1 100644
--- a/libavcodec/riscv/vvc/Makefile
+++ b/libavcodec/riscv/vvc/Makefile
@@ -1,2 +1,3 @@ 
 OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvcdsp_init.o
-RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o
+RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc/vvc_mc_rvv.o \
+                                  riscv/h26x/h2656_inter_rvv.o
diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
index bee892cb7c..9dea70f392 100644
--- a/libavcodec/riscv/vvc/vvcdsp_init.c
+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
@@ -25,6 +25,7 @@ 
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/vvc/dsp.h"
 #include "libavcodec/vvc/dec.h"
+#include "libavcodec/riscv/h26x/h2656dsp.h"
 
 #define bf(fn, bd,  opt) fn##_##bd##_##opt
 
@@ -72,8 +73,12 @@  PUT_PIXELS_PROTOTYPES2(8, rvv_256)
             c->inter.dst[C][w][idx1][idx2] = a;                                   \
     } while (0)                                                                   \
 
+#define DIR_FUNCS(d, C, opt)                                                      \
+        PEL_FUNC(put_##d, C, 0, 0, ff_h2656_put_##d##_pixels_8_##opt);            \
+
 #define FUNCS(C, opt)                                                             \
         PEL_FUNC(put, C, 0, 0, ff_vvc_put_pixels_8_##opt);                        \
+        DIR_FUNCS(uni, C, opt);                                                   \
 
 void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
 {