diff mbox series

[FFmpeg-devel,2/2] lavc/flacdsp: optimise RVV vector type for lpc16

Message ID 20240514193557.32759-2-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel,1/2] lavu/riscv: assembler macros for VTYPE fields | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch

Commit Message

Rémi Denis-Courmont May 14, 2024, 7:35 p.m. UTC
This calculates the optimal vector type value at run-time based on the
hardware vector length and the FLAC LPC prediction order. In this
particular case, the additional computation is easily amortised over
the loop iterations:

T-Head C908:       C        V before   V after
flac_lpc_16_13:     14180.2  11229.0     7338.5
flac_lpc_16_16:     16833.2  11091.0     7248.5
flac_lpc_16_29:     28817.2  11455.7    10506.5
flac_lpc_16_32:     31059.7  10368.5    11305.2

With 128-bit vectors, improvements are expected for the first two
test cases only. For the other two, there is some overhead, but it is
below the noise level. Improvements should be more observable with
prediction orders of 8 or less, or on hardware with larger vector sizes.

The same optimisation strategy should be applicable to LPC32
(and work-in-progress LPC33), but is left as a future exercise.
---
 libavcodec/riscv/flacdsp_init.c |  2 +-
 libavcodec/riscv/flacdsp_rvv.S  | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

Comments

Rémi Denis-Courmont May 14, 2024, 7:42 p.m. UTC | #1
Le tiistaina 14. toukokuuta 2024, 22.35.57 EEST Rémi Denis-Courmont a écrit :
> This calculates the optimal vector type value at run-time based on the
> hardware vector length and the FLAC LPC prediction order. In this
> particular case, the additional computation is easily amortised over
> the loop iterations:
> 
> T-Head C908:       C        V before   V after
> flac_lpc_16_13:     14180.2  11229.0     7338.5
> flac_lpc_16_16:     16833.2  11091.0     7248.5
> flac_lpc_16_29:     28817.2  11455.7    10506.5
> flac_lpc_16_32:     31059.7  10368.5    11305.2
> 
> With 128-bit vectors, improvements are expected for the first two
> test cases only. For the other two, there is overhead but below noise.
> Improvements should be better observable with prediction order of 8
> and less, or on hardware with larger vector sizes.
> 
> The same optimisation strategy should be applicable to LPC32
> (and work-in-progress LPC33), but is left as a future exercise.
> ---
>  libavcodec/riscv/flacdsp_init.c |  2 +-
>  libavcodec/riscv/flacdsp_rvv.S  | 10 ++++++++--
>  2 files changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
> index 77ffd09244..097f938f04 100644
> --- a/libavcodec/riscv/flacdsp_init.c
> +++ b/libavcodec/riscv/flacdsp_init.c
> @@ -71,7 +71,7 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
>      if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
>          int vlenb = ff_get_rv_vlenb();
> 
> -        if (vlenb >= 16)
> +        if ((flags & AV_CPU_FLAG_RVB_BASIC) && vlenb >= 16)
>              c->lpc16 = ff_flac_lpc16_rvv;
> 
>          c->wasted32 = ff_flac_wasted32_rvv;
> diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
> index 8b9c626198..42cece9786 100644
> --- a/libavcodec/riscv/flacdsp_rvv.S
> +++ b/libavcodec/riscv/flacdsp_rvv.S
> @@ -20,8 +20,14 @@
> 
>  #include "libavutil/riscv/asm.S"
> 
> -func ff_flac_lpc16_rvv, zve32x
> -        vsetvli zero, a2, e32, m8, ta, ma
> +func ff_flac_lpc16_rvv, zve32x, zbb
> +        csrr    t0, vlenb
> +        addi    t2, a2, -1
> +        clz     t0, t0
> +        clz     t2, t2
> +        addi    t0, t0, VTYPE_E32 | VTYPE_M8 | VTYPE_TA | VTYPE_MA
> +        sub     t0, t0, t2 // t0 += log2(next_power_of_two(len) / vlenb) - 1

OK, so checkasm cannot catch this since we do not test these cases,
but I suspect that this might crash due to an illegal vector configuration if:
- pred_order <= 2 with 128-bit vectors,
- pred_order <= 4 with 256-bit vectors,
- and so on.

This needs a little bit more work.

> +        vsetvl  zero, a2, t0
>          vle32.v v8, (a1)
>          sub     a4, a4, a2
>          vle32.v v16, (a0)
diff mbox series

Patch

diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index 77ffd09244..097f938f04 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -71,7 +71,7 @@  av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
     if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
         int vlenb = ff_get_rv_vlenb();
 
-        if (vlenb >= 16)
+        if ((flags & AV_CPU_FLAG_RVB_BASIC) && vlenb >= 16)
             c->lpc16 = ff_flac_lpc16_rvv;
 
         c->wasted32 = ff_flac_wasted32_rvv;
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index 8b9c626198..42cece9786 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -20,8 +20,14 @@ 
 
 #include "libavutil/riscv/asm.S"
 
-func ff_flac_lpc16_rvv, zve32x
-        vsetvli zero, a2, e32, m8, ta, ma
+func ff_flac_lpc16_rvv, zve32x, zbb
+        csrr    t0, vlenb
+        addi    t2, a2, -1
+        clz     t0, t0
+        clz     t2, t2
+        addi    t0, t0, VTYPE_E32 | VTYPE_M8 | VTYPE_TA | VTYPE_MA
+        sub     t0, t0, t2 // t0 += log2(next_power_of_two(len) / vlenb) - 1
+        vsetvl  zero, a2, t0
         vle32.v v8, (a1)
         sub     a4, a4, a2
         vle32.v v16, (a0)