[FFmpeg-devel,2/2] lavc/lpc: R-V V compute_autocorr

Message ID	20231212210240.19886-2-remi@remlab.net
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= <remi@remlab.net> To: ffmpeg-devel@ffmpeg.org Date: Tue, 12 Dec 2023 23:02:40 +0200 Message-ID: <20231212210240.19886-2-remi@remlab.net> In-Reply-To: <20231212210240.19886-1-remi@remlab.net> References: <20231212210240.19886-1-remi@remlab.net> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,1/2] checkasm/lpc: test compute_autocorr \| expand [FFmpeg-devel,1/2] checkasm/lpc: test compute_autocorr [FFmpeg-devel,2/2] lavc/lpc: R-V V compute_autocorr

Message ID

20231212210240.19886-2-remi@remlab.net

State

New

Headers

Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= <remi@remlab.net>
To: ffmpeg-devel@ffmpeg.org
Date: Tue, 12 Dec 2023 23:02:40 +0200
Message-ID: <20231212210240.19886-2-remi@remlab.net>
In-Reply-To: <20231212210240.19886-1-remi@remlab.net>
References: <20231212210240.19886-1-remi@remlab.net>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH 2/2] lavc/lpc: R-V V compute_autocorr
Precedence: list
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Series

[FFmpeg-devel,1/2] checkasm/lpc: test compute_autocorr | expand

Checks

Context	Check	Description
yinshiyou/make_loongarch64	success	Make finished
yinshiyou/make_fate_loongarch64	success	Make fate finished
andriy/make_x86	success	Make finished
andriy/make_fate_x86	fail	Make fate failed

Context

Check

Description

yinshiyou/make_loongarch64

success

Make finished

yinshiyou/make_fate_loongarch64

success

Make fate finished

andriy/make_x86

success

Make finished

andriy/make_fate_x86

fail

Make fate failed

Commit Message

Rémi Denis-Courmont Dec. 12, 2023, 9:02 p.m. UTC

The loop iterates over the length of the data vector, not over the order.
This is to avoid reloading the same data for each lag value. However, this
means the loop only works if the maximum order is no larger than VLENB.

The loop is roughly equivalent to:

    for (size_t j = 0; j < lag; j++)
        autoc[j] = 1.;

    while (len > lag) {
        for (ptrdiff_t j = 0; j < lag; j++)
            autoc[j] += data[j] * *data;
        data++;
        len--;
    }

    while (len > 0) {
        for (ptrdiff_t j = 0; j < len; j++)
            autoc[j] += data[j] * *data;
        data++;
        len--;
    }

Since register pressure is only at 50%, it should be possible to implement
the same loop for orders up to 2xVLENB, but this is left for future work.

Performance numbers are all over the place, with speedups ranging from
~1.25x to ~4x, but at least they are always noticeably better than nothing.
---
 libavcodec/riscv/lpc_init.c |  8 +++++++-
 libavcodec/riscv/lpc_rvv.S  | 29 +++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

Comments

Rémi Denis-Courmont Dec. 12, 2023, 9:05 p.m. UTC | #1

Le tiistaina 12. joulukuuta 2023, 23.02.40 EET Rémi Denis-Courmont a écrit :
> The loop iterates over the length of the vector, not the order. This is
> to avoid reloading the same data for each lag value. However this means
> the loop only works if the maximum order is no larger than VLENB.
> 
> The loop is roughly equivalent to:
> 
>     for (size_t j = 0; j < lag; j++)
>         autoc[j] = 1.;
> 
>     while (len > lag) {
>         for (ptrdiff_t j = 0; j < lag; j++)
>             autoc[j] += data[j] * *data;
>         data++;
>         len--;
>     }
> 
>     while (len > 0) {
>         for (ptrdiff_t j = 0; j < len; j++)
>             autoc[j] += data[j] * *data;
>         data++;
>         len--;
>     }
> 
> Since register pressure is only at 50%, it should be possible to implement
> the same loop for order up to 2xVLENB. But this is left for future work.
> 
> Performance numbers are all over the place from ~1.25x to ~4x speedups,
> but at least they are always noticeably better than nothing.
> ---
>  libavcodec/riscv/lpc_init.c |  8 +++++++-
>  libavcodec/riscv/lpc_rvv.S  | 29 +++++++++++++++++++++++++++++
>  2 files changed, 36 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c
> index c16e5745f0..ab91956f2d 100644
> --- a/libavcodec/riscv/lpc_init.c
> +++ b/libavcodec/riscv/lpc_init.c
> @@ -22,16 +22,22 @@
> 
>  #include "libavutil/attributes.h"
>  #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
>  #include "libavcodec/lpc.h"
> 
>  void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *);
> +void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *);
> 
>  av_cold void ff_lpc_init_riscv(LPCContext *c)
>  {
>  #if HAVE_RVV && (__riscv_xlen >= 64)
>      int flags = av_get_cpu_flags();
> 
> -    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR))
> +    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
>          c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv;
> +
> +        if (ff_get_rv_vlenb() >= c->max_order)
> +            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_rvv;
> +    }
>  #endif
>  }
> diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S
> index f81a2392c1..654156bf12 100644
> --- a/libavcodec/riscv/lpc_rvv.S
> +++ b/libavcodec/riscv/lpc_rvv.S
> @@ -85,4 +85,33 @@ func ff_lpc_apply_welch_window_rvv, zve64d
> 
>          ret
>  endfunc
> +
> +func ff_lpc_compute_autocorr_rvv, zve64d
> +        li        t0, 1
> +        vsetvli   t1, a2, e64, m8, ta, ma

t1 is unused; the destination register of this vsetvli should be zero
instead. This is a leftover from an incomplete attempt to unroll.

> +        fcvt.d.l  ft0, t0
> +        vle64.v   v0, (a0)
> +        sh3add    a0, a2, a0   # data += lag
> +        vfmv.v.f  v16, ft0
> +        bge       a2, a1, 2f
> +1:
> +        vfmv.f.s  ft0, v0
> +        fld       ft1, (a0)    # ft1 = data[lag + i]
> +        vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j]
> +        addi      a1, a1, -1
> +        vfslide1down.vf v0, v0, ft1
> +        addi      a0, a0, 8
> +        bgt       a1, a2, 1b   # while (len > lag);
> +2:
> +        vfmv.f.s  ft0, v0
> +        vsetvli   zero, a1, e64, m8, tu, ma
> +        vfmacc.vf v16, ft0, v0
> +        addi      a1, a1, -1
> +        vslide1down.vx v0, v0, zero
> +        bnez      a1, 2b       # while (len > 0);
> +
> +        vsetvli   zero, a2, e64, m8, ta, ma
> +        vse64.v   v16, (a3)
> +        ret
> +endfunc
>  #endif

diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c
index c16e5745f0..ab91956f2d 100644
--- a/libavcodec/riscv/lpc_init.c
+++ b/libavcodec/riscv/lpc_init.c
@@ -22,16 +22,22 @@ 
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
 #include "libavcodec/lpc.h"
 
 void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *);
+void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *);
 
 av_cold void ff_lpc_init_riscv(LPCContext *c)
 {
 #if HAVE_RVV && (__riscv_xlen >= 64)
     int flags = av_get_cpu_flags();
 
-    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR))
+    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
         c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv;
+
+        if (ff_get_rv_vlenb() >= c->max_order)
+            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_rvv;
+    }
 #endif
 }
diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S
index f81a2392c1..654156bf12 100644
--- a/libavcodec/riscv/lpc_rvv.S
+++ b/libavcodec/riscv/lpc_rvv.S
@@ -85,4 +85,33 @@  func ff_lpc_apply_welch_window_rvv, zve64d
 
         ret
 endfunc
+
+func ff_lpc_compute_autocorr_rvv, zve64d
+        li        t0, 1
+        vsetvli   t1, a2, e64, m8, ta, ma
+        fcvt.d.l  ft0, t0
+        vle64.v   v0, (a0)
+        sh3add    a0, a2, a0   # data += lag
+        vfmv.v.f  v16, ft0
+        bge       a2, a1, 2f
+1:
+        vfmv.f.s  ft0, v0
+        fld       ft1, (a0)    # ft1 = data[lag + i]
+        vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j]
+        addi      a1, a1, -1
+        vfslide1down.vf v0, v0, ft1
+        addi      a0, a0, 8
+        bgt       a1, a2, 1b   # while (len > lag);
+2:
+        vfmv.f.s  ft0, v0
+        vsetvli   zero, a1, e64, m8, tu, ma
+        vfmacc.vf v16, ft0, v0
+        addi      a1, a1, -1
+        vslide1down.vx v0, v0, zero
+        bnez      a1, 2b       # while (len > 0);
+
+        vsetvli   zero, a2, e64, m8, ta, ma
+        vse64.v   v16, (a3)
+        ret
+endfunc
 #endif

[FFmpeg-devel,2/2] lavc/lpc: R-V V compute_autocorr

Checks

Commit Message

Comments

Patch