Message ID | 20231212210240.19886-2-remi@remlab.net |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/2] checkasm/lpc: test compute_autocorr | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | fail | Make fate failed |
Le tiistaina 12. joulukuuta 2023, 23.02.40 EET Rémi Denis-Courmont a écrit : > The loop iterates over the length of the vector, not the order. This is > to avoid reloading the same data for each lag value. However this means > the loop only works if the maximum order is no larger than VLENB. > > The loop is roughly equivalent to: > > for (size_t j = 0; j < lag; j++) > autoc[j] = 1.; > > while (len > lag) { > for (ptrdiff_t j = 0; j < lag; j++) > autoc[j] += data[j] * *data; > data++; > len--; > } > > while (len > 0) { > for (ptrdiff_t j = 0; j < len; j++) > autoc[j] += data[j] * *data; > data++; > len--; > } > > Since register pressure is only at 50%, it should be possible to implement > the same loop for order up to 2xVLENB. But this is left for future work. > > Performance numbers are all over the place from ~1.25x to ~4x speedups, > but at least they are always noticeably better than nothing. > --- > libavcodec/riscv/lpc_init.c | 8 +++++++- > libavcodec/riscv/lpc_rvv.S | 29 +++++++++++++++++++++++++++++ > 2 files changed, 36 insertions(+), 1 deletion(-) > > diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c > index c16e5745f0..ab91956f2d 100644 > --- a/libavcodec/riscv/lpc_init.c > +++ b/libavcodec/riscv/lpc_init.c > @@ -22,16 +22,22 @@ > > #include "libavutil/attributes.h" > #include "libavutil/cpu.h" > +#include "libavutil/riscv/cpu.h" > #include "libavcodec/lpc.h" > > void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *); > +void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *); > > av_cold void ff_lpc_init_riscv(LPCContext *c) > { > #if HAVE_RVV && (__riscv_xlen >= 64) > int flags = av_get_cpu_flags(); > > - if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) > + if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) { > c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv; > + > + if (ff_get_rv_vlenb() >= c->max_order) > + c->lpc_compute_autocorr = 
ff_lpc_compute_autocorr_rvv; > + } > #endif > } > diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S > index f81a2392c1..654156bf12 100644 > --- a/libavcodec/riscv/lpc_rvv.S > +++ b/libavcodec/riscv/lpc_rvv.S > @@ -85,4 +85,33 @@ func ff_lpc_apply_welch_window_rvv, zve64d > > ret > endfunc > + > +func ff_lpc_compute_autocorr_rvv, zve64d > + li t0, 1 > + vsetvli t1, a2, e64, m8, ta, ma t1 is unused, so the destination register here should be zero. This is a leftover from an incomplete attempt to unroll. > + fcvt.d.l ft0, t0 > + vle64.v v0, (a0) > + sh3add a0, a2, a0 # data += lag > + vfmv.v.f v16, ft0 > + bge a2, a1, 2f > +1: > + vfmv.f.s ft0, v0 > + fld ft1, (a0) # ft1 = data[lag + i] > + vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j] > + addi a1, a1, -1 > + vfslide1down.vf v0, v0, ft1 > + addi a0, a0, 8 > + bgt a1, a2, 1b # while (len > lag); > +2: > + vfmv.f.s ft0, v0 > + vsetvli zero, a1, e64, m8, tu, ma > + vfmacc.vf v16, ft0, v0 > + addi a1, a1, -1 > + vslide1down.vx v0, v0, zero > + bnez a1, 2b # while (len > 0); > + > + vsetvli zero, a2, e64, m8, ta, ma > + vse64.v v16, (a3) > + ret > +endfunc > #endif
diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c index c16e5745f0..ab91956f2d 100644 --- a/libavcodec/riscv/lpc_init.c +++ b/libavcodec/riscv/lpc_init.c @@ -22,16 +22,22 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/riscv/cpu.h" #include "libavcodec/lpc.h" void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *); +void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *); av_cold void ff_lpc_init_riscv(LPCContext *c) { #if HAVE_RVV && (__riscv_xlen >= 64) int flags = av_get_cpu_flags(); - if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) + if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) { c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv; + + if (ff_get_rv_vlenb() >= c->max_order) + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_rvv; + } #endif } diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S index f81a2392c1..654156bf12 100644 --- a/libavcodec/riscv/lpc_rvv.S +++ b/libavcodec/riscv/lpc_rvv.S @@ -85,4 +85,33 @@ func ff_lpc_apply_welch_window_rvv, zve64d ret endfunc + +func ff_lpc_compute_autocorr_rvv, zve64d + li t0, 1 + vsetvli t1, a2, e64, m8, ta, ma + fcvt.d.l ft0, t0 + vle64.v v0, (a0) + sh3add a0, a2, a0 # data += lag + vfmv.v.f v16, ft0 + bge a2, a1, 2f +1: + vfmv.f.s ft0, v0 + fld ft1, (a0) # ft1 = data[lag + i] + vfmacc.vf v16, ft0, v0 # v16[j] += data[i] * data[i + j] + addi a1, a1, -1 + vfslide1down.vf v0, v0, ft1 + addi a0, a0, 8 + bgt a1, a2, 1b # while (len > lag); +2: + vfmv.f.s ft0, v0 + vsetvli zero, a1, e64, m8, tu, ma + vfmacc.vf v16, ft0, v0 + addi a1, a1, -1 + vslide1down.vx v0, v0, zero + bnez a1, 2b # while (len > 0); + + vsetvli zero, a2, e64, m8, ta, ma + vse64.v v16, (a3) + ret +endfunc #endif