diff mbox series

[FFmpeg-devel] sws/range_convert: R-V V to/from JPEG

Message ID 20240607155350.17931-1-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel] sws/range_convert: R-V V to/from JPEG | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch

Commit Message

Rémi Denis-Courmont June 7, 2024, 3:53 p.m. UTC
                              C908   X60
chrRangeFromJpeg_8_c:          2.7    2.5
chrRangeFromJpeg_8_rvv_i32:    1.7    1.5
chrRangeFromJpeg_24_c:         7.5    6.7
chrRangeFromJpeg_24_rvv_i32:   1.7    1.5
chrRangeFromJpeg_128_c:       55.2   34.7
chrRangeFromJpeg_128_rvv_i32:  6.5    3.0
chrRangeFromJpeg_144_c:       44.0   39.2
chrRangeFromJpeg_144_rvv_i32:  7.7    4.5
chrRangeFromJpeg_256_c:       78.2   69.5
chrRangeFromJpeg_256_rvv_i32: 12.2    6.0
chrRangeFromJpeg_512_c:      172.2  138.5
chrRangeFromJpeg_512_rvv_i32: 24.5   11.7
chrRangeToJpeg_8_c:            4.7    4.2
chrRangeToJpeg_8_rvv_i32:      2.0    1.7
chrRangeToJpeg_24_c:          13.7   12.2
chrRangeToJpeg_24_rvv_i32:     2.0    1.5
chrRangeToJpeg_128_c:         72.0   63.7
chrRangeToJpeg_128_rvv_i32:    6.7    3.2
chrRangeToJpeg_144_c:         80.7   71.7
chrRangeToJpeg_144_rvv_i32:    8.5    4.7
chrRangeToJpeg_256_c:        143.2  127.2
chrRangeToJpeg_256_rvv_i32:   13.5    6.5
chrRangeToJpeg_512_c:        285.7  253.7
chrRangeToJpeg_512_rvv_i32:   27.0   13.0
lumRangeFromJpeg_8_c:          1.7    1.5
lumRangeFromJpeg_8_rvv_i32:    1.2    1.0
lumRangeFromJpeg_24_c:         4.2    3.7
lumRangeFromJpeg_24_rvv_i32:   1.2    1.0
lumRangeFromJpeg_128_c:       21.7   19.2
lumRangeFromJpeg_128_rvv_i32:  3.7    1.7
lumRangeFromJpeg_144_c:       24.7   22.0
lumRangeFromJpeg_144_rvv_i32:  4.7    2.7
lumRangeFromJpeg_256_c:       43.7   39.0
lumRangeFromJpeg_256_rvv_i32:  7.5    3.2
lumRangeFromJpeg_512_c:       87.0   77.2
lumRangeFromJpeg_512_rvv_i32: 14.5    6.7
lumRangeToJpeg_8_c:            2.7    2.2
lumRangeToJpeg_8_rvv_i32:      1.0    1.0
lumRangeToJpeg_24_c:           7.2    6.5
lumRangeToJpeg_24_rvv_i32:     1.2    1.0
lumRangeToJpeg_128_c:         37.7   33.7
lumRangeToJpeg_128_rvv_i32:    3.7    2.0
lumRangeToJpeg_144_c:         42.5   37.7
lumRangeToJpeg_144_rvv_i32:    4.7    2.7
lumRangeToJpeg_256_c:         75.0   66.7
lumRangeToJpeg_256_rvv_i32:    7.5    3.5
lumRangeToJpeg_512_c:        149.5  133.0
lumRangeToJpeg_512_rvv_i32:   14.7    7.0
---
 libswscale/riscv/Makefile    |   1 +
 libswscale/riscv/range_rvv.S | 114 +++++++++++++++++++++++++++++++++++
 libswscale/riscv/swscale.c   |  30 ++++++++-
 3 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/riscv/range_rvv.S
diff mbox series

Patch

diff --git a/libswscale/riscv/Makefile b/libswscale/riscv/Makefile
index de31684f6d..6038c8d873 100644
--- a/libswscale/riscv/Makefile
+++ b/libswscale/riscv/Makefile
@@ -2,4 +2,5 @@  OBJS += riscv/rgb2rgb.o \
         riscv/swscale.o
 RV-OBJS += riscv/rgb2rgb_rvb.o
 RVV-OBJS += riscv/input_rvv.o \
+            riscv/range_rvv.o \
             riscv/rgb2rgb_rvv.o
diff --git a/libswscale/riscv/range_rvv.S b/libswscale/riscv/range_rvv.S
new file mode 100644
index 0000000000..9da80e6199
--- /dev/null
+++ b/libswscale/riscv/range_rvv.S
@@ -0,0 +1,114 @@ 
+/*
+ * Copyright © 2024 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+func ff_range_lum_to_jpeg_16_rvv, zve32x /* a0: int16_t samples, rewritten in place; a1: sample count */
+        li       t1, 30189 /* upper clamp applied before scaling */
+        li       t2, 19077 /* multiplier */
+        li       t3, -39057361 /* addend applied to the 32-bit product */
+1:
+        vsetvli  t0, a1, e16, m4, ta, ma /* t0 = number of elements handled this pass */
+        vle16.v  v0, (a0) /* load 16-bit samples */
+        sub      a1, a1, t0 /* a1 = elements still to process */
+        vmin.vx  v0, v0, t1 /* signed minimum: cap each sample at t1 */
+        vwmul.vx v8, v0, t2 /* widening signed multiply, 32-bit products in v8 */
+        vsetvli  zero, zero, e32, m8, ta, ma /* keep vl, operate on 32-bit elements */
+        vadd.vx  v8, v8, t3
+        vsetvli  zero, zero, e16, m4, ta, ma /* keep vl, back to 16-bit elements */
+        vnsra.wi v0, v8, 14 /* arithmetic shift right by 14 and narrow to 16 bits */
+        vse16.v  v0, (a0) /* store results over the input */
+        sh1add   a0, t0, a0 /* a0 += 2 * t0, i.e. advance by t0 16-bit elements */
+        bnez     a1, 1b
+
+        ret
+endfunc
+
+func ff_range_lum_from_jpeg_16_rvv, zve32x /* a0: int16_t samples, rewritten in place; a1: sample count */
+        li       t1, 14071 /* multiplier */
+        li       t2, 33561947 /* addend applied to the 32-bit product */
+1:
+        vsetvli  t0, a1, e16, m4, ta, ma /* t0 = number of elements handled this pass */
+        vle16.v  v0, (a0) /* load 16-bit samples */
+        sub      a1, a1, t0 /* a1 = elements still to process */
+        vwmul.vx v8, v0, t1 /* widening signed multiply, 32-bit products in v8 */
+        vsetvli  zero, zero, e32, m8, ta, ma /* keep vl, operate on 32-bit elements */
+        vadd.vx  v8, v8, t2
+        vsetvli  zero, zero, e16, m4, ta, ma /* keep vl, back to 16-bit elements */
+        vnsra.wi v0, v8, 14 /* arithmetic shift right by 14 and narrow to 16 bits */
+        vse16.v  v0, (a0) /* store results over the input */
+        sh1add   a0, t0, a0 /* a0 += 2 * t0, i.e. advance by t0 16-bit elements */
+        bnez     a1, 1b
+
+        ret
+endfunc
+
+func ff_range_chr_to_jpeg_16_rvv, zve32x /* a0, a1: two int16_t planes, rewritten in place; a2: sample count */
+        li      t1, 30775 /* upper clamp applied before scaling */
+        li      t2, 4663 /* multiplier */
+        li      t3, -9289992 /* addend applied to the 32-bit product */
+1:
+        vsetvli  t0, a2, e16, m4, ta, ma /* t0 = number of elements handled this pass */
+        vle16.v  v0, (a0) /* load first plane */
+        sub      a2, a2, t0 /* a2 = elements still to process */
+        vle16.v  v4, (a1) /* load second plane */
+        vmin.vx  v0, v0, t1 /* signed minimum: cap each sample at t1 */
+        vmin.vx  v4, v4, t1
+        vwmul.vx v8, v0, t2 /* widening signed multiply, first plane -> v8 */
+        vwmul.vx v16, v4, t2 /* widening signed multiply, second plane -> v16 */
+        vsetvli  zero, zero, e32, m8, ta, ma /* keep vl, operate on 32-bit elements */
+        vadd.vx  v8, v8, t3
+        vadd.vx  v16, v16, t3
+        vsetvli  zero, zero, e16, m4, ta, ma /* keep vl, back to 16-bit elements */
+        vnsra.wi v0, v8, 12 /* arithmetic shift right by 12 and narrow to 16 bits */
+        vnsra.wi v4, v16, 12
+        vse16.v  v0, (a0) /* store first plane over its input */
+        sh1add   a0, t0, a0 /* a0 += 2 * t0, i.e. advance by t0 16-bit elements */
+        vse16.v  v4, (a1) /* store second plane over its input */
+        sh1add   a1, t0, a1 /* a1 += 2 * t0 */
+        bnez     a2, 1b
+
+        ret
+endfunc
+
+func ff_range_chr_from_jpeg_16_rvv, zve32x /* a0, a1: two int16_t planes, rewritten in place; a2: sample count */
+        li      t1, 1799 /* multiplier */
+        li      t2, 4081085 /* addend applied to the 32-bit product */
+1:
+        vsetvli  t0, a2, e16, m4, ta, ma /* t0 = number of elements handled this pass */
+        vle16.v  v0, (a0) /* load first plane */
+        sub      a2, a2, t0 /* a2 = elements still to process */
+        vle16.v  v4, (a1) /* load second plane */
+        vwmul.vx v8, v0, t1 /* widening signed multiply, first plane -> v8 */
+        vwmul.vx v16, v4, t1 /* widening signed multiply, second plane -> v16 */
+        vsetvli  zero, zero, e32, m8, ta, ma /* keep vl, operate on 32-bit elements */
+        vadd.vx  v8, v8, t2
+        vadd.vx  v16, v16, t2
+        vsetvli  zero, zero, e16, m4, ta, ma /* keep vl, back to 16-bit elements */
+        vnsra.wi v0, v8, 11 /* arithmetic shift right by 11 and narrow to 16 bits */
+        vnsra.wi v4, v16, 11
+        vse16.v  v0, (a0) /* store first plane over its input */
+        sh1add   a0, t0, a0 /* a0 += 2 * t0, i.e. advance by t0 16-bit elements */
+        vse16.v  v4, (a1) /* second plane result lives in v4, not v0 */
+        sh1add   a1, t0, a1 /* a1 += 2 * t0 */
+        bnez     a2, 1b
+
+        ret
+endfunc
diff --git a/libswscale/riscv/swscale.c b/libswscale/riscv/swscale.c
index ffeb7876bf..529d817aaa 100644
--- a/libswscale/riscv/swscale.c
+++ b/libswscale/riscv/swscale.c
@@ -21,6 +21,33 @@ 
 #include "libavutil/riscv/cpu.h"
 #include "libswscale/swscale_internal.h"
 
+void ff_range_lum_to_jpeg_16_rvv(int16_t *, int);
+void ff_range_chr_to_jpeg_16_rvv(int16_t *, int16_t *, int);
+void ff_range_lum_from_jpeg_16_rvv(int16_t *, int);
+void ff_range_chr_from_jpeg_16_rvv(int16_t *, int16_t *, int);
+
+/* Install the RVV range converters on c when the conversion and CPU qualify. */
+av_cold static void ff_sws_init_range_convert_riscv(SwsContext *c, int flags)
+{
+#if HAVE_RVV
+    /* Leave c untouched unless the ranges differ, the output is non-RGB with
+     * at most 14 bits per component, and both CPU flags are present. */
+    if (c->srcRange == c->dstRange || isAnyRGB(c->dstFormat) ||
+        c->dstBpc > 14 ||
+        !(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB_ADDR))
+        return;
+
+    if (c->srcRange) {
+        /* Non-zero source range selects the "from JPEG" routines. */
+        c->lumConvertRange = ff_range_lum_from_jpeg_16_rvv;
+        c->chrConvertRange = ff_range_chr_from_jpeg_16_rvv;
+    } else {
+        c->lumConvertRange = ff_range_lum_to_jpeg_16_rvv;
+        c->chrConvertRange = ff_range_chr_to_jpeg_16_rvv;
+    }
+#endif
+}
+
 #define RVV_INPUT(name) \
 void ff_##name##ToY_rvv(uint8_t *dst, const uint8_t *src, const uint8_t *, \
                         const uint8_t *, int w, uint32_t *coeffs, void *); \
@@ -40,9 +67,9 @@  RVV_INPUT(rgba32);
 
 av_cold void ff_sws_init_swscale_riscv(SwsContext *c)
 {
-#if HAVE_RVV
     int flags = av_get_cpu_flags();
 
+#if HAVE_RVV
     if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
         switch (c->srcFormat) {
             case AV_PIX_FMT_ABGR:
@@ -95,4 +122,5 @@  av_cold void ff_sws_init_swscale_riscv(SwsContext *c)
         }
     }
 #endif
+    ff_sws_init_range_convert_riscv(c, flags);
 }