diff mbox series

[FFmpeg-devel,3/3] swscale: [LA] Optimize swscale funcs in input.c

Message ID 20240316030333.31269-4-yinshiyou-hf@loongson.cn
State Accepted
Commit 2a7d622ddd0394f20de06b5f1da2f3c3cbc90f6f
Headers show
Series [FFmpeg-devel,1/3] swscale: [LA] Optimize range convert for yuvj420p. | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Shiyou Yin March 16, 2024, 3:03 a.m. UTC
Optimized 7 funcs with LSX and LASX:
1. yuy2ToUV_c
2. yvy2ToUV_c
3. uyvyToUV_c
4. nv12ToUV_c
5. nv21ToUV_c
6. abgrToA_c
7. rgbaToA_c
---
 libswscale/loongarch/Makefile                 |   1 +
 libswscale/loongarch/input.S                  | 495 ++++++++++++++++++
 libswscale/loongarch/input_lasx.c             |  43 ++
 libswscale/loongarch/input_lsx.c              |  65 +++
 libswscale/loongarch/swscale_init_loongarch.c |  20 +-
 libswscale/loongarch/swscale_loongarch.h      |  46 ++
 6 files changed, 652 insertions(+), 18 deletions(-)
 create mode 100644 libswscale/loongarch/input_lsx.c
diff mbox series

Patch

diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile
index c35ba309a4..7ba11d492e 100644
--- a/libswscale/loongarch/Makefile
+++ b/libswscale/loongarch/Makefile
@@ -9,4 +9,5 @@  LSX-OBJS-$(CONFIG_SWSCALE)  += loongarch/swscale.o \
                                loongarch/input.o   \
                                loongarch/output.o  \
                                loongarch/output_lsx.o  \
+                               loongarch/input_lsx.o   \
                                loongarch/yuv2rgb_lsx.o
diff --git a/libswscale/loongarch/input.S b/libswscale/loongarch/input.S
index d01f7384b1..717592b004 100644
--- a/libswscale/loongarch/input.S
+++ b/libswscale/loongarch/input.S
@@ -283,3 +283,498 @@  function planar_rgb_to_uv_lsx
     ld.d            s3,     sp,    16
     addi.d          sp,     sp,    24
 endfunc
+
+/*
+ * void yuy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function yuy2ToUV_lsx                 /* dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    7      /* t0 = width % 8, handled by the scalar tail */
+    srli.d       a5,    a5,    3      /* a5 = width / 8 vector iterations */
+    beqz         a5,    2f            /* no full group of 8: skip the vector loop */
+1:                                    /* vector loop: 8 samples per pass */
+    vld          vr0,   a3,    1      /* source bytes 1..16 */
+    vld          vr1,   a3,    17     /* source bytes 17..32; NOTE(review): byte 32 lies past the 32 bytes consumed per pass, confirm the input is padded */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32     /* 8 samples * 4 source bytes */
+    vpickev.b    vr2,   vr1,   vr0    /* even lanes of both loads: source bytes 1,3,5,...,31 */
+    vpickev.b    vr0,   vr2,   vr2    /* low 8 bytes = source bytes 4*i+1 */
+    vpickod.b    vr1,   vr2,   vr2    /* low 8 bytes = source bytes 4*i+3 */
+    fst.d        f0,    a0,    0      /* f0 is the low 64 bits of vr0: 8 bytes to a0 */
+    fst.d        f1,    a1,    0      /* f1 is the low 64 bits of vr1: 8 bytes to a1 */
+    addi.d       a0,    a0,    8
+    addi.d       a1,    a1,    8
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 8: nothing left */
+3:                                    /* scalar tail: one sample per pass */
+    ld.b         t1,    a3,    1      /* src1[4*i+1] */
+    ld.b         t2,    a3,    3      /* src1[4*i+3] */
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0      /* only the low byte is stored, so the sign extension of ld.b is harmless */
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+function yuy2ToUV_lasx                /* 256-bit variant: dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    15     /* t0 = width % 16, handled by the scalar tail */
+    srli.d       a5,    a5,    4      /* a5 = width / 16 vector iterations */
+    beqz         a5,    2f            /* no full group of 16: skip the vector loop */
+1:                                    /* vector loop: 16 samples per pass */
+    xvld         xr0,   a3,    1      /* source bytes 1..32 */
+    xvld         xr1,   a3,    33     /* source bytes 33..64; NOTE(review): byte 64 lies past the 64 bytes consumed per pass, confirm the input is padded */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64     /* 16 samples * 4 source bytes */
+    xvpickev.b   xr2,   xr1,   xr0    /* even bytes of both loads, picked separately in each 128-bit lane */
+    xvpermi.d    xr2,   xr2,   0xd8   /* 0xd8 swaps the two middle 64-bit words: bytes 1,3,...,63 back in source order */
+    xvpickev.b   xr0,   xr2,   xr2    /* source bytes 4*i+1, 8 per lane */
+    xvpermi.d    xr0,   xr0,   0xd8   /* bring both lanes' 8 results together in the low 128 bits */
+    xvpickod.b   xr1,   xr2,   xr2    /* source bytes 4*i+3, 8 per lane */
+    xvpermi.d    xr1,   xr1,   0xd8   /* same regrouping for the second output */
+    vst          vr0,   a0,    0      /* low 128 bits of xr0: 16 bytes to a0 */
+    vst          vr1,   a1,    0      /* low 128 bits of xr1: 16 bytes to a1 */
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 16: nothing left */
+3:                                    /* scalar tail: one sample per pass */
+    ld.b         t1,    a3,    1      /* src1[4*i+1] */
+    ld.b         t2,    a3,    3      /* src1[4*i+3] */
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+/*
+ * void yvy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function yvy2ToUV_lsx                 /* dstV[i] = src1[4*i+1], dstU[i] = src1[4*i+3]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    7      /* t0 = width % 8, handled by the scalar tail */
+    srli.d       a5,    a5,    3      /* a5 = width / 8 vector iterations */
+    beqz         a5,    2f            /* no full group of 8: skip the vector loop */
+1:                                    /* vector loop: 8 samples per pass */
+    vld          vr0,   a3,    1      /* source bytes 1..16 */
+    vld          vr1,   a3,    17     /* source bytes 17..32; NOTE(review): byte 32 lies past the 32 bytes consumed per pass, confirm the input is padded */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32     /* 8 samples * 4 source bytes */
+    vpickev.b    vr2,   vr1,   vr0    /* even lanes of both loads: source bytes 1,3,5,...,31 */
+    vpickev.b    vr0,   vr2,   vr2    /* low 8 bytes = source bytes 4*i+1 */
+    vpickod.b    vr1,   vr2,   vr2    /* low 8 bytes = source bytes 4*i+3 */
+    fst.d        f0,    a1,    0      /* bytes 4*i+1 go to a1 (second destination) */
+    fst.d        f1,    a0,    0      /* bytes 4*i+3 go to a0 (first destination) */
+    addi.d       a0,    a0,    8
+    addi.d       a1,    a1,    8
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 8: nothing left */
+3:                                    /* scalar tail: one sample per pass */
+    ld.b         t1,    a3,    1      /* src1[4*i+1] */
+    ld.b         t2,    a3,    3      /* src1[4*i+3] */
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a1,    0      /* same destination swap as the vector loop */
+    st.b         t2,    a0,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+function yvy2ToUV_lasx                /* 256-bit variant: dstV[i] = src1[4*i+1], dstU[i] = src1[4*i+3]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    15     /* t0 = width % 16, handled by the scalar tail */
+    srli.d       a5,    a5,    4      /* a5 = width / 16 vector iterations */
+    beqz         a5,    2f            /* no full group of 16: skip the vector loop */
+1:                                    /* vector loop: 16 samples per pass */
+    xvld         xr0,   a3,    1      /* source bytes 1..32 */
+    xvld         xr1,   a3,    33     /* source bytes 33..64; NOTE(review): byte 64 lies past the 64 bytes consumed per pass, confirm the input is padded */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64     /* 16 samples * 4 source bytes */
+    xvpickev.b   xr2,   xr1,   xr0    /* even bytes of both loads, picked separately in each 128-bit lane */
+    xvpermi.d    xr2,   xr2,   0xd8   /* 0xd8 swaps the two middle 64-bit words: bytes 1,3,...,63 back in source order */
+    xvpickev.b   xr0,   xr2,   xr2    /* source bytes 4*i+1, 8 per lane */
+    xvpermi.d    xr0,   xr0,   0xd8   /* bring both lanes' 8 results together in the low 128 bits */
+    xvpickod.b   xr1,   xr2,   xr2    /* source bytes 4*i+3, 8 per lane */
+    xvpermi.d    xr1,   xr1,   0xd8   /* same regrouping for the second output */
+    vst          vr0,   a1,    0      /* bytes 4*i+1 go to a1 (second destination) */
+    vst          vr1,   a0,    0      /* bytes 4*i+3 go to a0 (first destination) */
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 16: nothing left */
+3:                                    /* scalar tail: one sample per pass */
+    ld.b         t1,    a3,    1      /* src1[4*i+1] */
+    ld.b         t2,    a3,    3      /* src1[4*i+3] */
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a1,    0      /* same destination swap as the vector loop */
+    st.b         t2,    a0,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+/*
+ * void uyvyToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function uyvyToUV_lsx                 /* dstU[i] = src1[4*i+0], dstV[i] = src1[4*i+2]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    7      /* t0 = width % 8, handled by the scalar tail */
+    srli.d       a5,    a5,    3      /* a5 = width / 8 vector iterations */
+    beqz         a5,    2f            /* no full group of 8: skip the vector loop */
+1:                                    /* vector loop: 8 samples per pass */
+    vld          vr0,   a3,    0      /* source bytes 0..15 */
+    vld          vr1,   a3,    16     /* source bytes 16..31 */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32     /* 8 samples * 4 source bytes */
+    vpickev.b    vr2,   vr1,   vr0    /* even lanes of both loads: source bytes 0,2,4,...,30 */
+    vpickev.b    vr0,   vr2,   vr2    /* low 8 bytes = source bytes 4*i+0 */
+    vpickod.b    vr1,   vr2,   vr2    /* low 8 bytes = source bytes 4*i+2 */
+    fst.d        f0,    a0,    0      /* f0 is the low 64 bits of vr0: 8 bytes to dstU */
+    fst.d        f1,    a1,    0      /* f1 is the low 64 bits of vr1: 8 bytes to dstV */
+    addi.d       a0,    a0,    8
+    addi.d       a1,    a1,    8
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 8: nothing left */
+3:                                    /* scalar tail: must read the same byte positions as the vector loop */
+    ld.b         t1,    a3,    0      /* src1[4*i+0]; offsets 1 and 3 would pick up the bytes the vector loop skips */
+    ld.b         t2,    a3,    2      /* src1[4*i+2] */
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0      /* only the low byte is stored, so the sign extension of ld.b is harmless */
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+function uyvyToUV_lasx                /* 256-bit variant: dstU[i] = src1[4*i+0], dstV[i] = src1[4*i+2]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    15     /* t0 = width % 16, handled by the scalar tail */
+    srli.d       a5,    a5,    4      /* a5 = width / 16 vector iterations */
+    beqz         a5,    2f            /* no full group of 16: skip the vector loop */
+1:                                    /* vector loop: 16 samples per pass */
+    xvld         xr0,   a3,    0      /* source bytes 0..31 */
+    xvld         xr1,   a3,    32     /* source bytes 32..63 */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64     /* 16 samples * 4 source bytes */
+    xvpickev.b   xr2,   xr1,   xr0    /* even bytes of both loads, picked separately in each 128-bit lane */
+    xvpermi.d    xr2,   xr2,   0xd8   /* 0xd8 swaps the two middle 64-bit words: bytes 0,2,...,62 back in source order */
+    xvpickev.b   xr0,   xr2,   xr2    /* source bytes 4*i+0, 8 per lane */
+    xvpermi.d    xr0,   xr0,   0xd8   /* bring both lanes' 8 results together in the low 128 bits */
+    xvpickod.b   xr1,   xr2,   xr2    /* source bytes 4*i+2, 8 per lane */
+    xvpermi.d    xr1,   xr1,   0xd8   /* same regrouping for the second output */
+    vst          vr0,   a0,    0      /* low 128 bits of xr0: 16 bytes to dstU */
+    vst          vr1,   a1,    0      /* low 128 bits of xr1: 16 bytes to dstV */
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 16: nothing left */
+3:                                    /* scalar tail: must read the same byte positions as the vector loop */
+    ld.b         t1,    a3,    0      /* src1[4*i+0]; offsets 1 and 3 would pick up the bytes the vector loop skips */
+    ld.b         t2,    a3,    2      /* src1[4*i+2] */
+    addi.d       a3,    a3,    4
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+/*
+ * void nv12ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function nv12ToUV_lsx                 /* dstU[i] = src1[2*i], dstV[i] = src1[2*i+1]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    15     /* t0 = width % 16, handled by the scalar tail */
+    srli.d       a5,    a5,    4      /* a5 = width / 16 vector iterations */
+    beqz         a5,    2f            /* no full group of 16: skip the vector loop */
+1:                                    /* vector loop: 16 samples per pass */
+    vld          vr0,   a3,    0      /* source bytes 0..15 */
+    vld          vr1,   a3,    16     /* source bytes 16..31 */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32     /* 16 samples * 2 source bytes */
+    vpickev.b    vr2,   vr1,   vr0    /* even source bytes 0,2,...,30 */
+    vpickod.b    vr3,   vr1,   vr0    /* odd source bytes 1,3,...,31 */
+    vst          vr2,   a0,    0      /* even bytes to a0 */
+    vst          vr3,   a1,    0      /* odd bytes to a1 */
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 16: nothing left */
+3:                                    /* scalar tail: one sample per pass */
+    ld.b         t1,    a3,    0      /* src1[2*i] */
+    ld.b         t2,    a3,    1      /* src1[2*i+1] */
+    addi.d       a3,    a3,    2
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0      /* only the low byte is stored, so the sign extension of ld.b is harmless */
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+function nv12ToUV_lasx                /* 256-bit variant: dstU[i] = src1[2*i], dstV[i] = src1[2*i+1]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    31     /* t0 = width % 32, handled by the scalar tail */
+    srli.d       a5,    a5,    5      /* a5 = width / 32 vector iterations */
+    beqz         a5,    2f            /* no full group of 32: skip the vector loop */
+1:                                    /* vector loop: 32 samples per pass */
+    xvld         xr0,   a3,    0      /* source bytes 0..31 */
+    xvld         xr1,   a3,    32     /* source bytes 32..63 */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64     /* 32 samples * 2 source bytes */
+    xvpickev.b   xr2,   xr1,   xr0    /* even source bytes, picked separately in each 128-bit lane */
+    xvpickod.b   xr3,   xr1,   xr0    /* odd source bytes, picked separately in each 128-bit lane */
+    xvpermi.d    xr2,   xr2,   0xd8   /* 0xd8 swaps the two middle 64-bit words to restore source order */
+    xvpermi.d    xr3,   xr3,   0xd8
+    xvst         xr2,   a0,    0      /* 32 even bytes to a0 */
+    xvst         xr3,   a1,    0      /* 32 odd bytes to a1 */
+    addi.d       a0,    a0,    32
+    addi.d       a1,    a1,    32
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 32: nothing left */
+3:                                    /* scalar tail: one sample per pass */
+    ld.b         t1,    a3,    0      /* src1[2*i] */
+    ld.b         t2,    a3,    1      /* src1[2*i+1] */
+    addi.d       a3,    a3,    2
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a0,    0
+    st.b         t2,    a1,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+/*
+ * void nv21ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+ *                   const uint8_t *src2, int width, uint32_t *unused, void *opq)
+ */
+function nv21ToUV_lsx                 /* dstV[i] = src1[2*i], dstU[i] = src1[2*i+1]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    15     /* t0 = width % 16, handled by the scalar tail */
+    srli.d       a5,    a5,    4      /* a5 = width / 16 vector iterations */
+    beqz         a5,    2f            /* no full group of 16: skip the vector loop */
+1:                                    /* vector loop: 16 samples per pass */
+    vld          vr0,   a3,    0      /* source bytes 0..15 */
+    vld          vr1,   a3,    16     /* source bytes 16..31 */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    32     /* 16 samples * 2 source bytes */
+    vpickev.b    vr2,   vr1,   vr0    /* even source bytes 0,2,...,30 */
+    vpickod.b    vr3,   vr1,   vr0    /* odd source bytes 1,3,...,31 */
+    vst          vr2,   a1,    0      /* even bytes to a1 (second destination) */
+    vst          vr3,   a0,    0      /* odd bytes to a0 (first destination) */
+    addi.d       a0,    a0,    16
+    addi.d       a1,    a1,    16
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 16: nothing left */
+3:                                    /* scalar tail: one sample per pass */
+    ld.b         t1,    a3,    0      /* src1[2*i] */
+    ld.b         t2,    a3,    1      /* src1[2*i+1] */
+    addi.d       a3,    a3,    2
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a1,    0      /* same destination swap as the vector loop */
+    st.b         t2,    a0,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+function nv21ToUV_lasx                /* 256-bit variant: dstV[i] = src1[2*i], dstU[i] = src1[2*i+1]; a0 = dstU, a1 = dstV, a3 = src1, a5 = width */
+    andi         t0,    a5,    31     /* t0 = width % 32, handled by the scalar tail */
+    srli.d       a5,    a5,    5      /* a5 = width / 32 vector iterations */
+    beqz         a5,    2f            /* no full group of 32: skip the vector loop */
+1:                                    /* vector loop: 32 samples per pass */
+    xvld         xr0,   a3,    0      /* source bytes 0..31 */
+    xvld         xr1,   a3,    32     /* source bytes 32..63 */
+    addi.d       a5,    a5,    -1
+    addi.d       a3,    a3,    64     /* 32 samples * 2 source bytes */
+    xvpickev.b   xr2,   xr1,   xr0    /* even source bytes, picked separately in each 128-bit lane */
+    xvpickod.b   xr3,   xr1,   xr0    /* odd source bytes, picked separately in each 128-bit lane */
+    xvpermi.d    xr2,   xr2,   0xd8   /* 0xd8 swaps the two middle 64-bit words to restore source order */
+    xvpermi.d    xr3,   xr3,   0xd8
+    xvst         xr2,   a1,    0      /* 32 even bytes to a1 (second destination) */
+    xvst         xr3,   a0,    0      /* 32 odd bytes to a0 (first destination) */
+    addi.d       a0,    a0,    32
+    addi.d       a1,    a1,    32
+    bnez         a5,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 32: nothing left */
+3:                                    /* scalar tail: one sample per pass */
+    ld.b         t1,    a3,    0      /* src1[2*i] */
+    ld.b         t2,    a3,    1      /* src1[2*i+1] */
+    addi.d       a3,    a3,    2
+    addi.d       t0,    t0,    -1
+    st.b         t1,    a1,    0      /* same destination swap as the vector loop */
+    st.b         t2,    a0,    0
+    addi.d       a0,    a0,    1
+    addi.d       a1,    a1,    1
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+/*
+ * void abgrToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ *                  const uint8_t *unused2, int width, uint32_t *unused, void *opq)
+ */
+function abgrToA_lsx                  /* 16-bit dst[i] = (src[4*i] << 6) | (src[4*i] >> 2); a0 = dst, a1 = src, a4 = width */
+    andi         t0,    a4,    7      /* t0 = width % 8, handled by the scalar tail */
+    srli.d       a4,    a4,    3      /* a4 = width / 8 vector iterations */
+    vxor.v       vr0,   vr0,   vr0    /* vr0 = 0, used below to zero-extend bytes */
+    beqz         a4,    2f            /* no full group of 8: skip the vector loop */
+1:                                    /* vector loop: 8 pixels per pass */
+    vld          vr1,   a1,    0      /* source bytes 0..15 */
+    vld          vr2,   a1,    16     /* source bytes 16..31 */
+    addi.d       a4,    a4,    -1
+    addi.d       a1,    a1,    32     /* 8 pixels * 4 source bytes */
+    vpickev.b    vr3,   vr2,   vr1    /* even source bytes 0,2,4,...,30 */
+    vpackev.b    vr3,   vr0,   vr3    /* 16-bit lanes = zero-extended source bytes 4*i */
+    vslli.h      vr1,   vr3,   6      /* a << 6 */
+    vsrli.h      vr2,   vr3,   2      /* a >> 2 */
+    vor.v        vr3,   vr2,   vr1    /* (a << 6) | (a >> 2) */
+    vst          vr3,   a0,    0      /* 8 results of 2 bytes each */
+    addi.d       a0,    a0,    16
+    bnez         a4,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 8: nothing left */
+3:                                    /* scalar tail: must read the same byte position as the vector loop */
+    ld.b         t1,    a1,    0      /* src[4*i]; offset 3 would read the last byte of the pixel instead of the first */
+    addi.d       t0,    t0,    -1
+    addi.d       a1,    a1,    4
+    andi         t1,    t1,    0xff   /* ld.b sign-extends; keep only the 8-bit value */
+    slli.w       t2,    t1,    6
+    srli.w       t3,    t1,    2
+    or           t1,    t2,    t3     /* (a << 6) | (a >> 2) */
+    st.h         t1,    a0,    0
+    addi.d       a0,    a0,    2
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+function abgrToA_lasx                 /* 256-bit variant: 16-bit dst[i] = (src[4*i] << 6) | (src[4*i] >> 2); a0 = dst, a1 = src, a4 = width */
+    andi         t0,    a4,    15     /* t0 = width % 16, handled by the scalar tail */
+    srli.d       a4,    a4,    4      /* a4 = width / 16 vector iterations */
+    xvxor.v      xr0,   xr0,   xr0    /* xr0 = 0, used below to zero-extend bytes */
+    beqz         a4,    2f            /* no full group of 16: skip the vector loop */
+1:                                    /* vector loop: 16 pixels per pass */
+    xvld         xr1,   a1,    0      /* source bytes 0..31 */
+    xvld         xr2,   a1,    32     /* source bytes 32..63 */
+    addi.d       a4,    a4,    -1
+    addi.d       a1,    a1,    64     /* 16 pixels * 4 source bytes */
+    xvpickev.b   xr3,   xr2,   xr1    /* even source bytes, picked separately in each 128-bit lane */
+    xvpermi.d    xr3,   xr3,   0xd8   /* 0xd8 swaps the two middle 64-bit words: bytes 0,2,...,62 back in source order */
+    xvpackev.b   xr3,   xr0,   xr3    /* 16-bit lanes = zero-extended source bytes 4*i */
+    xvslli.h     xr1,   xr3,   6      /* a << 6 */
+    xvsrli.h     xr2,   xr3,   2      /* a >> 2 */
+    xvor.v       xr3,   xr2,   xr1    /* (a << 6) | (a >> 2) */
+    xvst         xr3,   a0,    0      /* 16 results of 2 bytes each */
+    addi.d       a0,    a0,    32
+    bnez         a4,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 16: nothing left */
+3:                                    /* scalar tail: must read the same byte position as the vector loop */
+    ld.b         t1,    a1,    0      /* src[4*i]; offset 3 would read the last byte of the pixel instead of the first */
+    addi.d       t0,    t0,    -1
+    addi.d       a1,    a1,    4
+    andi         t1,    t1,    0xff   /* ld.b sign-extends; keep only the 8-bit value */
+    slli.w       t2,    t1,    6
+    srli.w       t3,    t1,    2
+    or           t1,    t2,    t3     /* (a << 6) | (a >> 2) */
+    st.h         t1,    a0,    0
+    addi.d       a0,    a0,    2
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+/*
+ * void rgbaToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+ *                  const uint8_t *unused2, int width, uint32_t *unused, void *opq)
+ */
+function rgbaToA_lsx                  /* 16-bit dst[i] = (src[4*i+3] << 6) | (src[4*i+3] >> 2); a0 = dst, a1 = src, a4 = width */
+    andi         t0,    a4,    7      /* t0 = width % 8, handled by the scalar tail */
+    srli.d       a4,    a4,    3      /* a4 = width / 8 vector iterations */
+    vxor.v       vr0,   vr0,   vr0    /* vr0 = 0, used below to zero-extend bytes */
+    beqz         a4,    2f            /* no full group of 8: skip the vector loop */
+1:                                    /* vector loop: 8 pixels per pass */
+    vld          vr1,   a1,    3      /* source bytes 3..18 */
+    vld          vr2,   a1,    19     /* source bytes 19..34; NOTE(review): bytes 32..34 lie past the 32 bytes consumed per pass, confirm the input is padded */
+    addi.d       a4,    a4,    -1
+    addi.d       a1,    a1,    32     /* 8 pixels * 4 source bytes */
+    vpickev.b    vr3,   vr2,   vr1    /* even lanes of both loads: source bytes 3,5,7,...,33 */
+    vpackev.b    vr3,   vr0,   vr3    /* 16-bit lanes = zero-extended source bytes 4*i+3 */
+    vslli.h      vr1,   vr3,   6      /* a << 6 */
+    vsrli.h      vr2,   vr3,   2      /* a >> 2 */
+    vor.v        vr3,   vr2,   vr1    /* (a << 6) | (a >> 2) */
+    vst          vr3,   a0,    0      /* 8 results of 2 bytes each */
+    addi.d       a0,    a0,    16
+    bnez         a4,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 8: nothing left */
+3:                                    /* scalar tail: one pixel per pass */
+    ld.b         t1,    a1,    3      /* src[4*i+3] */
+    addi.d       t0,    t0,    -1
+    addi.d       a1,    a1,    4
+    andi         t1,    t1,    0xff   /* ld.b sign-extends; keep only the 8-bit value */
+    slli.w       t2,    t1,    6
+    srli.w       t3,    t1,    2
+    or           t1,    t2,    t3     /* (a << 6) | (a >> 2) */
+    st.h         t1,    a0,    0
+    addi.d       a0,    a0,    2
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
+
+function rgbaToA_lasx                 /* 256-bit variant: 16-bit dst[i] = (src[4*i+3] << 6) | (src[4*i+3] >> 2); a0 = dst, a1 = src, a4 = width */
+    andi         t0,    a4,    15     /* t0 = width % 16, handled by the scalar tail */
+    srli.d       a4,    a4,    4      /* a4 = width / 16 vector iterations */
+    xvxor.v      xr0,   xr0,   xr0    /* xr0 = 0, used below to zero-extend bytes */
+    beqz         a4,    2f            /* no full group of 16: skip the vector loop */
+1:                                    /* vector loop: 16 pixels per pass */
+    xvld         xr1,   a1,    3      /* source bytes 3..34 */
+    xvld         xr2,   a1,    35     /* source bytes 35..66; NOTE(review): bytes 64..66 lie past the 64 bytes consumed per pass, confirm the input is padded */
+    addi.d       a4,    a4,    -1
+    addi.d       a1,    a1,    64     /* 16 pixels * 4 source bytes */
+    xvpickev.b   xr3,   xr2,   xr1    /* even lanes of both loads, picked separately in each 128-bit lane */
+    xvpermi.d    xr3,   xr3,   0xd8   /* 0xd8 swaps the two middle 64-bit words: bytes 3,5,...,65 back in source order */
+    xvpackev.b   xr3,   xr0,   xr3    /* 16-bit lanes = zero-extended source bytes 4*i+3 */
+    xvslli.h     xr1,   xr3,   6      /* a << 6 */
+    xvsrli.h     xr2,   xr3,   2      /* a >> 2 */
+    xvor.v       xr3,   xr2,   xr1    /* (a << 6) | (a >> 2) */
+    xvst         xr3,   a0,    0      /* 16 results of 2 bytes each */
+    addi.d       a0,    a0,    32
+    bnez         a4,    1b
+2:
+    beqz         t0,    4f            /* width was a multiple of 16: nothing left */
+3:                                    /* scalar tail: one pixel per pass */
+    ld.b         t1,    a1,    3      /* src[4*i+3] */
+    addi.d       t0,    t0,    -1
+    addi.d       a1,    a1,    4
+    andi         t1,    t1,    0xff   /* ld.b sign-extends; keep only the 8-bit value */
+    slli.w       t2,    t1,    6
+    srli.w       t3,    t1,    2
+    or           t1,    t2,    t3     /* (a << 6) | (a >> 2) */
+    st.h         t1,    a0,    0
+    addi.d       a0,    a0,    2
+    bnez         t0,    3b
+4:                                    /* NOTE(review): no explicit return; presumably emitted by the endfunc macro, confirm */
+endfunc
diff --git a/libswscale/loongarch/input_lasx.c b/libswscale/loongarch/input_lasx.c
index 4830072eaf..0f1d954880 100644
--- a/libswscale/loongarch/input_lasx.c
+++ b/libswscale/loongarch/input_lasx.c
@@ -200,3 +200,60 @@  void planar_rgb_to_y_lasx(uint8_t *_dst, const uint8_t *src[4], int width,
         dst[i] = (tem_ry * r + tem_gy * g + tem_by * b + set) >> shift;
     }
 }
+
+/**
+ * Install the LASX input readers that exist for the context's source format.
+ *
+ * Callbacks without a LASX version for c->srcFormat are left untouched, and
+ * the alpha reader is only replaced when c->needAlpha is set.
+ *
+ * @param c swscale context whose reader callbacks are updated in place
+ */
+av_cold void ff_sws_init_input_lasx(SwsContext *c)
+{
+    enum AVPixelFormat srcFormat = c->srcFormat;
+
+    switch (srcFormat) {
+    case AV_PIX_FMT_YUYV422:
+        c->chrToYV12 = yuy2ToUV_lasx;
+        break;
+    case AV_PIX_FMT_YVYU422:
+        c->chrToYV12 = yvy2ToUV_lasx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->chrToYV12 = uyvyToUV_lasx;
+        break;
+    case AV_PIX_FMT_NV12:
+    case AV_PIX_FMT_NV16:
+    case AV_PIX_FMT_NV24:
+        c->chrToYV12 = nv12ToUV_lasx;
+        break;
+    case AV_PIX_FMT_NV21:
+    case AV_PIX_FMT_NV42:
+        c->chrToYV12 = nv21ToUV_lasx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GBRP:
+        c->readChrPlanar = planar_rgb_to_uv_lasx;
+        /* planar RGB has a SIMD luma reader as well; install both together */
+        c->readLumPlanar = planar_rgb_to_y_lasx;
+        break;
+    default:
+        break;
+    }
+
+    if (c->needAlpha) {
+        switch (srcFormat) {
+        case AV_PIX_FMT_BGRA:
+        case AV_PIX_FMT_RGBA:
+            c->alpToYV12 = rgbaToA_lasx;
+            break;
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_ARGB:
+            c->alpToYV12 = abgrToA_lasx;
+            break;
+        default:
+            break;
+        }
+    }
+}
diff --git a/libswscale/loongarch/input_lsx.c b/libswscale/loongarch/input_lsx.c
new file mode 100644
index 0000000000..1bb04457bb
--- /dev/null
+++ b/libswscale/loongarch/input_lsx.c
@@ -0,0 +1,79 @@ 
+/*
+ * Copyright (C) 2024 Loongson Technology Corporation Limited
+ * Contributed by Shiyou Yin<yinshiyou-hf@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "swscale_loongarch.h"
+
+/**
+ * Install the LSX input readers that exist for the context's source format.
+ *
+ * Callbacks without an LSX version for c->srcFormat are left untouched, and
+ * the alpha reader is only replaced when c->needAlpha is set.
+ *
+ * @param c swscale context whose reader callbacks are updated in place
+ */
+av_cold void ff_sws_init_input_lsx(SwsContext *c)
+{
+    enum AVPixelFormat srcFormat = c->srcFormat;
+
+    switch (srcFormat) {
+    case AV_PIX_FMT_YUYV422:
+        c->chrToYV12 = yuy2ToUV_lsx;
+        break;
+    case AV_PIX_FMT_YVYU422:
+        c->chrToYV12 = yvy2ToUV_lsx;
+        break;
+    case AV_PIX_FMT_UYVY422:
+        c->chrToYV12 = uyvyToUV_lsx;
+        break;
+    case AV_PIX_FMT_NV12:
+    case AV_PIX_FMT_NV16:
+    case AV_PIX_FMT_NV24:
+        c->chrToYV12 = nv12ToUV_lsx;
+        break;
+    case AV_PIX_FMT_NV21:
+    case AV_PIX_FMT_NV42:
+        c->chrToYV12 = nv21ToUV_lsx;
+        break;
+    case AV_PIX_FMT_GBRAP:
+    case AV_PIX_FMT_GBRP:
+        c->readChrPlanar = planar_rgb_to_uv_lsx;
+        /* planar RGB has a SIMD luma reader as well; install both together */
+        c->readLumPlanar = planar_rgb_to_y_lsx;
+        break;
+    default:
+        break;
+    }
+
+    if (c->needAlpha) {
+        switch (srcFormat) {
+        case AV_PIX_FMT_BGRA:
+        case AV_PIX_FMT_RGBA:
+            c->alpToYV12 = rgbaToA_lsx;
+            break;
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_ARGB:
+            c->alpToYV12 = abgrToA_lsx;
+            break;
+        default:
+            break;
+        }
+    }
+}
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 04d2553fa4..3a5a7ee856 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -63,6 +63,7 @@  av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
         ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
                                &c->yuv2nv12cX, &c->yuv2packed1,
                                &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
+        ff_sws_init_input_lsx(c);
         if (c->srcBpc == 8) {
             if (c->dstBpc <= 14) {
                 c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
@@ -73,21 +74,13 @@  av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
             c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
                                                      : ff_hscale_16_to_15_lsx;
         }
-        switch (c->srcFormat) {
-        case AV_PIX_FMT_GBRAP:
-        case AV_PIX_FMT_GBRP:
-            {
-                c->readChrPlanar = planar_rgb_to_uv_lsx;
-                c->readLumPlanar = planar_rgb_to_y_lsx;
-            }
-            break;
-        }
     }
 #if HAVE_LASX
     if (have_lasx(cpu_flags)) {
         ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
                                 &c->yuv2nv12cX, &c->yuv2packed1,
                                 &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
+        ff_sws_init_input_lasx(c);
         if (c->srcBpc == 8) {
             if (c->dstBpc <= 14) {
                 c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -98,15 +91,6 @@  av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
             c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lasx
                                                      : ff_hscale_16_to_15_lasx;
         }
-        switch (c->srcFormat) {
-        case AV_PIX_FMT_GBRAP:
-        case AV_PIX_FMT_GBRP:
-            {
-                c->readChrPlanar = planar_rgb_to_uv_lasx;
-                c->readLumPlanar = planar_rgb_to_y_lasx;
-            }
-            break;
-        }
     }
 #endif // #if HAVE_LASX
     ff_sws_init_range_convert_loongarch(c);
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index ea93881f8e..07c91bc25c 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -68,6 +68,29 @@  void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
 void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
                       const uint8_t *dither, int offset);
 
+void yuy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void yvy2ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void uyvyToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv12ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv21ToUV_lsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                  const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void abgrToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                 const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+void rgbaToA_lsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                 const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+av_cold void ff_sws_init_input_lsx(SwsContext *c);
+
 av_cold void ff_sws_init_output_lsx(SwsContext *c,
                                     yuv2planar1_fn *yuv2plane1,
                                     yuv2planarX_fn *yuv2planeX,
@@ -152,6 +175,29 @@  void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
 void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
                       const uint8_t *dither, int offset);
 
+void yuy2ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void yvy2ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void uyvyToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv12ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void nv21ToUV_lasx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
+                   const uint8_t *src2, int width, uint32_t *unused, void *opq);
+
+void abgrToA_lasx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                  const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+void rgbaToA_lasx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1,
+                  const uint8_t *unused2, int width, uint32_t *unused, void *opq);
+
+av_cold void ff_sws_init_input_lasx(SwsContext *c);
+
 av_cold void ff_sws_init_output_lasx(SwsContext *c,
                                      yuv2planar1_fn *yuv2plane1,
                                      yuv2planarX_fn *yuv2planeX,