diff mbox series

[FFmpeg-devel,1/3] swscale: [LA] Optimize range convert for yuvj420p.

Message ID 20240316030333.31269-2-yinshiyou-hf@loongson.cn
State Accepted
Commit f3fe2cb5f72a669bd737203f6f82ed7f2fa60ded
Headers show
Series [FFmpeg-devel,1/3] swscale: [LA] Optimize range convert for yuvj420p. | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Shiyou Yin March 16, 2024, 3:03 a.m. UTC
---
 libswscale/loongarch/swscale.S                | 368 ++++++++++++++++++
 libswscale/loongarch/swscale_init_loongarch.c |  33 ++
 libswscale/loongarch/swscale_loongarch.h      |  11 +
 libswscale/swscale_internal.h                 |   1 +
 libswscale/utils.c                            |   6 +-
 5 files changed, 418 insertions(+), 1 deletion(-)

Comments

陈昊 March 16, 2024, 6:22 a.m. UTC | #1
LGTM

2024-03-16 11:03:31 "yinshiyou-hf@loongson.cn" <yinshiyou-hf@loongson.cn> 写道:
> ---
>  libswscale/loongarch/swscale.S                | 368 ++++++++++++++++++
>  libswscale/loongarch/swscale_init_loongarch.c |  33 ++
>  libswscale/loongarch/swscale_loongarch.h      |  11 +
>  libswscale/swscale_internal.h                 |   1 +
>  libswscale/utils.c                            |   6 +-
>  5 files changed, 418 insertions(+), 1 deletion(-)
> 
> diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
> index aa4c5cbe28..67b1bc834d 100644
> --- a/libswscale/loongarch/swscale.S
> +++ b/libswscale/loongarch/swscale.S
> @@ -1866,3 +1866,371 @@ function ff_hscale_16_to_19_sub_lsx
>      ld.d             s8,      sp,     64
>      addi.d           sp,      sp,     72
>  endfunc
> +
> +function lumRangeFromJpeg_lsx
> +    li.w          t0,    14071
> +    li.w          t1,    33561947
> +    vreplgr2vr.h  vr0,   t0
> +    srli.w        t2,    a1,    3
> +    andi          t3,    a1,    7
> +    beqz          t2,    2f
> +1:
> +    vld           vr1,   a0,    0
> +    vreplgr2vr.w  vr2,   t1
> +    vreplgr2vr.w  vr3,   t1
> +    vmaddwev.w.h  vr2,   vr0,   vr1
> +    vmaddwod.w.h  vr3,   vr0,   vr1
> +    vsrai.w       vr2,   vr2,   14
> +    vsrai.w       vr3,   vr3,   14
> +    vpackev.h     vr1,   vr3,   vr2
> +    vst           vr1,   a0,    0
> +    addi.d        a0,    a0,    16
> +    addi.d        t2,    t2,    -1
> +    bnez          t2,    1b
> +2:
> +    beqz          t3,    4f
> +3:
> +    ld.h          t4,    a0,    0
> +    mul.w         t4,    t4,    t0
> +    add.w         t4,    t4,    t1
> +    srai.w        t4,    t4,    14
> +    st.h          t4,    a0,    0
> +    addi.d        a0,    a0,    2
> +    addi.d        t3,    t3,    -1
> +    bnez          t3,    3b
> +4:
> +endfunc
> +
> +function lumRangeFromJpeg_lasx
> +    li.w           t0,    14071
> +    li.w           t1,    33561947
> +    xvreplgr2vr.h  xr0,   t0
> +    srli.w         t2,    a1,    4
> +    andi           t3,    a1,    15
> +    beqz           t2,    2f
> +1:
> +    xvld           xr1,   a0,    0
> +    xvreplgr2vr.w  xr2,   t1
> +    xvreplgr2vr.w  xr3,   t1
> +    xvmaddwev.w.h  xr2,   xr0,   xr1
> +    xvmaddwod.w.h  xr3,   xr0,   xr1
> +    xvsrai.w       xr2,   xr2,   14
> +    xvsrai.w       xr3,   xr3,   14
> +    xvpackev.h     xr1,   xr3,   xr2
> +    xvst           xr1,   a0,    0
> +    addi.d         a0,    a0,    32
> +    addi.d         t2,    t2,    -1
> +    bnez           t2,    1b
> +2:
> +    beqz          t3,    4f
> +3:
> +    ld.h          t4,    a0,    0
> +    mul.w         t4,    t4,    t0
> +    add.w         t4,    t4,    t1
> +    srai.w        t4,    t4,    14
> +    st.h          t4,    a0,    0
> +    addi.d        a0,    a0,    2
> +    addi.d        t3,    t3,    -1
> +    bnez          t3,    3b
> +4:
> +endfunc
> +
> +function lumRangeToJpeg_lsx
> +    li.w          t0,    19077
> +    li.w          t1,    -39057361
> +    li.w          t2,    30189
> +    vreplgr2vr.h  vr0,   t0
> +    vreplgr2vr.h  vr4,   t2
> +    srli.w        t2,    a1,    3
> +    andi          t3,    a1,    7
> +    beqz          t2,    2f
> +1:
> +    vld           vr1,   a0,    0
> +    vreplgr2vr.w  vr2,   t1
> +    vreplgr2vr.w  vr3,   t1
> +    vmin.h        vr1,   vr1,   vr4
> +    vmaddwev.w.h  vr2,   vr0,   vr1
> +    vmaddwod.w.h  vr3,   vr0,   vr1
> +    vsrai.w       vr2,   vr2,   14
> +    vsrai.w       vr3,   vr3,   14
> +    vpackev.h     vr1,   vr3,   vr2
> +    vst           vr1,   a0,    0
> +    addi.d        a0,    a0,    16
> +    addi.d        t2,    t2,    -1
> +    bnez          t2,    1b
> +2:
> +    beqz          t3,    4f
> +3:
> +    ld.h          t4,    a0,    0
> +    vreplgr2vr.h  vr1,   t4
> +    vmin.h        vr1,   vr1,   vr4
> +    vpickve2gr.h  t4,    vr1,   0
> +    mul.w         t4,    t4,    t0
> +    add.w         t4,    t4,    t1
> +    srai.w        t4,    t4,    14
> +    st.h          t4,    a0,    0
> +    addi.d        a0,    a0,    2
> +    addi.d        t3,    t3,    -1
> +    bnez          t3,    3b
> +4:
> +endfunc
> +
> +function lumRangeToJpeg_lasx
> +    li.w           t0,    19077
> +    li.w           t1,    -39057361
> +    li.w           t2,    30189
> +    xvreplgr2vr.h  xr0,   t0
> +    xvreplgr2vr.h  xr4,   t2
> +    srli.w         t2,    a1,    4
> +    andi           t3,    a1,    15
> +    beqz           t2,    2f
> +1:
> +    xvld           xr1,   a0,    0
> +    xvreplgr2vr.w  xr2,   t1
> +    xvreplgr2vr.w  xr3,   t1
> +    xvmin.h        xr1,   xr1,   xr4
> +    xvmaddwev.w.h  xr2,   xr0,   xr1
> +    xvmaddwod.w.h  xr3,   xr0,   xr1
> +    xvsrai.w       xr2,   xr2,   14
> +    xvsrai.w       xr3,   xr3,   14
> +    xvpackev.h     xr1,   xr3,   xr2
> +    xvst           xr1,   a0,    0
> +    addi.d         a0,    a0,    32
> +    addi.d         t2,    t2,    -1
> +    bnez           t2,    1b
> +2:
> +    beqz           t3,    4f
> +3:
> +    ld.h           t4,    a0,    0
> +    vreplgr2vr.h   vr1,   t4
> +    vmin.h         vr1,   vr1,   vr4
> +    vpickve2gr.h   t4,    vr1,   0
> +    mul.w          t4,    t4,    t0
> +    add.w          t4,    t4,    t1
> +    srai.w         t4,    t4,    14
> +    st.h           t4,    a0,    0
> +    addi.d         a0,    a0,    2
> +    addi.d         t3,    t3,    -1
> +    bnez           t3,    3b
> +4:
> +endfunc
> +
> +function chrRangeFromJpeg_lsx
> +    li.w          t0,    1799
> +    li.w          t1,    4081085
> +    vreplgr2vr.h  vr0,   t0
> +    srli.w        t2,    a2,    3
> +    andi          t3,    a2,    7
> +    beqz          t2,    2f
> +1:
> +    vld           vr1,   a0,    0
> +    vld           vr2,   a1,    0
> +    vreplgr2vr.w  vr3,   t1
> +    vreplgr2vr.w  vr4,   t1
> +    vreplgr2vr.w  vr5,   t1
> +    vreplgr2vr.w  vr6,   t1
> +    vmaddwev.w.h  vr3,   vr0,   vr1
> +    vmaddwod.w.h  vr4,   vr0,   vr1
> +    vmaddwev.w.h  vr5,   vr0,   vr2
> +    vmaddwod.w.h  vr6,   vr0,   vr2
> +    vsrai.w       vr3,   vr3,   11
> +    vsrai.w       vr4,   vr4,   11
> +    vsrai.w       vr5,   vr5,   11
> +    vsrai.w       vr6,   vr6,   11
> +    vpackev.h     vr1,   vr4,   vr3
> +    vpackev.h     vr2,   vr6,   vr5
> +    vst           vr1,   a0,    0
> +    vst           vr2,   a1,    0
> +    addi.d        a0,    a0,    16
> +    addi.d        a1,    a1,    16
> +    addi.d        t2,    t2,    -1
> +    bnez          t2,    1b
> +2:
> +    beqz          t3,    4f
> +3:
> +    ld.h          t4,    a0,    0
> +    ld.h          t5,    a1,    0
> +    mul.w         t4,    t4,    t0
> +    mul.w         t5,    t5,    t0
> +    add.w         t4,    t4,    t1
> +    add.w         t5,    t5,    t1
> +    srai.w        t4,    t4,    11
> +    srai.w        t5,    t5,    11
> +    st.h          t4,    a0,    0
> +    st.h          t5,    a1,    0
> +    addi.d        a0,    a0,    2
> +    addi.d        a1,    a1,    2
> +    addi.d        t3,    t3,    -1
> +    bnez          t3,    3b
> +4:
> +endfunc
> +
> +function chrRangeFromJpeg_lasx
> +    li.w           t0,    1799
> +    li.w           t1,    4081085
> +    xvreplgr2vr.h  xr0,   t0
> +    srli.w         t2,    a2,    4
> +    andi           t3,    a2,    15
> +    beqz           t2,    2f
> +1:
> +    xvld           xr1,   a0,    0
> +    xvld           xr2,   a1,    0
> +    xvreplgr2vr.w  xr3,   t1
> +    xvreplgr2vr.w  xr4,   t1
> +    xvreplgr2vr.w  xr5,   t1
> +    xvreplgr2vr.w  xr6,   t1
> +    xvmaddwev.w.h  xr3,   xr0,   xr1
> +    xvmaddwod.w.h  xr4,   xr0,   xr1
> +    xvmaddwev.w.h  xr5,   xr0,   xr2
> +    xvmaddwod.w.h  xr6,   xr0,   xr2
> +    xvsrai.w       xr3,   xr3,   11
> +    xvsrai.w       xr4,   xr4,   11
> +    xvsrai.w       xr5,   xr5,   11
> +    xvsrai.w       xr6,   xr6,   11
> +    xvpackev.h     xr1,   xr4,   xr3
> +    xvpackev.h     xr2,   xr6,   xr5
> +    xvst           xr1,   a0,    0
> +    xvst           xr2,   a1,    0
> +    addi.d         a0,    a0,    32
> +    addi.d         a1,    a1,    32
> +    addi.d         t2,    t2,    -1
> +    bnez           t2,    1b
> +2:
> +    beqz          t3,    4f
> +3:
> +    ld.h          t4,    a0,    0
> +    ld.h          t5,    a1,    0
> +    mul.w         t4,    t4,    t0
> +    mul.w         t5,    t5,    t0
> +    add.w         t4,    t4,    t1
> +    add.w         t5,    t5,    t1
> +    srai.w        t4,    t4,    11
> +    srai.w        t5,    t5,    11
> +    st.h          t4,    a0,    0
> +    st.h          t5,    a1,    0
> +    addi.d        a0,    a0,    2
> +    addi.d        a1,    a1,    2
> +    addi.d        t3,    t3,    -1
> +    bnez          t3,    3b
> +4:
> +endfunc
> +
> +function chrRangeToJpeg_lsx
> +    li.w          t0,    4663
> +    li.w          t1,    -9289992
> +    li.w          t2,    30775
> +    vreplgr2vr.h  vr0,   t0
> +    vreplgr2vr.h  vr7,   t2
> +    srli.w        t2,    a2,    3
> +    andi          t3,    a2,    7
> +    beqz          t2,    2f
> +1:
> +    vld           vr1,   a0,    0
> +    vld           vr2,   a1,    0
> +    vreplgr2vr.w  vr3,   t1
> +    vreplgr2vr.w  vr4,   t1
> +    vreplgr2vr.w  vr5,   t1
> +    vreplgr2vr.w  vr6,   t1
> +    vmin.h        vr1,   vr1,   vr7
> +    vmin.h        vr2,   vr2,   vr7
> +    vmaddwev.w.h  vr3,   vr0,   vr1
> +    vmaddwod.w.h  vr4,   vr0,   vr1
> +    vmaddwev.w.h  vr5,   vr0,   vr2
> +    vmaddwod.w.h  vr6,   vr0,   vr2
> +    vsrai.w       vr3,   vr3,   12
> +    vsrai.w       vr4,   vr4,   12
> +    vsrai.w       vr5,   vr5,   12
> +    vsrai.w       vr6,   vr6,   12
> +    vpackev.h     vr1,   vr4,   vr3
> +    vpackev.h     vr2,   vr6,   vr5
> +    vst           vr1,   a0,    0
> +    vst           vr2,   a1,    0
> +    addi.d        a0,    a0,    16
> +    addi.d        a1,    a1,    16
> +    addi.d        t2,    t2,    -1
> +    bnez          t2,    1b
> +2:
> +    beqz          t3,    4f
> +3:
> +    ld.h          t4,    a0,    0
> +    ld.h          t5,    a1,    0
> +    vreplgr2vr.h  vr1,   t4
> +    vreplgr2vr.h  vr2,   t5
> +    vmin.h        vr1,   vr1,   vr7
> +    vmin.h        vr2,   vr2,   vr7
> +    vpickve2gr.h  t4,    vr1,   0
> +    vpickve2gr.h  t5,    vr2,   0
> +    mul.w         t4,    t4,    t0
> +    mul.w         t5,    t5,    t0
> +    add.w         t4,    t4,    t1
> +    add.w         t5,    t5,    t1
> +    srai.w        t4,    t4,    12
> +    srai.w        t5,    t5,    12
> +    st.h          t4,    a0,    0
> +    st.h          t5,    a1,    0
> +    addi.d        a0,    a0,    2
> +    addi.d        a1,    a1,    2
> +    addi.d        t3,    t3,    -1
> +    bnez          t3,    3b
> +4:
> +endfunc
> +
> +function chrRangeToJpeg_lasx
> +    li.w           t0,    4663
> +    li.w           t1,    -9289992
> +    li.w           t2,    30775
> +    xvreplgr2vr.h  xr0,   t0
> +    xvreplgr2vr.h  xr7,   t2
> +    srli.w         t2,    a2,    4
> +    andi           t3,    a2,    15
> +    beqz           t2,    2f
> +1:
> +    xvld           xr1,   a0,    0
> +    xvld           xr2,   a1,    0
> +    xvreplgr2vr.w  xr3,   t1
> +    xvreplgr2vr.w  xr4,   t1
> +    xvreplgr2vr.w  xr5,   t1
> +    xvreplgr2vr.w  xr6,   t1
> +    xvmin.h        xr1,   xr1,   xr7
> +    xvmin.h        xr2,   xr2,   xr7
> +    xvmaddwev.w.h  xr3,   xr0,   xr1
> +    xvmaddwod.w.h  xr4,   xr0,   xr1
> +    xvmaddwev.w.h  xr5,   xr0,   xr2
> +    xvmaddwod.w.h  xr6,   xr0,   xr2
> +    xvsrai.w       xr3,   xr3,   12
> +    xvsrai.w       xr4,   xr4,   12
> +    xvsrai.w       xr5,   xr5,   12
> +    xvsrai.w       xr6,   xr6,   12
> +    xvpackev.h     xr1,   xr4,   xr3
> +    xvpackev.h     xr2,   xr6,   xr5
> +    xvst           xr1,   a0,    0
> +    xvst           xr2,   a1,    0
> +    addi.d         a0,    a0,    32
> +    addi.d         a1,    a1,    32
> +    addi.d         t2,    t2,    -1
> +    bnez           t2,    1b
> +2:
> +    beqz          t3,    4f
> +3:
> +    ld.h          t4,    a0,    0
> +    ld.h          t5,    a1,    0
> +    vreplgr2vr.h  vr1,   t4
> +    vreplgr2vr.h  vr2,   t5
> +    vmin.h        vr1,   vr1,   vr7
> +    vmin.h        vr2,   vr2,   vr7
> +    vpickve2gr.h  t4,    vr1,   0
> +    vpickve2gr.h  t5,    vr2,   0
> +    mul.w         t4,    t4,    t0
> +    mul.w         t5,    t5,    t0
> +    add.w         t4,    t4,    t1
> +    add.w         t5,    t5,    t1
> +    srai.w        t4,    t4,    12
> +    srai.w        t5,    t5,    12
> +    st.h          t4,    a0,    0
> +    st.h          t5,    a1,    0
> +    addi.d        a0,    a0,    2
> +    addi.d        a1,    a1,    2
> +    addi.d        t3,    t3,    -1
> +    bnez          t3,    3b
> +4:
> +endfunc
> diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
> index 53e4f970b6..6d2786c55f 100644
> --- a/libswscale/loongarch/swscale_init_loongarch.c
> +++ b/libswscale/loongarch/swscale_init_loongarch.c
> @@ -24,6 +24,38 @@
>  #include "libswscale/rgb2rgb.h"
>  #include "libavutil/loongarch/cpu.h"
>  
> +av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (have_lsx(cpu_flags)) {
> +        if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> +            if (c->dstBpc <= 14) {
> +                if (c->srcRange) {
> +                    c->lumConvertRange = lumRangeFromJpeg_lsx;
> +                    c->chrConvertRange = chrRangeFromJpeg_lsx;
> +                } else {
> +                    c->lumConvertRange = lumRangeToJpeg_lsx;
> +                    c->chrConvertRange = chrRangeToJpeg_lsx;
> +                }
> +            }
> +        }
> +    }
> +    if (have_lasx(cpu_flags)) {
> +        if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
> +            if (c->dstBpc <= 14) {
> +                if (c->srcRange) {
> +                    c->lumConvertRange = lumRangeFromJpeg_lasx;
> +                    c->chrConvertRange = chrRangeFromJpeg_lasx;
> +                } else {
> +                    c->lumConvertRange = lumRangeToJpeg_lasx;
> +                    c->chrConvertRange = chrRangeToJpeg_lasx;
> +                }
> +            }
> +        }
> +    }
> +}
> +
>  av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
>  {
>      int cpu_flags = av_get_cpu_flags();
> @@ -77,6 +109,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
>              c->yuv2planeX = ff_yuv2planeX_8_lasx;
>      }
>  #endif // #if HAVE_LASX
> +    ff_sws_init_range_convert_loongarch(c);
>  }
>  
>  av_cold void rgb2rgb_init_loongarch(void)
> diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
> index 0514abae21..c96b085982 100644
> --- a/libswscale/loongarch/swscale_loongarch.h
> +++ b/libswscale/loongarch/swscale_loongarch.h
> @@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
>                                  const uint8_t *_src, const int16_t *filter,
>                                  const int32_t *filterPos, int filterSize, int sh);
>  
> +void lumRangeFromJpeg_lsx(int16_t *dst, int width);
> +void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
> +void lumRangeToJpeg_lsx(int16_t *dst, int width);
> +void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
> +
>  void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
>                            int width, int32_t *rgb2yuv, void *opq);
>  
> @@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
>                               const uint8_t *_src, const int16_t *filter,
>                               const int32_t *filterPos, int filterSize);
>  
> +void lumRangeFromJpeg_lasx(int16_t *dst, int width);
> +void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
> +void lumRangeToJpeg_lasx(int16_t *dst, int width);
> +void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
> +
>  void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
>                             int width, int32_t *rgb2yuv, void *opq);
>  
> @@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
>                            const uint8_t *dither, int offset);
>  
>  av_cold void ff_sws_init_output_lasx(SwsContext *c);
> +
>  #endif // #if HAVE_LASX
>  
>  #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index abeebbb002..0db581acf8 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -695,6 +695,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
>  void ff_updateMMXDitherTables(SwsContext *c, int dstY);
>  
>  av_cold void ff_sws_init_range_convert(SwsContext *c);
> +av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
>  
>  SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
>  SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index ab8a68e241..47db65ef0e 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -1049,8 +1049,12 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
>      c->srcRange   = srcRange;
>      c->dstRange   = dstRange;
>  
> -    if (need_reinit)
> +    if (need_reinit) {
>          ff_sws_init_range_convert(c);
> +#if ARCH_LOONGARCH64
> +        ff_sws_init_range_convert_loongarch(c);
> +#endif
> +    }
>  
>      c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
>      c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
> -- 
> 2.20.1
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".


本邮件及其附件含有龙芯中科的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。 
This email and its attachments contain confidential information from Loongson Technology, which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this email in error, please notify the sender by phone or email immediately and delete it.
diff mbox series

Patch

diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
index aa4c5cbe28..67b1bc834d 100644
--- a/libswscale/loongarch/swscale.S
+++ b/libswscale/loongarch/swscale.S
@@ -1866,3 +1866,371 @@  function ff_hscale_16_to_19_sub_lsx
     ld.d             s8,      sp,     64
     addi.d           sp,      sp,     72
 endfunc
+
+function lumRangeFromJpeg_lsx    /* in place: dst[i] = (dst[i] * 14071 + 33561947) >> 14; a0 = dst, a1 = width (per the C prototype) */
+    li.w          t0,    14071    /* multiplier */
+    li.w          t1,    33561947    /* 32-bit bias added before the shift */
+    vreplgr2vr.h  vr0,   t0    /* multiplier replicated into every 16-bit lane */
+    srli.w        t2,    a1,    3    /* t2 = number of full 8-sample vectors */
+    andi          t3,    a1,    7    /* t3 = leftover samples handled by the scalar tail */
+    beqz          t2,    2f    /* fewer than 8 samples: skip the vector loop */
+1:    /* vector loop: 8 int16 samples (16 bytes) per pass */
+    vld           vr1,   a0,    0
+    vreplgr2vr.w  vr2,   t1    /* bias reloaded each pass: vmaddw* accumulates into its destination */
+    vreplgr2vr.w  vr3,   t1
+    vmaddwev.w.h  vr2,   vr0,   vr1    /* even 16-bit lanes: 32-bit acc += multiplier * sample */
+    vmaddwod.w.h  vr3,   vr0,   vr1    /* odd 16-bit lanes, likewise */
+    vsrai.w       vr2,   vr2,   14
+    vsrai.w       vr3,   vr3,   14
+    vpackev.h     vr1,   vr3,   vr2    /* re-interleave the low halves of the even/odd results */
+    vst           vr1,   a0,    0
+    addi.d        a0,    a0,    16
+    addi.d        t2,    t2,    -1
+    bnez          t2,    1b
+2:
+    beqz          t3,    4f    /* no leftover samples */
+3:    /* scalar tail: one int16 sample per pass, same formula */
+    ld.h          t4,    a0,    0
+    mul.w         t4,    t4,    t0
+    add.w         t4,    t4,    t1
+    srai.w        t4,    t4,    14
+    st.h          t4,    a0,    0
+    addi.d        a0,    a0,    2
+    addi.d        t3,    t3,    -1
+    bnez          t3,    3b
+4:
+endfunc
+
+function lumRangeFromJpeg_lasx    /* in place: dst[i] = (dst[i] * 14071 + 33561947) >> 14; a0 = dst, a1 = width (per the C prototype) */
+    li.w           t0,    14071    /* multiplier */
+    li.w           t1,    33561947    /* 32-bit bias added before the shift */
+    xvreplgr2vr.h  xr0,   t0    /* multiplier replicated into every 16-bit lane */
+    srli.w         t2,    a1,    4    /* t2 = number of full 16-sample vectors */
+    andi           t3,    a1,    15    /* t3 = leftover samples handled by the scalar tail */
+    beqz           t2,    2f    /* fewer than 16 samples: skip the vector loop */
+1:    /* vector loop: 16 int16 samples (32 bytes) per pass */
+    xvld           xr1,   a0,    0
+    xvreplgr2vr.w  xr2,   t1    /* bias reloaded each pass: xvmaddw* accumulates into its destination */
+    xvreplgr2vr.w  xr3,   t1
+    xvmaddwev.w.h  xr2,   xr0,   xr1    /* even 16-bit lanes: 32-bit acc += multiplier * sample */
+    xvmaddwod.w.h  xr3,   xr0,   xr1    /* odd 16-bit lanes, likewise */
+    xvsrai.w       xr2,   xr2,   14
+    xvsrai.w       xr3,   xr3,   14
+    xvpackev.h     xr1,   xr3,   xr2    /* re-interleave the low halves of the even/odd results */
+    xvst           xr1,   a0,    0
+    addi.d         a0,    a0,    32
+    addi.d         t2,    t2,    -1
+    bnez           t2,    1b
+2:
+    beqz          t3,    4f    /* no leftover samples */
+3:    /* scalar tail: one int16 sample per pass, same formula */
+    ld.h          t4,    a0,    0
+    mul.w         t4,    t4,    t0
+    add.w         t4,    t4,    t1
+    srai.w        t4,    t4,    14
+    st.h          t4,    a0,    0
+    addi.d        a0,    a0,    2
+    addi.d        t3,    t3,    -1
+    bnez          t3,    3b
+4:
+endfunc
+
+function lumRangeToJpeg_lsx    /* in place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14; a0 = dst, a1 = width (per the C prototype) */
+    li.w          t0,    19077    /* multiplier */
+    li.w          t1,    -39057361    /* 32-bit bias added before the shift */
+    li.w          t2,    30189    /* upper clamp applied to the input sample */
+    vreplgr2vr.h  vr0,   t0    /* multiplier replicated into every 16-bit lane */
+    vreplgr2vr.h  vr4,   t2    /* clamp value replicated into every 16-bit lane */
+    srli.w        t2,    a1,    3    /* t2 is reused: number of full 8-sample vectors */
+    andi          t3,    a1,    7    /* t3 = leftover samples handled by the tail */
+    beqz          t2,    2f    /* fewer than 8 samples: skip the vector loop */
+1:    /* vector loop: 8 int16 samples (16 bytes) per pass */
+    vld           vr1,   a0,    0
+    vreplgr2vr.w  vr2,   t1    /* bias reloaded each pass: vmaddw* accumulates into its destination */
+    vreplgr2vr.w  vr3,   t1
+    vmin.h        vr1,   vr1,   vr4    /* signed clamp of the samples to 30189 */
+    vmaddwev.w.h  vr2,   vr0,   vr1    /* even 16-bit lanes: 32-bit acc += multiplier * sample */
+    vmaddwod.w.h  vr3,   vr0,   vr1    /* odd 16-bit lanes, likewise */
+    vsrai.w       vr2,   vr2,   14
+    vsrai.w       vr3,   vr3,   14
+    vpackev.h     vr1,   vr3,   vr2    /* re-interleave the low halves of the even/odd results */
+    vst           vr1,   a0,    0
+    addi.d        a0,    a0,    16
+    addi.d        t2,    t2,    -1
+    bnez          t2,    1b
+2:
+    beqz          t3,    4f    /* no leftover samples */
+3:    /* tail: one int16 sample per pass */
+    ld.h          t4,    a0,    0
+    vreplgr2vr.h  vr1,   t4    /* the clamp is done through the vector unit: broadcast, ... */
+    vmin.h        vr1,   vr1,   vr4    /* ... min against 30189, ... */
+    vpickve2gr.h  t4,    vr1,   0    /* ... and read lane 0 back as a signed value */
+    mul.w         t4,    t4,    t0
+    add.w         t4,    t4,    t1
+    srai.w        t4,    t4,    14
+    st.h          t4,    a0,    0
+    addi.d        a0,    a0,    2
+    addi.d        t3,    t3,    -1
+    bnez          t3,    3b
+4:
+endfunc
+
+function lumRangeToJpeg_lasx    /* in place: dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14; a0 = dst, a1 = width (per the C prototype) */
+    li.w           t0,    19077    /* multiplier */
+    li.w           t1,    -39057361    /* 32-bit bias added before the shift */
+    li.w           t2,    30189    /* upper clamp applied to the input sample */
+    xvreplgr2vr.h  xr0,   t0    /* multiplier replicated into every 16-bit lane */
+    xvreplgr2vr.h  xr4,   t2    /* clamp value replicated into every 16-bit lane */
+    srli.w         t2,    a1,    4    /* t2 is reused: number of full 16-sample vectors */
+    andi           t3,    a1,    15    /* t3 = leftover samples handled by the tail */
+    beqz           t2,    2f    /* fewer than 16 samples: skip the vector loop */
+1:    /* vector loop: 16 int16 samples (32 bytes) per pass */
+    xvld           xr1,   a0,    0
+    xvreplgr2vr.w  xr2,   t1    /* bias reloaded each pass: xvmaddw* accumulates into its destination */
+    xvreplgr2vr.w  xr3,   t1
+    xvmin.h        xr1,   xr1,   xr4    /* signed clamp of the samples to 30189 */
+    xvmaddwev.w.h  xr2,   xr0,   xr1    /* even 16-bit lanes: 32-bit acc += multiplier * sample */
+    xvmaddwod.w.h  xr3,   xr0,   xr1    /* odd 16-bit lanes, likewise */
+    xvsrai.w       xr2,   xr2,   14
+    xvsrai.w       xr3,   xr3,   14
+    xvpackev.h     xr1,   xr3,   xr2    /* re-interleave the low halves of the even/odd results */
+    xvst           xr1,   a0,    0
+    addi.d         a0,    a0,    32
+    addi.d         t2,    t2,    -1
+    bnez           t2,    1b
+2:
+    beqz           t3,    4f    /* no leftover samples */
+3:    /* tail: one int16 sample per pass, using 128-bit LSX ops */
+    ld.h           t4,    a0,    0
+    vreplgr2vr.h   vr1,   t4    /* the clamp is done through the vector unit: broadcast, ... */
+    vmin.h         vr1,   vr1,   vr4    /* ... min against the clamp; relies on vr4 being the low half of xr4 set above */
+    vpickve2gr.h   t4,    vr1,   0    /* ... and read lane 0 back as a signed value */
+    mul.w          t4,    t4,    t0
+    add.w          t4,    t4,    t1
+    srai.w         t4,    t4,    14
+    st.h           t4,    a0,    0
+    addi.d         a0,    a0,    2
+    addi.d         t3,    t3,    -1
+    bnez           t3,    3b
+4:
+endfunc
+
+function chrRangeFromJpeg_lsx    /* in place on both planes: x = (x * 1799 + 4081085) >> 11; a0 = dstU, a1 = dstV, a2 = width (per the C prototype) */
+    li.w          t0,    1799    /* multiplier */
+    li.w          t1,    4081085    /* 32-bit bias added before the shift */
+    vreplgr2vr.h  vr0,   t0    /* multiplier replicated into every 16-bit lane */
+    srli.w        t2,    a2,    3    /* t2 = number of full 8-sample vectors */
+    andi          t3,    a2,    7    /* t3 = leftover samples handled by the scalar tail */
+    beqz          t2,    2f    /* fewer than 8 samples: skip the vector loop */
+1:    /* vector loop: 8 int16 samples of U and 8 of V per pass */
+    vld           vr1,   a0,    0    /* U samples */
+    vld           vr2,   a1,    0    /* V samples */
+    vreplgr2vr.w  vr3,   t1    /* bias reloaded each pass: vmaddw* accumulates into its destination */
+    vreplgr2vr.w  vr4,   t1
+    vreplgr2vr.w  vr5,   t1
+    vreplgr2vr.w  vr6,   t1
+    vmaddwev.w.h  vr3,   vr0,   vr1    /* U, even 16-bit lanes: 32-bit acc += multiplier * sample */
+    vmaddwod.w.h  vr4,   vr0,   vr1    /* U, odd lanes */
+    vmaddwev.w.h  vr5,   vr0,   vr2    /* V, even lanes */
+    vmaddwod.w.h  vr6,   vr0,   vr2    /* V, odd lanes */
+    vsrai.w       vr3,   vr3,   11
+    vsrai.w       vr4,   vr4,   11
+    vsrai.w       vr5,   vr5,   11
+    vsrai.w       vr6,   vr6,   11
+    vpackev.h     vr1,   vr4,   vr3    /* U: re-interleave the low halves of the even/odd results */
+    vpackev.h     vr2,   vr6,   vr5    /* V: likewise */
+    vst           vr1,   a0,    0
+    vst           vr2,   a1,    0
+    addi.d        a0,    a0,    16
+    addi.d        a1,    a1,    16
+    addi.d        t2,    t2,    -1
+    bnez          t2,    1b
+2:
+    beqz          t3,    4f    /* no leftover samples */
+3:    /* scalar tail: one U and one V sample per pass, same formula */
+    ld.h          t4,    a0,    0
+    ld.h          t5,    a1,    0
+    mul.w         t4,    t4,    t0
+    mul.w         t5,    t5,    t0
+    add.w         t4,    t4,    t1
+    add.w         t5,    t5,    t1
+    srai.w        t4,    t4,    11
+    srai.w        t5,    t5,    11
+    st.h          t4,    a0,    0
+    st.h          t5,    a1,    0
+    addi.d        a0,    a0,    2
+    addi.d        a1,    a1,    2
+    addi.d        t3,    t3,    -1
+    bnez          t3,    3b
+4:
+endfunc
+
+function chrRangeFromJpeg_lasx    /* in place on both planes: x = (x * 1799 + 4081085) >> 11; a0 = dstU, a1 = dstV, a2 = width (per the C prototype) */
+    li.w           t0,    1799    /* multiplier */
+    li.w           t1,    4081085    /* 32-bit bias added before the shift */
+    xvreplgr2vr.h  xr0,   t0    /* multiplier replicated into every 16-bit lane */
+    srli.w         t2,    a2,    4    /* t2 = number of full 16-sample vectors */
+    andi           t3,    a2,    15    /* t3 = leftover samples handled by the scalar tail */
+    beqz           t2,    2f    /* fewer than 16 samples: skip the vector loop */
+1:    /* vector loop: 16 int16 samples of U and 16 of V per pass */
+    xvld           xr1,   a0,    0    /* U samples */
+    xvld           xr2,   a1,    0    /* V samples */
+    xvreplgr2vr.w  xr3,   t1    /* bias reloaded each pass: xvmaddw* accumulates into its destination */
+    xvreplgr2vr.w  xr4,   t1
+    xvreplgr2vr.w  xr5,   t1
+    xvreplgr2vr.w  xr6,   t1
+    xvmaddwev.w.h  xr3,   xr0,   xr1    /* U, even 16-bit lanes: 32-bit acc += multiplier * sample */
+    xvmaddwod.w.h  xr4,   xr0,   xr1    /* U, odd lanes */
+    xvmaddwev.w.h  xr5,   xr0,   xr2    /* V, even lanes */
+    xvmaddwod.w.h  xr6,   xr0,   xr2    /* V, odd lanes */
+    xvsrai.w       xr3,   xr3,   11
+    xvsrai.w       xr4,   xr4,   11
+    xvsrai.w       xr5,   xr5,   11
+    xvsrai.w       xr6,   xr6,   11
+    xvpackev.h     xr1,   xr4,   xr3    /* U: re-interleave the low halves of the even/odd results */
+    xvpackev.h     xr2,   xr6,   xr5    /* V: likewise */
+    xvst           xr1,   a0,    0
+    xvst           xr2,   a1,    0
+    addi.d         a0,    a0,    32
+    addi.d         a1,    a1,    32
+    addi.d         t2,    t2,    -1
+    bnez           t2,    1b
+2:
+    beqz          t3,    4f    /* no leftover samples */
+3:    /* scalar tail: one U and one V sample per pass, same formula */
+    ld.h          t4,    a0,    0
+    ld.h          t5,    a1,    0
+    mul.w         t4,    t4,    t0
+    mul.w         t5,    t5,    t0
+    add.w         t4,    t4,    t1
+    add.w         t5,    t5,    t1
+    srai.w        t4,    t4,    11
+    srai.w        t5,    t5,    11
+    st.h          t4,    a0,    0
+    st.h          t5,    a1,    0
+    addi.d        a0,    a0,    2
+    addi.d        a1,    a1,    2
+    addi.d        t3,    t3,    -1
+    bnez          t3,    3b
+4:
+endfunc
+
+function chrRangeToJpeg_lsx    /* in place on both planes: x = (min(x, 30775) * 4663 - 9289992) >> 12; a0 = dstU, a1 = dstV, a2 = width (per the C prototype) */
+    li.w          t0,    4663    /* multiplier */
+    li.w          t1,    -9289992    /* 32-bit bias added before the shift */
+    li.w          t2,    30775    /* upper clamp applied to the input sample */
+    vreplgr2vr.h  vr0,   t0    /* multiplier replicated into every 16-bit lane */
+    vreplgr2vr.h  vr7,   t2    /* clamp value replicated into every 16-bit lane */
+    srli.w        t2,    a2,    3    /* t2 is reused: number of full 8-sample vectors */
+    andi          t3,    a2,    7    /* t3 = leftover samples handled by the tail */
+    beqz          t2,    2f    /* fewer than 8 samples: skip the vector loop */
+1:    /* vector loop: 8 int16 samples of U and 8 of V per pass */
+    vld           vr1,   a0,    0    /* U samples */
+    vld           vr2,   a1,    0    /* V samples */
+    vreplgr2vr.w  vr3,   t1    /* bias reloaded each pass: vmaddw* accumulates into its destination */
+    vreplgr2vr.w  vr4,   t1
+    vreplgr2vr.w  vr5,   t1
+    vreplgr2vr.w  vr6,   t1
+    vmin.h        vr1,   vr1,   vr7    /* signed clamp of U to 30775 */
+    vmin.h        vr2,   vr2,   vr7    /* signed clamp of V to 30775 */
+    vmaddwev.w.h  vr3,   vr0,   vr1    /* U, even 16-bit lanes: 32-bit acc += multiplier * sample */
+    vmaddwod.w.h  vr4,   vr0,   vr1    /* U, odd lanes */
+    vmaddwev.w.h  vr5,   vr0,   vr2    /* V, even lanes */
+    vmaddwod.w.h  vr6,   vr0,   vr2    /* V, odd lanes */
+    vsrai.w       vr3,   vr3,   12
+    vsrai.w       vr4,   vr4,   12
+    vsrai.w       vr5,   vr5,   12
+    vsrai.w       vr6,   vr6,   12
+    vpackev.h     vr1,   vr4,   vr3    /* U: re-interleave the low halves of the even/odd results */
+    vpackev.h     vr2,   vr6,   vr5    /* V: likewise */
+    vst           vr1,   a0,    0
+    vst           vr2,   a1,    0
+    addi.d        a0,    a0,    16
+    addi.d        a1,    a1,    16
+    addi.d        t2,    t2,    -1
+    bnez          t2,    1b
+2:
+    beqz          t3,    4f    /* no leftover samples */
+3:    /* tail: one U and one V sample per pass */
+    ld.h          t4,    a0,    0
+    ld.h          t5,    a1,    0
+    vreplgr2vr.h  vr1,   t4    /* the clamp is done through the vector unit: broadcast, ... */
+    vreplgr2vr.h  vr2,   t5
+    vmin.h        vr1,   vr1,   vr7    /* ... min against 30775, ... */
+    vmin.h        vr2,   vr2,   vr7
+    vpickve2gr.h  t4,    vr1,   0    /* ... and read lane 0 back as a signed value */
+    vpickve2gr.h  t5,    vr2,   0
+    mul.w         t4,    t4,    t0
+    mul.w         t5,    t5,    t0
+    add.w         t4,    t4,    t1
+    add.w         t5,    t5,    t1
+    srai.w        t4,    t4,    12
+    srai.w        t5,    t5,    12
+    st.h          t4,    a0,    0
+    st.h          t5,    a1,    0
+    addi.d        a0,    a0,    2
+    addi.d        a1,    a1,    2
+    addi.d        t3,    t3,    -1
+    bnez          t3,    3b
+4:
+endfunc
+
+function chrRangeToJpeg_lasx    /* in place on both planes: x = (min(x, 30775) * 4663 - 9289992) >> 12; a0 = dstU, a1 = dstV, a2 = width (per the C prototype) */
+    li.w           t0,    4663    /* multiplier */
+    li.w           t1,    -9289992    /* 32-bit bias added before the shift */
+    li.w           t2,    30775    /* upper clamp applied to the input sample */
+    xvreplgr2vr.h  xr0,   t0    /* multiplier replicated into every 16-bit lane */
+    xvreplgr2vr.h  xr7,   t2    /* clamp value replicated into every 16-bit lane */
+    srli.w         t2,    a2,    4    /* t2 is reused: number of full 16-sample vectors */
+    andi           t3,    a2,    15    /* t3 = leftover samples handled by the tail */
+    beqz           t2,    2f    /* fewer than 16 samples: skip the vector loop */
+1:    /* vector loop: 16 int16 samples of U and 16 of V per pass */
+    xvld           xr1,   a0,    0    /* U samples */
+    xvld           xr2,   a1,    0    /* V samples */
+    xvreplgr2vr.w  xr3,   t1    /* bias reloaded each pass: xvmaddw* accumulates into its destination */
+    xvreplgr2vr.w  xr4,   t1
+    xvreplgr2vr.w  xr5,   t1
+    xvreplgr2vr.w  xr6,   t1
+    xvmin.h        xr1,   xr1,   xr7    /* signed clamp of U to 30775 */
+    xvmin.h        xr2,   xr2,   xr7    /* signed clamp of V to 30775 */
+    xvmaddwev.w.h  xr3,   xr0,   xr1    /* U, even 16-bit lanes: 32-bit acc += multiplier * sample */
+    xvmaddwod.w.h  xr4,   xr0,   xr1    /* U, odd lanes */
+    xvmaddwev.w.h  xr5,   xr0,   xr2    /* V, even lanes */
+    xvmaddwod.w.h  xr6,   xr0,   xr2    /* V, odd lanes */
+    xvsrai.w       xr3,   xr3,   12
+    xvsrai.w       xr4,   xr4,   12
+    xvsrai.w       xr5,   xr5,   12
+    xvsrai.w       xr6,   xr6,   12
+    xvpackev.h     xr1,   xr4,   xr3    /* U: re-interleave the low halves of the even/odd results */
+    xvpackev.h     xr2,   xr6,   xr5    /* V: likewise */
+    xvst           xr1,   a0,    0
+    xvst           xr2,   a1,    0
+    addi.d         a0,    a0,    32
+    addi.d         a1,    a1,    32
+    addi.d         t2,    t2,    -1
+    bnez           t2,    1b
+2:
+    beqz          t3,    4f    /* no leftover samples */
+3:    /* tail: one U and one V sample per pass, using 128-bit LSX ops */
+    ld.h          t4,    a0,    0
+    ld.h          t5,    a1,    0
+    vreplgr2vr.h  vr1,   t4    /* the clamp is done through the vector unit: broadcast, ... */
+    vreplgr2vr.h  vr2,   t5
+    vmin.h        vr1,   vr1,   vr7    /* ... min against the clamp; relies on vr7 being the low half of xr7 set above */
+    vmin.h        vr2,   vr2,   vr7
+    vpickve2gr.h  t4,    vr1,   0    /* ... and read lane 0 back as a signed value */
+    vpickve2gr.h  t5,    vr2,   0
+    mul.w         t4,    t4,    t0
+    mul.w         t5,    t5,    t0
+    add.w         t4,    t4,    t1
+    add.w         t5,    t5,    t1
+    srai.w        t4,    t4,    12
+    srai.w        t5,    t5,    12
+    st.h          t4,    a0,    0
+    st.h          t5,    a1,    0
+    addi.d        a0,    a0,    2
+    addi.d        a1,    a1,    2
+    addi.d        t3,    t3,    -1
+    bnez          t3,    3b
+4:
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 53e4f970b6..6d2786c55f 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -24,6 +24,38 @@ 
 #include "libswscale/rgb2rgb.h"
 #include "libavutil/loongarch/cpu.h"
 
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_lsx(cpu_flags)) {
+        if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+            if (c->dstBpc <= 14) {
+                if (c->srcRange) {
+                    c->lumConvertRange = lumRangeFromJpeg_lsx;
+                    c->chrConvertRange = chrRangeFromJpeg_lsx;
+                } else {
+                    c->lumConvertRange = lumRangeToJpeg_lsx;
+                    c->chrConvertRange = chrRangeToJpeg_lsx;
+                }
+            }
+        }
+    }
+    if (have_lasx(cpu_flags)) {
+        if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+            if (c->dstBpc <= 14) {
+                if (c->srcRange) {
+                    c->lumConvertRange = lumRangeFromJpeg_lasx;
+                    c->chrConvertRange = chrRangeFromJpeg_lasx;
+                } else {
+                    c->lumConvertRange = lumRangeToJpeg_lasx;
+                    c->chrConvertRange = chrRangeToJpeg_lasx;
+                }
+            }
+        }
+    }
+}
+
 av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -77,6 +109,7 @@  av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
             c->yuv2planeX = ff_yuv2planeX_8_lasx;
     }
 #endif // #if HAVE_LASX
+    ff_sws_init_range_convert_loongarch(c);
 }
 
 av_cold void rgb2rgb_init_loongarch(void)
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 0514abae21..c96b085982 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -50,6 +50,11 @@  void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
                                 const uint8_t *_src, const int16_t *filter,
                                 const int32_t *filterPos, int filterSize, int sh);
 
+void lumRangeFromJpeg_lsx(int16_t *dst, int width);
+void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lsx(int16_t *dst, int width);
+void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+
 void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
                           int width, int32_t *rgb2yuv, void *opq);
 
@@ -97,6 +102,11 @@  void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
                              const uint8_t *_src, const int16_t *filter,
                              const int32_t *filterPos, int filterSize);
 
+void lumRangeFromJpeg_lasx(int16_t *dst, int width);
+void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lasx(int16_t *dst, int width);
+void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+
 void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
                            int width, int32_t *rgb2yuv, void *opq);
 
@@ -130,6 +140,7 @@  void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                           const uint8_t *dither, int offset);
 
 av_cold void ff_sws_init_output_lasx(SwsContext *c);
+
 #endif // #if HAVE_LASX
 
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index abeebbb002..0db581acf8 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -695,6 +695,7 @@  void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
 void ff_updateMMXDitherTables(SwsContext *c, int dstY);
 
 av_cold void ff_sws_init_range_convert(SwsContext *c);
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
 
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index ab8a68e241..47db65ef0e 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1049,8 +1049,12 @@  int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
     c->srcRange   = srcRange;
     c->dstRange   = dstRange;
 
-    if (need_reinit)
+    if (need_reinit) {
         ff_sws_init_range_convert(c);
+#if ARCH_LOONGARCH64
+        ff_sws_init_range_convert_loongarch(c);
+#endif
+    }
 
     c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
     c->srcFormatBpp = av_get_bits_per_pixel(desc_src);