diff mbox series

[FFmpeg-devel,v3,2/3] swscale/aarch64: Add bgra/rgba to yuv

Message ID tencent_7724B883967D4D3DA02FCDC3CF86D52C2707@qq.com
State New
Headers show
Series [FFmpeg-devel,v3,1/3] swscale/aarch64: Add bgr24 to yuv | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Zhao Zhili June 24, 2024, 11:37 a.m. UTC
From: Zhao Zhili <zhilizhao@tencent.com>

Tested on Apple M1 with kperf
				: -O3		: -O3 -fno-vectorize
bgra_to_uv_8_c			: 13.4		: 27.5
bgra_to_uv_8_neon		: 37.4		: 41.7
bgra_to_uv_128_c		: 155.9		: 550.2
bgra_to_uv_128_neon		: 91.7		: 92.7
bgra_to_uv_1080_c		: 1173.2	: 4558.2
bgra_to_uv_1080_neon		: 822.7		: 809.5
bgra_to_uv_1920_c		: 2078.2	: 8115.2
bgra_to_uv_1920_neon		: 1437.7	: 1438.7
bgra_to_uv_half_8_c		: 17.9		: 14.2
bgra_to_uv_half_8_neon		: 37.4		: 10.5
bgra_to_uv_half_128_c		: 103.9		: 326.0
bgra_to_uv_half_128_neon	: 73.9		: 68.7
bgra_to_uv_half_1080_c		: 850.2		: 3732.0
bgra_to_uv_half_1080_neon	: 484.2		: 490.0
bgra_to_uv_half_1920_c		: 1479.2	: 4942.7
bgra_to_uv_half_1920_neon	: 824.2		: 824.7
bgra_to_y_8_c			: 8.2		: 29.5
bgra_to_y_8_neon		: 18.2		: 32.7
bgra_to_y_128_c			: 101.4		: 361.5
bgra_to_y_128_neon		: 74.9		: 73.7
bgra_to_y_1080_c		: 739.4		: 3018.0
bgra_to_y_1080_neon		: 613.4		: 544.2
bgra_to_y_1920_c		: 1298.7	: 5326.0
bgra_to_y_1920_neon		: 918.7		: 934.2
---
 libswscale/aarch64/input.S   | 91 ++++++++++++++++++++++++++++++------
 libswscale/aarch64/swscale.c | 16 +++++++
 2 files changed, 94 insertions(+), 13 deletions(-)

Comments

Martin Storsjö June 24, 2024, 11:55 a.m. UTC | #1
On Mon, 24 Jun 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> Test on Apple M1 with kperf
> 				: -O3		: -O3 -fno-vectorize
> bgra_to_uv_8_c			: 13.4		: 27.5
> bgra_to_uv_8_neon		: 37.4		: 41.7
> bgra_to_uv_128_c		: 155.9		: 550.2
> bgra_to_uv_128_neon		: 91.7		: 92.7
> bgra_to_uv_1080_c		: 1173.2	: 4558.2
> bgra_to_uv_1080_neon		: 822.7		: 809.5
> bgra_to_uv_1920_c		: 2078.2	: 8115.2
> bgra_to_uv_1920_neon		: 1437.7	: 1438.7
> bgra_to_uv_half_8_c		: 17.9		: 14.2
> bgra_to_uv_half_8_neon		: 37.4		: 10.5
> bgra_to_uv_half_128_c		: 103.9		: 326.0
> bgra_to_uv_half_128_neon	: 73.9		: 68.7
> bgra_to_uv_half_1080_c		: 850.2		: 3732.0
> bgra_to_uv_half_1080_neon	: 484.2		: 490.0
> bgra_to_uv_half_1920_c		: 1479.2	: 4942.7
> bgra_to_uv_half_1920_neon	: 824.2		: 824.7
> bgra_to_y_8_c			: 8.2		: 29.5
> bgra_to_y_8_neon		: 18.2		: 32.7
> bgra_to_y_128_c			: 101.4		: 361.5
> bgra_to_y_128_neon		: 74.9		: 73.7
> bgra_to_y_1080_c		: 739.4		: 3018.0
> bgra_to_y_1080_neon		: 613.4		: 544.2
> bgra_to_y_1920_c		: 1298.7	: 5326.0
> bgra_to_y_1920_neon		: 918.7		: 934.2
> ---
> libswscale/aarch64/input.S   | 91 ++++++++++++++++++++++++++++++------
> libswscale/aarch64/swscale.c | 16 +++++++
> 2 files changed, 94 insertions(+), 13 deletions(-)
>
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index 2cfec4cb6a..6d2c6034bb 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -20,8 +20,12 @@
>
> #include "libavutil/aarch64/asm.S"
>
> -.macro rgb_to_yuv_load_rgb src
> +.macro rgb_to_yuv_load_rgb src, element=3
> +    .if \element == 3
>         ld3             { v16.16b, v17.16b, v18.16b }, [\src]
> +    .else
> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
> +    .endif
>         uxtl            v19.8h, v16.8b             // v19: r
>         uxtl            v20.8h, v17.8b             // v20: g
>         uxtl            v21.8h, v18.8b             // v21: b
> @@ -51,7 +55,8 @@ function ff_bgr24ToY_neon, export=1
>         ret
> endfunc
>
> -function ff_rgb24ToY_neon, export=1
> +.macro rgbToY_neon fmt, element
> +function ff_\fmt\()ToY_neon, export=1
>         cmp             w4, #0                  // check width > 0
>         ldp             w10, w11, [x5]          // w10: ry, w11: gy
>         ldr             w12, [x5, #8]           // w12: by
> @@ -67,11 +72,11 @@ function ff_rgb24ToY_neon, export=1
>         dup             v2.8h, w12
>         b.lt            2f
> 1:
> -        rgb_to_yuv_load_rgb x1
> +        rgb_to_yuv_load_rgb x1, \element
>         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>         sub             w4, w4, #16             // width -= 16
> -        add             x1, x1, #48             // src += 48
> +        add             x1, x1, #(16*\element)
>         cmp             w4, #16                 // width >= 16 ?
>         stp             q16, q17, [x0], #32     // store to dst
>         b.ge            1b
> @@ -86,12 +91,25 @@ function ff_rgb24ToY_neon, export=1
>         smaddl          x13, w15, w12, x13      // x13 += by * b
>         asr             w13, w13, #9            // x13 >>= 9
>         sub             w4, w4, #1              // width--
> -        add             x1, x1, #3              // src += 3
> +        add             x1, x1, #\element
>         strh            w13, [x0], #2           // store to dst
>         cbnz            w4, 2b
> 3:
>         ret
> endfunc
> +.endm
> +
> +rgbToY_neon fmt=rgb24, element=3
> +
> +function ff_bgra32ToY_neon, export=1
> +        cmp             w4, #0                  // check width > 0
> +        ldp             w12, w11, [x5]          // w12: ry, w11: gy
> +        ldr             w10, [x5, #8]           // w10: by
> +        b.gt            4f
> +        ret
> +endfunc
> +
> +rgbToY_neon fmt=rgba32, element=4

It is extremely obscure to jump to a local label (4f) that is defined by 
the following macro. I think this would be much more readable if you'd 
include the bgr(a) version in the macro, so the reference to 4f is near to 
the actual label it refers to.

> .macro rgb_set_uv_coeff half
>     .if \half
> @@ -120,7 +138,8 @@ function ff_bgr24ToUV_half_neon, export=1
>         b               4f
> endfunc
>
> -function ff_rgb24ToUV_half_neon, export=1
> +.macro rgbToUV_half_neon fmt, element
> +function ff_\fmt\()ToUV_half_neon, export=1
>         cmp             w5, #0          // check width > 0
>         b.le            3f
>
> @@ -132,7 +151,11 @@ function ff_rgb24ToUV_half_neon, export=1
>         rgb_set_uv_coeff half=1
>         b.lt            2f
> 1:
> +    .if \element == 3
>         ld3             { v16.16b, v17.16b, v18.16b }, [x3]
> +    .else
> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
> +    .endif
>         uaddlp          v19.8h, v16.16b         // v19: r
>         uaddlp          v20.8h, v17.16b         // v20: g
>         uaddlp          v21.8h, v18.16b         // v21: b
> @@ -140,7 +163,7 @@ function ff_rgb24ToUV_half_neon, export=1
>         rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>         rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>         sub             w5, w5, #8              // width -= 8
> -        add             x3, x3, #48             // src += 48
> +        add             x3, x3, #(16*\element)
>         cmp             w5, #8                  // width >= 8 ?
>         str             q16, [x0], #16          // store dst_u
>         str             q17, [x1], #16          // store dst_v
> @@ -148,9 +171,10 @@ function ff_rgb24ToUV_half_neon, export=1
>         cbz             w5, 3f
> 2:
>         ldrb            w2, [x3]                // w2: r1
> -        ldrb            w4, [x3, #3]            // w4: r2
> +        ldrb            w4, [x3, #\element]     // w4: r2
>         add             w2, w2, w4              // w2 = r1 + r2
>
> +    .if \element == 3
>         ldrb            w4, [x3, #1]            // w4: g1
>         ldrb            w7, [x3, #4]            // w7: g2
>         add             w4, w4, w7              // w4 = g1 + g2
> @@ -158,6 +182,15 @@ function ff_rgb24ToUV_half_neon, export=1
>         ldrb            w7, [x3, #2]            // w7: b1
>         ldrb            w8, [x3, #5]            // w8: b2
>         add             w7, w7, w8              // w7 = b1 + b2
> +    .else
> +        ldrb            w4, [x3, #1]            // w4: g1
> +        ldrb            w7, [x3, #5]            // w7: g2
> +        add             w4, w4, w7              // w4 = g1 + g2
> +
> +        ldrb            w7, [x3, #2]            // w7: b1
> +        ldrb            w8, [x3, #6]            // w8: b2
> +        add             w7, w7, w8              // w7 = b1 + b2
> +    .endif
>
>         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>         smaddl          x8, w4, w11, x8         // dst_u += gu * g
> @@ -170,12 +203,28 @@ function ff_rgb24ToUV_half_neon, export=1
>         smaddl          x8, w7, w15, x8         // dst_v += bv * b
>         asr             x8, x8, #10             // dst_v >>= 10
>         sub             w5, w5, #1
> -        add             x3, x3, #6              // src += 6
> +        ldrb            w4, [x3, #1]            // w4: g1
> +        add             x3, x3, #(2*\element)

Is the new ldrb a typo/copypaste mistake here?

// Martin
Zhao Zhili June 24, 2024, 1:04 p.m. UTC | #2
> On Jun 24, 2024, at 19:55, Martin Storsjö <martin@martin.st> wrote:
> 
> On Mon, 24 Jun 2024, Zhao Zhili wrote:
> 
>> From: Zhao Zhili <zhilizhao@tencent.com>
>> 
>> Test on Apple M1 with kperf
>> 				: -O3		: -O3 -fno-vectorize
>> bgra_to_uv_8_c			: 13.4		: 27.5
>> bgra_to_uv_8_neon		: 37.4		: 41.7
>> bgra_to_uv_128_c		: 155.9		: 550.2
>> bgra_to_uv_128_neon		: 91.7		: 92.7
>> bgra_to_uv_1080_c		: 1173.2	: 4558.2
>> bgra_to_uv_1080_neon		: 822.7		: 809.5
>> bgra_to_uv_1920_c		: 2078.2	: 8115.2
>> bgra_to_uv_1920_neon		: 1437.7	: 1438.7
>> bgra_to_uv_half_8_c		: 17.9		: 14.2
>> bgra_to_uv_half_8_neon		: 37.4		: 10.5
>> bgra_to_uv_half_128_c		: 103.9		: 326.0
>> bgra_to_uv_half_128_neon	: 73.9		: 68.7
>> bgra_to_uv_half_1080_c		: 850.2		: 3732.0
>> bgra_to_uv_half_1080_neon	: 484.2		: 490.0
>> bgra_to_uv_half_1920_c		: 1479.2	: 4942.7
>> bgra_to_uv_half_1920_neon	: 824.2		: 824.7
>> bgra_to_y_8_c			: 8.2		: 29.5
>> bgra_to_y_8_neon		: 18.2		: 32.7
>> bgra_to_y_128_c			: 101.4		: 361.5
>> bgra_to_y_128_neon		: 74.9		: 73.7
>> bgra_to_y_1080_c		: 739.4		: 3018.0
>> bgra_to_y_1080_neon		: 613.4		: 544.2
>> bgra_to_y_1920_c		: 1298.7	: 5326.0
>> bgra_to_y_1920_neon		: 918.7		: 934.2
>> ---
>> libswscale/aarch64/input.S   | 91 ++++++++++++++++++++++++++++++------
>> libswscale/aarch64/swscale.c | 16 +++++++
>> 2 files changed, 94 insertions(+), 13 deletions(-)
>> 
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> index 2cfec4cb6a..6d2c6034bb 100644
>> --- a/libswscale/aarch64/input.S
>> +++ b/libswscale/aarch64/input.S
>> @@ -20,8 +20,12 @@
>> 
>> #include "libavutil/aarch64/asm.S"
>> 
>> -.macro rgb_to_yuv_load_rgb src
>> +.macro rgb_to_yuv_load_rgb src, element=3
>> +    .if \element == 3
>>        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>> +    .else
>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>> +    .endif
>>        uxtl            v19.8h, v16.8b             // v19: r
>>        uxtl            v20.8h, v17.8b             // v20: g
>>        uxtl            v21.8h, v18.8b             // v21: b
>> @@ -51,7 +55,8 @@ function ff_bgr24ToY_neon, export=1
>>        ret
>> endfunc
>> 
>> -function ff_rgb24ToY_neon, export=1
>> +.macro rgbToY_neon fmt, element
>> +function ff_\fmt\()ToY_neon, export=1
>>        cmp             w4, #0                  // check width > 0
>>        ldp             w10, w11, [x5]          // w10: ry, w11: gy
>>        ldr             w12, [x5, #8]           // w12: by
>> @@ -67,11 +72,11 @@ function ff_rgb24ToY_neon, export=1
>>        dup             v2.8h, w12
>>        b.lt            2f
>> 1:
>> -        rgb_to_yuv_load_rgb x1
>> +        rgb_to_yuv_load_rgb x1, \element
>>        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>        sub             w4, w4, #16             // width -= 16
>> -        add             x1, x1, #48             // src += 48
>> +        add             x1, x1, #(16*\element)
>>        cmp             w4, #16                 // width >= 16 ?
>>        stp             q16, q17, [x0], #32     // store to dst
>>        b.ge            1b
>> @@ -86,12 +91,25 @@ function ff_rgb24ToY_neon, export=1
>>        smaddl          x13, w15, w12, x13      // x13 += by * b
>>        asr             w13, w13, #9            // x13 >>= 9
>>        sub             w4, w4, #1              // width--
>> -        add             x1, x1, #3              // src += 3
>> +        add             x1, x1, #\element
>>        strh            w13, [x0], #2           // store to dst
>>        cbnz            w4, 2b
>> 3:
>>        ret
>> endfunc
>> +.endm
>> +
>> +rgbToY_neon fmt=rgb24, element=3
>> +
>> +function ff_bgra32ToY_neon, export=1
>> +        cmp             w4, #0                  // check width > 0
>> +        ldp             w12, w11, [x5]          // w12: ry, w11: gy
>> +        ldr             w10, [x5, #8]           // w10: by
>> +        b.gt            4f
>> +        ret
>> +endfunc
>> +
>> +rgbToY_neon fmt=rgba32, element=4
> 
> It is extremely obscure to jump to a local label (4f) that is defined by the following macro. I think this would be much more readable if you'd include the bgr(a) version in the macro, so the reference to 4f is near to the actual label it refers to.

Good idea, it saved a lot of typing. Fixed in v4.

> 
>> .macro rgb_set_uv_coeff half
>>    .if \half
>> @@ -120,7 +138,8 @@ function ff_bgr24ToUV_half_neon, export=1
>>        b               4f
>> endfunc
>> 
>> -function ff_rgb24ToUV_half_neon, export=1
>> +.macro rgbToUV_half_neon fmt, element
>> +function ff_\fmt\()ToUV_half_neon, export=1
>>        cmp             w5, #0          // check width > 0
>>        b.le            3f
>> 
>> @@ -132,7 +151,11 @@ function ff_rgb24ToUV_half_neon, export=1
>>        rgb_set_uv_coeff half=1
>>        b.lt            2f
>> 1:
>> +    .if \element == 3
>>        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>> +    .else
>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
>> +    .endif
>>        uaddlp          v19.8h, v16.16b         // v19: r
>>        uaddlp          v20.8h, v17.16b         // v20: g
>>        uaddlp          v21.8h, v18.16b         // v21: b
>> @@ -140,7 +163,7 @@ function ff_rgb24ToUV_half_neon, export=1
>>        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>>        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>>        sub             w5, w5, #8              // width -= 8
>> -        add             x3, x3, #48             // src += 48
>> +        add             x3, x3, #(16*\element)
>>        cmp             w5, #8                  // width >= 8 ?
>>        str             q16, [x0], #16          // store dst_u
>>        str             q17, [x1], #16          // store dst_v
>> @@ -148,9 +171,10 @@ function ff_rgb24ToUV_half_neon, export=1
>>        cbz             w5, 3f
>> 2:
>>        ldrb            w2, [x3]                // w2: r1
>> -        ldrb            w4, [x3, #3]            // w4: r2
>> +        ldrb            w4, [x3, #\element]     // w4: r2
>>        add             w2, w2, w4              // w2 = r1 + r2
>> 
>> +    .if \element == 3
>>        ldrb            w4, [x3, #1]            // w4: g1
>>        ldrb            w7, [x3, #4]            // w7: g2
>>        add             w4, w4, w7              // w4 = g1 + g2
>> @@ -158,6 +182,15 @@ function ff_rgb24ToUV_half_neon, export=1
>>        ldrb            w7, [x3, #2]            // w7: b1
>>        ldrb            w8, [x3, #5]            // w8: b2
>>        add             w7, w7, w8              // w7 = b1 + b2
>> +    .else
>> +        ldrb            w4, [x3, #1]            // w4: g1
>> +        ldrb            w7, [x3, #5]            // w7: g2
>> +        add             w4, w4, w7              // w4 = g1 + g2
>> +
>> +        ldrb            w7, [x3, #2]            // w7: b1
>> +        ldrb            w8, [x3, #6]            // w8: b2
>> +        add             w7, w7, w8              // w7 = b1 + b2
>> +    .endif
>> 
>>        smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>>        smaddl          x8, w4, w11, x8         // dst_u += gu * g
>> @@ -170,12 +203,28 @@ function ff_rgb24ToUV_half_neon, export=1
>>        smaddl          x8, w7, w15, x8         // dst_v += bv * b
>>        asr             x8, x8, #10             // dst_v >>= 10
>>        sub             w5, w5, #1
>> -        add             x3, x3, #6              // src += 6
>> +        ldrb            w4, [x3, #1]            // w4: g1
>> +        add             x3, x3, #(2*\element)
> 
> Is the new ldrb a typo/copypaste mistake here?

Yes, it’s a copypaste mistake. Fixed in v4.

> 
> // Martin
> 
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff mbox series

Patch

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 2cfec4cb6a..6d2c6034bb 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -20,8 +20,12 @@ 
 
 #include "libavutil/aarch64/asm.S"
 
-.macro rgb_to_yuv_load_rgb src
+.macro rgb_to_yuv_load_rgb src, element=3
+    .if \element == 3
         ld3             { v16.16b, v17.16b, v18.16b }, [\src]
+    .else
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
+    .endif
         uxtl            v19.8h, v16.8b             // v19: r
         uxtl            v20.8h, v17.8b             // v20: g
         uxtl            v21.8h, v18.8b             // v21: b
@@ -51,7 +55,8 @@  function ff_bgr24ToY_neon, export=1
         ret
 endfunc
 
-function ff_rgb24ToY_neon, export=1
+.macro rgbToY_neon fmt, element
+function ff_\fmt\()ToY_neon, export=1
         cmp             w4, #0                  // check width > 0
         ldp             w10, w11, [x5]          // w10: ry, w11: gy
         ldr             w12, [x5, #8]           // w12: by
@@ -67,11 +72,11 @@  function ff_rgb24ToY_neon, export=1
         dup             v2.8h, w12
         b.lt            2f
 1:
-        rgb_to_yuv_load_rgb x1
+        rgb_to_yuv_load_rgb x1, \element
         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
         sub             w4, w4, #16             // width -= 16
-        add             x1, x1, #48             // src += 48
+        add             x1, x1, #(16*\element)
         cmp             w4, #16                 // width >= 16 ?
         stp             q16, q17, [x0], #32     // store to dst
         b.ge            1b
@@ -86,12 +91,25 @@  function ff_rgb24ToY_neon, export=1
         smaddl          x13, w15, w12, x13      // x13 += by * b
         asr             w13, w13, #9            // x13 >>= 9
         sub             w4, w4, #1              // width--
-        add             x1, x1, #3              // src += 3
+        add             x1, x1, #\element
         strh            w13, [x0], #2           // store to dst
         cbnz            w4, 2b
 3:
         ret
 endfunc
+.endm
+
+rgbToY_neon fmt=rgb24, element=3
+
+function ff_bgra32ToY_neon, export=1
+        cmp             w4, #0                  // check width > 0; flags are consumed by b.gt below (the loads do not alter them)
+        ldp             w12, w11, [x5]          // w12: ry, w11: gy (w10/w12 roles swapped relative to the rgb body, which keeps ry in w10)
+        ldr             w10, [x5, #8]           // w10: by
+        b.gt            4f                      // width > 0: continue at the next 4: label, presumably inside the rgba32 body expanded below (not visible in this hunk, confirm)
+        ret                                     // width <= 0: nothing to convert
+endfunc
+
+rgbToY_neon fmt=rgba32, element=4
 
 .macro rgb_set_uv_coeff half
     .if \half
@@ -120,7 +138,8 @@  function ff_bgr24ToUV_half_neon, export=1
         b               4f
 endfunc
 
-function ff_rgb24ToUV_half_neon, export=1
+.macro rgbToUV_half_neon fmt, element
+function ff_\fmt\()ToUV_half_neon, export=1
         cmp             w5, #0          // check width > 0
         b.le            3f
 
@@ -132,7 +151,11 @@  function ff_rgb24ToUV_half_neon, export=1
         rgb_set_uv_coeff half=1
         b.lt            2f
 1:
+    .if \element == 3
         ld3             { v16.16b, v17.16b, v18.16b }, [x3]
+    .else
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
+    .endif
         uaddlp          v19.8h, v16.16b         // v19: r
         uaddlp          v20.8h, v17.16b         // v20: g
         uaddlp          v21.8h, v18.16b         // v21: b
@@ -140,7 +163,7 @@  function ff_rgb24ToUV_half_neon, export=1
         rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
         rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
         sub             w5, w5, #8              // width -= 8
-        add             x3, x3, #48             // src += 48
+        add             x3, x3, #(16*\element)
         cmp             w5, #8                  // width >= 8 ?
         str             q16, [x0], #16          // store dst_u
         str             q17, [x1], #16          // store dst_v
@@ -148,9 +171,10 @@  function ff_rgb24ToUV_half_neon, export=1
         cbz             w5, 3f
 2:
         ldrb            w2, [x3]                // w2: r1
-        ldrb            w4, [x3, #3]            // w4: r2
+        ldrb            w4, [x3, #\element]     // w4: r2
         add             w2, w2, w4              // w2 = r1 + r2
 
+    .if \element == 3
         ldrb            w4, [x3, #1]            // w4: g1
         ldrb            w7, [x3, #4]            // w7: g2
         add             w4, w4, w7              // w4 = g1 + g2
@@ -158,6 +182,15 @@  function ff_rgb24ToUV_half_neon, export=1
         ldrb            w7, [x3, #2]            // w7: b1
         ldrb            w8, [x3, #5]            // w8: b2
         add             w7, w7, w8              // w7 = b1 + b2
+    .else
+        ldrb            w4, [x3, #1]            // w4: g1
+        ldrb            w7, [x3, #5]            // w7: g2
+        add             w4, w4, w7              // w4 = g1 + g2
+
+        ldrb            w7, [x3, #2]            // w7: b1
+        ldrb            w8, [x3, #6]            // w8: b2
+        add             w7, w7, w8              // w7 = b1 + b2
+    .endif
 
         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
         smaddl          x8, w4, w11, x8         // dst_u += gu * g
@@ -170,12 +203,27 @@  function ff_rgb24ToUV_half_neon, export=1
         smaddl          x8, w7, w15, x8         // dst_v += bv * b
         asr             x8, x8, #10             // dst_v >>= 10
         sub             w5, w5, #1
-        add             x3, x3, #6              // src += 6
+        add             x3, x3, #(2*\element)   // src += 2 pixels (element bytes each)
         strh            w8, [x1], #2            // store dst_v
         cbnz            w5, 2b
 3:
         ret
 endfunc
+.endm
+
+rgbToUV_half_neon fmt=rgb24, element=3
+
+function ff_bgra32ToUV_half_neon, export=1
+        cmp             w5, #0          // check width > 0
+        b.le            3f              // width <= 0: go to the 3: return label of the rgba32 body expanded below
+
+        ldp             w12, w11, [x6, #12]     // 32-bit coefficients at byte offsets 12 and 16; presumably ru, gu (confirm)
+        ldp             w10, w15, [x6, #20]     // offsets 20 and 24; presumably bu, rv (confirm)
+        ldp             w14, w13, [x6, #28]     // offsets 28 and 32; presumably gv, bv (confirm)
+        b               4f              // continue in the shared body; the 4: label is assumed to sit in the elided part of the macro (confirm)
+endfunc
+
+rgbToUV_half_neon fmt=rgba32, element=4
 
 function ff_bgr24ToUV_neon, export=1
         cmp             w5, #0                  // check width > 0
@@ -187,7 +236,8 @@  function ff_bgr24ToUV_neon, export=1
         b               4f
 endfunc
 
-function ff_rgb24ToUV_neon, export=1
+.macro rgbToUV_neon fmt, element
+function ff_\fmt\()ToUV_neon, export=1
         cmp             w5, #0                  // check width > 0
         b.le            3f
 
@@ -199,13 +249,13 @@  function ff_rgb24ToUV_neon, export=1
         rgb_set_uv_coeff half=0
         b.lt            2f
 1:
-        rgb_to_yuv_load_rgb x3
+        rgb_to_yuv_load_rgb x3, \element
         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
         rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
         rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
         sub             w5, w5, #16
-        add             x3, x3, #48             // src += 48
+        add             x3, x3, #(16*\element)
         cmp             w5, #16
         stp             q16, q17, [x0], #32     // store to dst_u
         stp             q18, q19, [x1], #32     // store to dst_v
@@ -227,9 +277,24 @@  function ff_rgb24ToUV_neon, export=1
         smaddl          x8, w4, w15, x8         // x8 += bv * b
         asr             w8, w8, #9              // x8 >>= 9
         sub             w5, w5, #1              // width--
-        add             x3, x3, #3              // src += 3
+        add             x3, x3, #\element
         strh            w8, [x1], #2            // store to dst_v
         cbnz            w5, 2b
 3:
         ret
 endfunc
+.endm
+
+rgbToUV_neon fmt=rgb24, element=3
+
+function ff_bgra32ToUV_neon, export=1
+        cmp             w5, #0                  // check width > 0
+        b.le            3f                      // width <= 0: go to the 3: return label of the rgba32 body expanded below
+
+        ldp             w12, w11, [x6, #12]     // 32-bit coefficients at byte offsets 12 and 16; presumably ru, gu (confirm)
+        ldp             w10, w15, [x6, #20]     // offsets 20 and 24; presumably bu, rv (confirm)
+        ldp             w14, w13, [x6, #28]     // offsets 28 and 32; presumably gv, bv (confirm)
+        b               4f                      // continue in the shared body; the 4: label is assumed to sit in the elided part of the macro (confirm)
+endfunc
+
+rgbToUV_neon fmt=rgba32, element=4
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index c6594944c3..92af662014 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -212,7 +212,9 @@  void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
                               uint32_t *coeffs, void *)
 
 NEON_INPUT(bgr24);
+NEON_INPUT(bgra32);
 NEON_INPUT(rgb24);
+NEON_INPUT(rgba32);
 
 void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
 void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
@@ -253,6 +255,13 @@  av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
             else
                 c->chrToYV12 = ff_bgr24ToUV_neon;
             break;
+        case AV_PIX_FMT_BGRA:
+            c->lumToYV12 = ff_bgra32ToY_neon;
+            if (c->chrSrcHSubSample)
+                c->chrToYV12 = ff_bgra32ToUV_half_neon;
+            else
+                c->chrToYV12 = ff_bgra32ToUV_neon;
+            break;
         case AV_PIX_FMT_RGB24:
             c->lumToYV12 = ff_rgb24ToY_neon;
             if (c->chrSrcHSubSample)
@@ -260,6 +269,13 @@  av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
             else
                 c->chrToYV12 = ff_rgb24ToUV_neon;
             break;
+        case AV_PIX_FMT_RGBA:
+            c->lumToYV12 = ff_rgba32ToY_neon;
+            if (c->chrSrcHSubSample)
+                c->chrToYV12 = ff_rgba32ToUV_half_neon;
+            else
+                c->chrToYV12 = ff_rgba32ToUV_neon;
+            break;
         default:
             break;
         }