[FFmpeg-devel,2/2] swscale/aarch64: Add bgra/rgba to yuv

Message ID tencent_25D55C273C38A096624D28206A4D8B4FB107@qq.com
State New
Series [FFmpeg-devel,1/2] swscale/aarch64: Add bgr24 to yuv

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Zhao Zhili June 15, 2024, 9:57 a.m. UTC
From: Zhao Zhili <zhilizhao@tencent.com>

Test on Apple M1 with kperf

bgra_to_uv_8_c: 13.4
bgra_to_uv_8_neon: 37.4
bgra_to_uv_128_c: 155.9
bgra_to_uv_128_neon: 91.7
bgra_to_uv_1080_c: 1173.2
bgra_to_uv_1080_neon: 822.7
bgra_to_uv_1920_c: 2078.2
bgra_to_uv_1920_neon: 1437.7
bgra_to_uv_half_8_c: 17.9
bgra_to_uv_half_8_neon: 37.4
bgra_to_uv_half_128_c: 103.9
bgra_to_uv_half_128_neon: 73.9
bgra_to_uv_half_1080_c: 850.2
bgra_to_uv_half_1080_neon: 484.2
bgra_to_uv_half_1920_c: 1479.2
bgra_to_uv_half_1920_neon: 824.2
bgra_to_y_8_c: 8.2
bgra_to_y_8_neon: 18.2
bgra_to_y_128_c: 101.4
bgra_to_y_128_neon: 74.9
bgra_to_y_1080_c: 739.4
bgra_to_y_1080_neon: 613.4
bgra_to_y_1920_c: 1298.7
bgra_to_y_1920_neon: 918.7
---
 libswscale/aarch64/input.S   | 81 +++++++++++++++++++++++++++++++-----
 libswscale/aarch64/swscale.c | 16 +++++++
 2 files changed, 86 insertions(+), 11 deletions(-)

Comments

Martin Storsjö June 18, 2024, 8:32 p.m. UTC | #1
On Sat, 15 Jun 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> Test on Apple M1 with kperf
>
> bgra_to_uv_8_c: 13.4
> bgra_to_uv_8_neon: 37.4
> bgra_to_uv_128_c: 155.9
> bgra_to_uv_128_neon: 91.7
> bgra_to_uv_1080_c: 1173.2
> bgra_to_uv_1080_neon: 822.7
> bgra_to_uv_1920_c: 2078.2
> bgra_to_uv_1920_neon: 1437.7
> bgra_to_uv_half_8_c: 17.9
> bgra_to_uv_half_8_neon: 37.4
> bgra_to_uv_half_128_c: 103.9
> bgra_to_uv_half_128_neon: 73.9
> bgra_to_uv_half_1080_c: 850.2
> bgra_to_uv_half_1080_neon: 484.2
> bgra_to_uv_half_1920_c: 1479.2
> bgra_to_uv_half_1920_neon: 824.2
> bgra_to_y_8_c: 8.2
> bgra_to_y_8_neon: 18.2
> bgra_to_y_128_c: 101.4
> bgra_to_y_128_neon: 74.9
> bgra_to_y_1080_c: 739.4
> bgra_to_y_1080_neon: 613.4
> bgra_to_y_1920_c: 1298.7
> bgra_to_y_1920_neon: 918.7
> ---
> libswscale/aarch64/input.S   | 81 +++++++++++++++++++++++++++++++-----
> libswscale/aarch64/swscale.c | 16 +++++++
> 2 files changed, 86 insertions(+), 11 deletions(-)
>
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index 2b956fe5c2..37f1158504 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -20,8 +20,12 @@
>
> #include "libavutil/aarch64/asm.S"
>
> -.macro rgb_to_yuv_load_rgb src
> +.macro rgb_to_yuv_load_rgb src, element=3
> +    .if \element == 3
>         ld3             { v16.16b, v17.16b, v18.16b }, [\src]
> +    .else
> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
> +    .endif
>         uxtl            v19.8h, v16.8b             // v19: r
>         uxtl            v20.8h, v17.8b             // v20: g
>         uxtl            v21.8h, v18.8b             // v21: b
> @@ -43,7 +47,7 @@
>         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
> .endm
>
> -.macro rgbToY bgr
> +.macro rgbToY bgr, element=3
>         cmp             w4, #0                  // check width > 0
>     .if \bgr
>         ldr             w12, [x5]               // w12: ry
> @@ -67,11 +71,15 @@
>         dup             v2.8h, w12
>         b.lt            2f
> 1:
> -        rgb_to_yuv_load_rgb x1
> +        rgb_to_yuv_load_rgb x1, \element
>         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>         sub             w4, w4, #16             // width -= 16
> +    .if \element == 3
>         add             x1, x1, #48             // src += 48
> +    .else
> +        add             x1, x1, #64
> +    .endif

I guess this also could be just #(16*\element)
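
For illustration (untested), the whole .if/.else block above would then collapse
to a single increment:

        add             x1, x1, #(16*\element)  // src += 16 pixels * \element bytes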

>         cmp             w4, #16                 // width >= 16 ?
>         stp             q16, q17, [x0], #32     // store to dst
>         b.ge            1b
> @@ -86,7 +94,7 @@
>         smaddl          x13, w15, w12, x13      // x13 += by * b
>         asr             w13, w13, #9            // x13 >>= 9
>         sub             w4, w4, #1              // width--
> -        add             x1, x1, #3              // src += 3
> +        add             x1, x1, \element

Keep the # for the immediate constant here, i.e. #\element. Perhaps it
doesn't matter for most assemblers we use, but it's good to stay
consistent.
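
That is:

        add             x1, x1, #\element       // src += \element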

>         strh            w13, [x0], #2           // store to dst
>         cbnz            w4, 2b
> 3:
> @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1
>         rgbToY          bgr=1
> endfunc
>
> +function ff_rgba32ToY_neon, export=1
> +        rgbToY          bgr=0, element=4
> +endfunc
> +
> +function ff_bgra32ToY_neon, export=1
> +        rgbToY          bgr=1, element=4
> +endfunc
> +
> .macro rgb_load_uv_coeff half, bgr
>     .if \bgr
>         ldr             w12, [x6, #12]
> @@ -130,7 +146,7 @@ endfunc
>         dup             v6.4s, w9
> .endm
>
> -.macro rgbToUV_half bgr
> +.macro rgbToUV_half bgr, element=3
>         cmp             w5, #0          // check width > 0
>         b.le            3f
>
> @@ -139,7 +155,11 @@ endfunc
>         b.lt            2f
>         // The following comments assume RGB order. The logic for RGB and BGR is the same.
> 1:
> +    .if \element == 3
>         ld3             { v16.16b, v17.16b, v18.16b }, [x3]
> +    .else
> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
> +    .endif
>         uaddlp          v19.8h, v16.16b         // v19: r
>         uaddlp          v20.8h, v17.16b         // v20: g
>         uaddlp          v21.8h, v18.16b         // v21: b
> @@ -147,7 +167,11 @@ endfunc
>         rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>         rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>         sub             w5, w5, #8              // width -= 8
> -        add             x3, x3, #48             // src += 48
> +    .if \element == 3
> +        add             x3, x3, #48
> +    .else
> +        add             x3, x3, #64
> +    .endif
>         cmp             w5, #8                  // width >= 8 ?
>         str             q16, [x0], #16          // store dst_u
>         str             q17, [x1], #16          // store dst_v
> @@ -155,9 +179,10 @@ endfunc
>         cbz             w5, 3f
> 2:
>         ldrb            w2, [x3]                // w2: r1
> -        ldrb            w4, [x3, #3]            // w4: r2
> +        ldrb            w4, [x3, \element]      // w4: r2

Ditto about keeping the #
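
That is:

        ldrb            w4, [x3, #\element]     // w4: r2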

>         add             w2, w2, w4              // w2 = r1 + r2
>
> +    .if \element == 3
>         ldrb            w4, [x3, #1]            // w4: g1
>         ldrb            w7, [x3, #4]            // w7: g2
>         add             w4, w4, w7              // w4 = g1 + g2
> @@ -165,6 +190,15 @@ endfunc
>         ldrb            w7, [x3, #2]            // w7: b1
>         ldrb            w8, [x3, #5]            // w8: b2
>         add             w7, w7, w8              // w7 = b1 + b2
> +    .else
> +        ldrb            w4, [x3, #1]            // w4: g1
> +        ldrb            w7, [x3, #5]            // w7: g2
> +        add             w4, w4, w7              // w4 = g1 + g2
> +
> +        ldrb            w7, [x3, #2]            // w7: b1
> +        ldrb            w8, [x3, #6]            // w8: b2
> +        add             w7, w7, w8              // w7 = b1 + b2
> +    .endif
>
>         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>         smaddl          x8, w4, w11, x8         // dst_u += gu * g
> @@ -177,7 +211,12 @@ endfunc
>         smaddl          x8, w7, w15, x8         // dst_v += bv * b
>         asr             x8, x8, #10             // dst_v >>= 10
>         sub             w5, w5, #1
> -        add             x3, x3, #6              // src += 6
> +        ldrb            w4, [x3, #1]            // w4: g1
> +    .if \element == 3
> +        add             x3, x3, #6
> +    .else
> +        add             x3, x3, #8
> +    .endif

And this can be #(2*\element)
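
i.e. (untested):

        add             x3, x3, #(2*\element)   // src += 2 pixels * \element bytes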

// Martin
Rémi Denis-Courmont June 19, 2024, 7:07 a.m. UTC | #2
On 15 June 2024 at 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>From: Zhao Zhili <zhilizhao@tencent.com>
>
>Test on Apple M1 with kperf
>
>bgra_to_uv_8_c: 13.4
>bgra_to_uv_8_neon: 37.4
>bgra_to_uv_128_c: 155.9
>bgra_to_uv_128_neon: 91.7
>bgra_to_uv_1080_c: 1173.2
>bgra_to_uv_1080_neon: 822.7
>bgra_to_uv_1920_c: 2078.2
>bgra_to_uv_1920_neon: 1437.7
>bgra_to_uv_half_8_c: 17.9
>bgra_to_uv_half_8_neon: 37.4
>bgra_to_uv_half_128_c: 103.9
>bgra_to_uv_half_128_neon: 73.9
>bgra_to_uv_half_1080_c: 850.2
>bgra_to_uv_half_1080_neon: 484.2
>bgra_to_uv_half_1920_c: 1479.2
>bgra_to_uv_half_1920_neon: 824.2
>bgra_to_y_8_c: 8.2
>bgra_to_y_8_neon: 18.2
>bgra_to_y_128_c: 101.4
>bgra_to_y_128_neon: 74.9
>bgra_to_y_1080_c: 739.4
>bgra_to_y_1080_neon: 613.4
>bgra_to_y_1920_c: 1298.7
>bgra_to_y_1920_neon: 918.7
>---
> libswscale/aarch64/input.S   | 81 +++++++++++++++++++++++++++++++-----
> libswscale/aarch64/swscale.c | 16 +++++++
> 2 files changed, 86 insertions(+), 11 deletions(-)
>
>diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>index 2b956fe5c2..37f1158504 100644
>--- a/libswscale/aarch64/input.S
>+++ b/libswscale/aarch64/input.S
>@@ -20,8 +20,12 @@
> 
> #include "libavutil/aarch64/asm.S"
> 
>-.macro rgb_to_yuv_load_rgb src
>+.macro rgb_to_yuv_load_rgb src, element=3
>+    .if \element == 3
>         ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>+    .else
>+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>+    .endif
>         uxtl            v19.8h, v16.8b             // v19: r
>         uxtl            v20.8h, v17.8b             // v20: g
>         uxtl            v21.8h, v18.8b             // v21: b
>@@ -43,7 +47,7 @@
>         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
> .endm
> 
>-.macro rgbToY bgr
>+.macro rgbToY bgr, element=3

AFAICT, you don't need a macro parameter for the component order. Just swap the red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue, but that's mostly negligible compared to the loop.
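
Roughly like this (untested sketch — the coefficient layout {ry, gy, by} at x5,
the register assignments and the shared numeric label are only illustrative;
the real code may need a named local label to branch between the two entry
points):

function ff_rgb24ToY_neon, export=1
        ldr             w10, [x5]               // coeff for 1st channel (ry)
        ldr             w12, [x5, #8]           // coeff for 3rd channel (by)
        b               0f
endfunc

function ff_bgr24ToY_neon, export=1
        ldr             w10, [x5, #8]           // coeff for 1st channel (by, swapped)
        ldr             w12, [x5]               // coeff for 3rd channel (ry, swapped)
0:
        ldr             w11, [x5, #4]           // coeff for 2nd channel (gy, same either way)
        // ... one shared loop body, no \bgr macro parameter needed ...
endfunc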

>         cmp             w4, #0                  // check width > 0
>     .if \bgr
>         ldr             w12, [x5]               // w12: ry
>@@ -67,11 +71,15 @@
>         dup             v2.8h, w12
>         b.lt            2f
> 1:
>-        rgb_to_yuv_load_rgb x1
>+        rgb_to_yuv_load_rgb x1, \element
>         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>         sub             w4, w4, #16             // width -= 16
>+    .if \element == 3
>         add             x1, x1, #48             // src += 48
>+    .else
>+        add             x1, x1, #64
>+    .endif
>         cmp             w4, #16                 // width >= 16 ?
>         stp             q16, q17, [x0], #32     // store to dst
>         b.ge            1b
>@@ -86,7 +94,7 @@
>         smaddl          x13, w15, w12, x13      // x13 += by * b
>         asr             w13, w13, #9            // x13 >>= 9
>         sub             w4, w4, #1              // width--
>-        add             x1, x1, #3              // src += 3
>+        add             x1, x1, \element
>         strh            w13, [x0], #2           // store to dst
>         cbnz            w4, 2b
> 3:
>@@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1
>         rgbToY          bgr=1
> endfunc
> 
>+function ff_rgba32ToY_neon, export=1
>+        rgbToY          bgr=0, element=4
>+endfunc
>+
>+function ff_bgra32ToY_neon, export=1
>+        rgbToY          bgr=1, element=4
>+endfunc
>+
> .macro rgb_load_uv_coeff half, bgr
>     .if \bgr
>         ldr             w12, [x6, #12]
>@@ -130,7 +146,7 @@ endfunc
>         dup             v6.4s, w9
> .endm
> 
>-.macro rgbToUV_half bgr
>+.macro rgbToUV_half bgr, element=3
>         cmp             w5, #0          // check width > 0
>         b.le            3f
> 
>@@ -139,7 +155,11 @@ endfunc
>         b.lt            2f
>         // The following comments assume RGB order. The logic for RGB and BGR is the same.
> 1:
>+    .if \element == 3
>         ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>+    .else
>+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
>+    .endif
>         uaddlp          v19.8h, v16.16b         // v19: r
>         uaddlp          v20.8h, v17.16b         // v20: g
>         uaddlp          v21.8h, v18.16b         // v21: b
>@@ -147,7 +167,11 @@ endfunc
>         rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>         rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>         sub             w5, w5, #8              // width -= 8
>-        add             x3, x3, #48             // src += 48
>+    .if \element == 3
>+        add             x3, x3, #48
>+    .else
>+        add             x3, x3, #64
>+    .endif
>         cmp             w5, #8                  // width >= 8 ?
>         str             q16, [x0], #16          // store dst_u
>         str             q17, [x1], #16          // store dst_v
>@@ -155,9 +179,10 @@ endfunc
>         cbz             w5, 3f
> 2:
>         ldrb            w2, [x3]                // w2: r1
>-        ldrb            w4, [x3, #3]            // w4: r2
>+        ldrb            w4, [x3, \element]      // w4: r2
>         add             w2, w2, w4              // w2 = r1 + r2
> 
>+    .if \element == 3
>         ldrb            w4, [x3, #1]            // w4: g1
>         ldrb            w7, [x3, #4]            // w7: g2
>         add             w4, w4, w7              // w4 = g1 + g2
>@@ -165,6 +190,15 @@ endfunc
>         ldrb            w7, [x3, #2]            // w7: b1
>         ldrb            w8, [x3, #5]            // w8: b2
>         add             w7, w7, w8              // w7 = b1 + b2
>+    .else
>+        ldrb            w4, [x3, #1]            // w4: g1
>+        ldrb            w7, [x3, #5]            // w7: g2
>+        add             w4, w4, w7              // w4 = g1 + g2
>+
>+        ldrb            w7, [x3, #2]            // w7: b1
>+        ldrb            w8, [x3, #6]            // w8: b2
>+        add             w7, w7, w8              // w7 = b1 + b2
>+    .endif
> 
>         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>         smaddl          x8, w4, w11, x8         // dst_u += gu * g
>@@ -177,7 +211,12 @@ endfunc
>         smaddl          x8, w7, w15, x8         // dst_v += bv * b
>         asr             x8, x8, #10             // dst_v >>= 10
>         sub             w5, w5, #1
>-        add             x3, x3, #6              // src += 6
>+        ldrb            w4, [x3, #1]            // w4: g1
>+    .if \element == 3
>+        add             x3, x3, #6
>+    .else
>+        add             x3, x3, #8
>+    .endif
>         strh            w8, [x1], #2            // store dst_v
>         cbnz            w5, 2b
> 3:
>@@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1
>         rgbToUV_half    bgr=1
> endfunc
> 
>-.macro rgbToUV bgr
>+function ff_rgba32ToUV_half_neon, export=1
>+        rgbToUV_half    bgr=0, element=4
>+endfunc
>+
>+function ff_bgra32ToUV_half_neon, export=1
>+        rgbToUV_half    bgr=1, element=4
>+endfunc
>+
>+.macro rgbToUV bgr, element=3
>         cmp             w5, #0                  // check width > 0
>         b.le            3f
> 
>@@ -201,13 +248,17 @@ endfunc
>         b.lt            2f
>         // The following comments assume RGB order. The logic for RGB and BGR is the same.
> 1:
>-        rgb_to_yuv_load_rgb x3
>+        rgb_to_yuv_load_rgb x3, \element
>         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>         rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
>         rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
>         sub             w5, w5, #16
>+    .if \element == 3
>         add             x3, x3, #48             // src += 48
>+    .else
>+        add             x3, x3, #64
>+    .endif
>         cmp             w5, #16
>         stp             q16, q17, [x0], #32     // store to dst_u
>         stp             q18, q19, [x1], #32     // store to dst_v
>@@ -229,7 +280,7 @@ endfunc
>         smaddl          x8, w4, w15, x8         // x8 += bv * b
>         asr             w8, w8, #9              // x8 >>= 9
>         sub             w5, w5, #1              // width--
>-        add             x3, x3, #3              // src += 3
>+        add             x3, x3, \element
>         strh            w8, [x1], #2            // store to dst_v
>         cbnz            w5, 2b
> 3:
>@@ -243,3 +294,11 @@ endfunc
> function ff_bgr24ToUV_neon, export=1
>         rgbToUV         bgr=1
> endfunc
>+
>+function ff_rgba32ToUV_neon, export=1
>+        rgbToUV         bgr=0, element=4
>+endfunc
>+
>+function ff_bgra32ToUV_neon, export=1
>+        rgbToUV         bgr=1, element=4
>+endfunc
>diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>index ce70dbedcc..8fe9fb11ac 100644
>--- a/libswscale/aarch64/swscale.c
>+++ b/libswscale/aarch64/swscale.c
>@@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
>                               uint32_t *coeffs, void *)
> 
> NEON_INPUT(bgr24);
>+NEON_INPUT(bgra32);
> NEON_INPUT(rgb24);
>+NEON_INPUT(rgba32);
> 
> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
> {
>@@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>             else
>                 c->chrToYV12 = ff_bgr24ToUV_neon;
>             break;
>+        case AV_PIX_FMT_BGRA:
>+            c->lumToYV12 = ff_bgra32ToY_neon;
>+            if (c->chrSrcHSubSample)
>+                c->chrToYV12 = ff_bgra32ToUV_half_neon;
>+            else
>+                c->chrToYV12 = ff_bgra32ToUV_neon;
>+            break;
>         case AV_PIX_FMT_RGB24:
>             c->lumToYV12 = ff_rgb24ToY_neon;
>             if (c->chrSrcHSubSample)
>@@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>             else
>                 c->chrToYV12 = ff_rgb24ToUV_neon;
>             break;
>+        case AV_PIX_FMT_RGBA:
>+            c->lumToYV12 = ff_rgba32ToY_neon;
>+            if (c->chrSrcHSubSample)
>+                c->chrToYV12 = ff_rgba32ToUV_half_neon;
>+            else
>+                c->chrToYV12 = ff_rgba32ToUV_neon;
>+            break;
>         default:
>             break;
>         }
Zhao Zhili June 19, 2024, 9:24 a.m. UTC | #3
> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote:
> 
> 
> 
> On 15 June 2024 at 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>> From: Zhao Zhili <zhilizhao@tencent.com>
>> 
>> Test on Apple M1 with kperf
>> 
>> bgra_to_uv_8_c: 13.4
>> bgra_to_uv_8_neon: 37.4
>> bgra_to_uv_128_c: 155.9
>> bgra_to_uv_128_neon: 91.7
>> bgra_to_uv_1080_c: 1173.2
>> bgra_to_uv_1080_neon: 822.7
>> bgra_to_uv_1920_c: 2078.2
>> bgra_to_uv_1920_neon: 1437.7
>> bgra_to_uv_half_8_c: 17.9
>> bgra_to_uv_half_8_neon: 37.4
>> bgra_to_uv_half_128_c: 103.9
>> bgra_to_uv_half_128_neon: 73.9
>> bgra_to_uv_half_1080_c: 850.2
>> bgra_to_uv_half_1080_neon: 484.2
>> bgra_to_uv_half_1920_c: 1479.2
>> bgra_to_uv_half_1920_neon: 824.2
>> bgra_to_y_8_c: 8.2
>> bgra_to_y_8_neon: 18.2
>> bgra_to_y_128_c: 101.4
>> bgra_to_y_128_neon: 74.9
>> bgra_to_y_1080_c: 739.4
>> bgra_to_y_1080_neon: 613.4
>> bgra_to_y_1920_c: 1298.7
>> bgra_to_y_1920_neon: 918.7
>> ---
>> libswscale/aarch64/input.S   | 81 +++++++++++++++++++++++++++++++-----
>> libswscale/aarch64/swscale.c | 16 +++++++
>> 2 files changed, 86 insertions(+), 11 deletions(-)
>> 
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> index 2b956fe5c2..37f1158504 100644
>> --- a/libswscale/aarch64/input.S
>> +++ b/libswscale/aarch64/input.S
>> @@ -20,8 +20,12 @@
>> 
>> #include "libavutil/aarch64/asm.S"
>> 
>> -.macro rgb_to_yuv_load_rgb src
>> +.macro rgb_to_yuv_load_rgb src, element=3
>> +    .if \element == 3
>>        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>> +    .else
>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>> +    .endif
>>        uxtl            v19.8h, v16.8b             // v19: r
>>        uxtl            v20.8h, v17.8b             // v20: g
>>        uxtl            v21.8h, v18.8b             // v21: b
>> @@ -43,7 +47,7 @@
>>        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>> .endm
>> 
>> -.macro rgbToY bgr
>> +.macro rgbToY bgr, element=3
> 
> AFAICT, you don't need a macro parameter for the component order. Just swap the red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue, but that's mostly negligible compared to the loop.

I’m not sure where to add the branch. Could you elaborate? Do you mean loading the coefficients first, like the following:

function ff_bgr24ToUV_half_neon, export=1
        ldr             w12, [x6, #12]
        ldr             w11, [x6, #16]
        ldr             w10, [x6, #20]
        ldr             w15, [x6, #24]
        ldr             w14, [x6, #28]
        ldr             w13, [x6, #32]
        rgbToUV_half
endfunc

> 
>>        cmp             w4, #0                  // check width > 0
>>    .if \bgr
>>        ldr             w12, [x5]               // w12: ry
>> @@ -67,11 +71,15 @@
>>        dup             v2.8h, w12
>>        b.lt            2f
>> 1:
>> -        rgb_to_yuv_load_rgb x1
>> +        rgb_to_yuv_load_rgb x1, \element
>>        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>        sub             w4, w4, #16             // width -= 16
>> +    .if \element == 3
>>        add             x1, x1, #48             // src += 48
>> +    .else
>> +        add             x1, x1, #64
>> +    .endif
>>        cmp             w4, #16                 // width >= 16 ?
>>        stp             q16, q17, [x0], #32     // store to dst
>>        b.ge            1b
>> @@ -86,7 +94,7 @@
>>        smaddl          x13, w15, w12, x13      // x13 += by * b
>>        asr             w13, w13, #9            // x13 >>= 9
>>        sub             w4, w4, #1              // width--
>> -        add             x1, x1, #3              // src += 3
>> +        add             x1, x1, \element
>>        strh            w13, [x0], #2           // store to dst
>>        cbnz            w4, 2b
>> 3:
>> @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1
>>        rgbToY          bgr=1
>> endfunc
>> 
>> +function ff_rgba32ToY_neon, export=1
>> +        rgbToY          bgr=0, element=4
>> +endfunc
>> +
>> +function ff_bgra32ToY_neon, export=1
>> +        rgbToY          bgr=1, element=4
>> +endfunc
>> +
>> .macro rgb_load_uv_coeff half, bgr
>>    .if \bgr
>>        ldr             w12, [x6, #12]
>> @@ -130,7 +146,7 @@ endfunc
>>        dup             v6.4s, w9
>> .endm
>> 
>> -.macro rgbToUV_half bgr
>> +.macro rgbToUV_half bgr, element=3
>>        cmp             w5, #0          // check width > 0
>>        b.le            3f
>> 
>> @@ -139,7 +155,11 @@ endfunc
>>        b.lt            2f
>>        // The following comments assume RGB order. The logic for RGB and BGR is the same.
>> 1:
>> +    .if \element == 3
>>        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>> +    .else
>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
>> +    .endif
>>        uaddlp          v19.8h, v16.16b         // v19: r
>>        uaddlp          v20.8h, v17.16b         // v20: g
>>        uaddlp          v21.8h, v18.16b         // v21: b
>> @@ -147,7 +167,11 @@ endfunc
>>        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>>        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>>        sub             w5, w5, #8              // width -= 8
>> -        add             x3, x3, #48             // src += 48
>> +    .if \element == 3
>> +        add             x3, x3, #48
>> +    .else
>> +        add             x3, x3, #64
>> +    .endif
>>        cmp             w5, #8                  // width >= 8 ?
>>        str             q16, [x0], #16          // store dst_u
>>        str             q17, [x1], #16          // store dst_v
>> @@ -155,9 +179,10 @@ endfunc
>>        cbz             w5, 3f
>> 2:
>>        ldrb            w2, [x3]                // w2: r1
>> -        ldrb            w4, [x3, #3]            // w4: r2
>> +        ldrb            w4, [x3, \element]      // w4: r2
>>        add             w2, w2, w4              // w2 = r1 + r2
>> 
>> +    .if \element == 3
>>        ldrb            w4, [x3, #1]            // w4: g1
>>        ldrb            w7, [x3, #4]            // w7: g2
>>        add             w4, w4, w7              // w4 = g1 + g2
>> @@ -165,6 +190,15 @@ endfunc
>>        ldrb            w7, [x3, #2]            // w7: b1
>>        ldrb            w8, [x3, #5]            // w8: b2
>>        add             w7, w7, w8              // w7 = b1 + b2
>> +    .else
>> +        ldrb            w4, [x3, #1]            // w4: g1
>> +        ldrb            w7, [x3, #5]            // w7: g2
>> +        add             w4, w4, w7              // w4 = g1 + g2
>> +
>> +        ldrb            w7, [x3, #2]            // w7: b1
>> +        ldrb            w8, [x3, #6]            // w8: b2
>> +        add             w7, w7, w8              // w7 = b1 + b2
>> +    .endif
>> 
>>        smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>>        smaddl          x8, w4, w11, x8         // dst_u += gu * g
>> @@ -177,7 +211,12 @@ endfunc
>>        smaddl          x8, w7, w15, x8         // dst_v += bv * b
>>        asr             x8, x8, #10             // dst_v >>= 10
>>        sub             w5, w5, #1
>> -        add             x3, x3, #6              // src += 6
>> +        ldrb            w4, [x3, #1]            // w4: g1
>> +    .if \element == 3
>> +        add             x3, x3, #6
>> +    .else
>> +        add             x3, x3, #8
>> +    .endif
>>        strh            w8, [x1], #2            // store dst_v
>>        cbnz            w5, 2b
>> 3:
>> @@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1
>>        rgbToUV_half    bgr=1
>> endfunc
>> 
>> -.macro rgbToUV bgr
>> +function ff_rgba32ToUV_half_neon, export=1
>> +        rgbToUV_half    bgr=0, element=4
>> +endfunc
>> +
>> +function ff_bgra32ToUV_half_neon, export=1
>> +        rgbToUV_half    bgr=1, element=4
>> +endfunc
>> +
>> +.macro rgbToUV bgr, element=3
>>        cmp             w5, #0                  // check width > 0
>>        b.le            3f
>> 
>> @@ -201,13 +248,17 @@ endfunc
>>        b.lt            2f
>>        // The following comments assume RGB order. The logic for RGB and BGR is the same.
>> 1:
>> -        rgb_to_yuv_load_rgb x3
>> +        rgb_to_yuv_load_rgb x3, \element
>>        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>        rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
>>        rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
>>        sub             w5, w5, #16
>> +    .if \element == 3
>>        add             x3, x3, #48             // src += 48
>> +    .else
>> +        add             x3, x3, #64
>> +    .endif
>>        cmp             w5, #16
>>        stp             q16, q17, [x0], #32     // store to dst_u
>>        stp             q18, q19, [x1], #32     // store to dst_v
>> @@ -229,7 +280,7 @@ endfunc
>>        smaddl          x8, w4, w15, x8         // x8 += bv * b
>>        asr             w8, w8, #9              // x8 >>= 9
>>        sub             w5, w5, #1              // width--
>> -        add             x3, x3, #3              // src += 3
>> +        add             x3, x3, \element
>>        strh            w8, [x1], #2            // store to dst_v
>>        cbnz            w5, 2b
>> 3:
>> @@ -243,3 +294,11 @@ endfunc
>> function ff_bgr24ToUV_neon, export=1
>>        rgbToUV         bgr=1
>> endfunc
>> +
>> +function ff_rgba32ToUV_neon, export=1
>> +        rgbToUV         bgr=0, element=4
>> +endfunc
>> +
>> +function ff_bgra32ToUV_neon, export=1
>> +        rgbToUV         bgr=1, element=4
>> +endfunc
>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>> index ce70dbedcc..8fe9fb11ac 100644
>> --- a/libswscale/aarch64/swscale.c
>> +++ b/libswscale/aarch64/swscale.c
>> @@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
>>                              uint32_t *coeffs, void *)
>> 
>> NEON_INPUT(bgr24);
>> +NEON_INPUT(bgra32);
>> NEON_INPUT(rgb24);
>> +NEON_INPUT(rgba32);
>> 
>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>> {
>> @@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>            else
>>                c->chrToYV12 = ff_bgr24ToUV_neon;
>>            break;
>> +        case AV_PIX_FMT_BGRA:
>> +            c->lumToYV12 = ff_bgra32ToY_neon;
>> +            if (c->chrSrcHSubSample)
>> +                c->chrToYV12 = ff_bgra32ToUV_half_neon;
>> +            else
>> +                c->chrToYV12 = ff_bgra32ToUV_neon;
>> +            break;
>>        case AV_PIX_FMT_RGB24:
>>            c->lumToYV12 = ff_rgb24ToY_neon;
>>            if (c->chrSrcHSubSample)
>> @@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>            else
>>                c->chrToYV12 = ff_rgb24ToUV_neon;
>>            break;
>> +        case AV_PIX_FMT_RGBA:
>> +            c->lumToYV12 = ff_rgba32ToY_neon;
>> +            if (c->chrSrcHSubSample)
>> +                c->chrToYV12 = ff_rgba32ToUV_half_neon;
>> +            else
>> +                c->chrToYV12 = ff_rgba32ToUV_neon;
>> +            break;
>>        default:
>>            break;
>>        }
Rémi Denis-Courmont June 19, 2024, 12:05 p.m. UTC | #4
On 19 June 2024 at 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>
>
>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote:
>> 
>> 
>> 
>> On 15 June 2024 at 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>> From: Zhao Zhili <zhilizhao@tencent.com>
>>> 
>>> Test on Apple M1 with kperf
>>> 
>>> bgra_to_uv_8_c: 13.4
>>> bgra_to_uv_8_neon: 37.4
>>> bgra_to_uv_128_c: 155.9
>>> bgra_to_uv_128_neon: 91.7
>>> bgra_to_uv_1080_c: 1173.2
>>> bgra_to_uv_1080_neon: 822.7
>>> bgra_to_uv_1920_c: 2078.2
>>> bgra_to_uv_1920_neon: 1437.7
>>> bgra_to_uv_half_8_c: 17.9
>>> bgra_to_uv_half_8_neon: 37.4
>>> bgra_to_uv_half_128_c: 103.9
>>> bgra_to_uv_half_128_neon: 73.9
>>> bgra_to_uv_half_1080_c: 850.2
>>> bgra_to_uv_half_1080_neon: 484.2
>>> bgra_to_uv_half_1920_c: 1479.2
>>> bgra_to_uv_half_1920_neon: 824.2
>>> bgra_to_y_8_c: 8.2
>>> bgra_to_y_8_neon: 18.2
>>> bgra_to_y_128_c: 101.4
>>> bgra_to_y_128_neon: 74.9
>>> bgra_to_y_1080_c: 739.4
>>> bgra_to_y_1080_neon: 613.4
>>> bgra_to_y_1920_c: 1298.7
>>> bgra_to_y_1920_neon: 918.7
>>> ---
>>> libswscale/aarch64/input.S   | 81 +++++++++++++++++++++++++++++++-----
>>> libswscale/aarch64/swscale.c | 16 +++++++
>>> 2 files changed, 86 insertions(+), 11 deletions(-)
>>> 
>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>> index 2b956fe5c2..37f1158504 100644
>>> --- a/libswscale/aarch64/input.S
>>> +++ b/libswscale/aarch64/input.S
>>> @@ -20,8 +20,12 @@
>>> 
>>> #include "libavutil/aarch64/asm.S"
>>> 
>>> -.macro rgb_to_yuv_load_rgb src
>>> +.macro rgb_to_yuv_load_rgb src, element=3
>>> +    .if \element == 3
>>>        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>>> +    .else
>>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>>> +    .endif
>>>        uxtl            v19.8h, v16.8b             // v19: r
>>>        uxtl            v20.8h, v17.8b             // v20: g
>>>        uxtl            v21.8h, v18.8b             // v21: b
>>> @@ -43,7 +47,7 @@
>>>        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>>> .endm
>>> 
>>> -.macro rgbToY bgr
>>> +.macro rgbToY bgr, element=3
>> 
>> AFAICT, you don't need a macro parameter for the component order. Just swap the red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue, but that's mostly negligible compared to the loop.
>
>I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following:
>
>function ff_bgr24ToUV_half_neon, export=1
>        ldr             w12, [x6, #12]
>        ldr             w11, [x6, #16]
>        ldr             w10, [x6, #20]
>        ldr             w15, [x6, #24]
>        ldr             w14, [x6, #28]
>        ldr             w13, [x6, #32]
>        rgbToUV_half
>endfunc

Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last.

By the way, I think you can use LDP instead of LDR.
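
For example, the six loads in your snippet could, I think, be folded into three
pairs (offsets permitting):

        ldp             w12, w11, [x6, #12]     // w12 = [x6 + 12], w11 = [x6 + 16]
        ldp             w10, w15, [x6, #20]     // w10 = [x6 + 20], w15 = [x6 + 24]
        ldp             w14, w13, [x6, #28]     // w14 = [x6 + 28], w13 = [x6 + 32]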

>
>> 
>>>        cmp             w4, #0                  // check width > 0
>>>    .if \bgr
>>>        ldr             w12, [x5]               // w12: ry
>>> @@ -67,11 +71,15 @@
>>>        dup             v2.8h, w12
>>>        b.lt            2f
>>> 1:
>>> -        rgb_to_yuv_load_rgb x1
>>> +        rgb_to_yuv_load_rgb x1, \element
>>>        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>>        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>>        sub             w4, w4, #16             // width -= 16
>>> +    .if \element == 3
>>>        add             x1, x1, #48             // src += 48
>>> +    .else
>>> +        add             x1, x1, #64
>>> +    .endif
>>>        cmp             w4, #16                 // width >= 16 ?
>>>        stp             q16, q17, [x0], #32     // store to dst
>>>        b.ge            1b
>>> @@ -86,7 +94,7 @@
>>>        smaddl          x13, w15, w12, x13      // x13 += by * b
>>>        asr             w13, w13, #9            // x13 >>= 9
>>>        sub             w4, w4, #1              // width--
>>> -        add             x1, x1, #3              // src += 3
>>> +        add             x1, x1, \element
>>>        strh            w13, [x0], #2           // store to dst
>>>        cbnz            w4, 2b
>>> 3:
>>> @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1
>>>        rgbToY          bgr=1
>>> endfunc
>>> 
>>> +function ff_rgba32ToY_neon, export=1
>>> +        rgbToY          bgr=0, element=4
>>> +endfunc
>>> +
>>> +function ff_bgra32ToY_neon, export=1
>>> +        rgbToY          bgr=1, element=4
>>> +endfunc
>>> +
>>> .macro rgb_load_uv_coeff half, bgr
>>>    .if \bgr
>>>        ldr             w12, [x6, #12]
>>> @@ -130,7 +146,7 @@ endfunc
>>>        dup             v6.4s, w9
>>> .endm
>>> 
>>> -.macro rgbToUV_half bgr
>>> +.macro rgbToUV_half bgr, element=3
>>>        cmp             w5, #0          // check width > 0
>>>        b.le            3f
>>> 
>>> @@ -139,7 +155,11 @@ endfunc
>>>        b.lt            2f
>>>        // The following comments assume RGB order. The logic for RGB and BGR is the same.
>>> 1:
>>> +    .if \element == 3
>>>        ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>>> +    .else
>>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
>>> +    .endif
>>>        uaddlp          v19.8h, v16.16b         // v19: r
>>>        uaddlp          v20.8h, v17.16b         // v20: g
>>>        uaddlp          v21.8h, v18.16b         // v21: b
>>> @@ -147,7 +167,11 @@ endfunc
>>>        rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>>>        rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>>>        sub             w5, w5, #8              // width -= 8
>>> -        add             x3, x3, #48             // src += 48
>>> +    .if \element == 3
>>> +        add             x3, x3, #48
>>> +    .else
>>> +        add             x3, x3, #64
>>> +    .endif
>>>        cmp             w5, #8                  // width >= 8 ?
>>>        str             q16, [x0], #16          // store dst_u
>>>        str             q17, [x1], #16          // store dst_v
>>> @@ -155,9 +179,10 @@ endfunc
>>>        cbz             w5, 3f
>>> 2:
>>>        ldrb            w2, [x3]                // w2: r1
>>> -        ldrb            w4, [x3, #3]            // w4: r2
>>> +        ldrb            w4, [x3, \element]      // w4: r2
>>>        add             w2, w2, w4              // w2 = r1 + r2
>>> 
>>> +    .if \element == 3
>>>        ldrb            w4, [x3, #1]            // w4: g1
>>>        ldrb            w7, [x3, #4]            // w7: g2
>>>        add             w4, w4, w7              // w4 = g1 + g2
>>> @@ -165,6 +190,15 @@ endfunc
>>>        ldrb            w7, [x3, #2]            // w7: b1
>>>        ldrb            w8, [x3, #5]            // w8: b2
>>>        add             w7, w7, w8              // w7 = b1 + b2
>>> +    .else
>>> +        ldrb            w4, [x3, #1]            // w4: g1
>>> +        ldrb            w7, [x3, #5]            // w7: g2
>>> +        add             w4, w4, w7              // w4 = g1 + g2
>>> +
>>> +        ldrb            w7, [x3, #2]            // w7: b1
>>> +        ldrb            w8, [x3, #6]            // w8: b2
>>> +        add             w7, w7, w8              // w7 = b1 + b2
>>> +    .endif
>>> 
>>>        smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>>>        smaddl          x8, w4, w11, x8         // dst_u += gu * g
>>> @@ -177,7 +211,12 @@ endfunc
>>>        smaddl          x8, w7, w15, x8         // dst_v += bv * b
>>>        asr             x8, x8, #10             // dst_v >>= 10
>>>        sub             w5, w5, #1
>>> -        add             x3, x3, #6              // src += 6
>>> +        ldrb            w4, [x3, #1]            // w4: g1
>>> +    .if \element == 3
>>> +        add             x3, x3, #6
>>> +    .else
>>> +        add             x3, x3, #8
>>> +    .endif
>>>        strh            w8, [x1], #2            // store dst_v
>>>        cbnz            w5, 2b
>>> 3:
>>> @@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1
>>>        rgbToUV_half    bgr=1
>>> endfunc
>>> 
>>> -.macro rgbToUV bgr
>>> +function ff_rgba32ToUV_half_neon, export=1
>>> +        rgbToUV_half    bgr=0, element=4
>>> +endfunc
>>> +
>>> +function ff_bgra32ToUV_half_neon, export=1
>>> +        rgbToUV_half    bgr=1, element=4
>>> +endfunc
>>> +
>>> +.macro rgbToUV bgr, element=3
>>>        cmp             w5, #0                  // check width > 0
>>>        b.le            3f
>>> 
>>> @@ -201,13 +248,17 @@ endfunc
>>>        b.lt            2f
>>>        // The following comments assume RGB order. The logic for RGB and BGR is the same.
>>> 1:
>>> -        rgb_to_yuv_load_rgb x3
>>> +        rgb_to_yuv_load_rgb x3, \element
>>>        rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>>        rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>>        rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
>>>        rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
>>>        sub             w5, w5, #16
>>> +    .if \element == 3
>>>        add             x3, x3, #48             // src += 48
>>> +    .else
>>> +        add             x3, x3, #64
>>> +    .endif
>>>        cmp             w5, #16
>>>        stp             q16, q17, [x0], #32     // store to dst_u
>>>        stp             q18, q19, [x1], #32     // store to dst_v
>>> @@ -229,7 +280,7 @@ endfunc
>>>        smaddl          x8, w4, w15, x8         // x8 += bv * b
>>>        asr             w8, w8, #9              // x8 >>= 9
>>>        sub             w5, w5, #1              // width--
>>> -        add             x3, x3, #3              // src += 3
>>> +        add             x3, x3, \element
>>>        strh            w8, [x1], #2            // store to dst_v
>>>        cbnz            w5, 2b
>>> 3:
>>> @@ -243,3 +294,11 @@ endfunc
>>> function ff_bgr24ToUV_neon, export=1
>>>        rgbToUV         bgr=1
>>> endfunc
>>> +
>>> +function ff_rgba32ToUV_neon, export=1
>>> +        rgbToUV         bgr=0, element=4
>>> +endfunc
>>> +
>>> +function ff_bgra32ToUV_neon, export=1
>>> +        rgbToUV         bgr=1, element=4
>>> +endfunc
>>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>>> index ce70dbedcc..8fe9fb11ac 100644
>>> --- a/libswscale/aarch64/swscale.c
>>> +++ b/libswscale/aarch64/swscale.c
>>> @@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
>>>                              uint32_t *coeffs, void *)
>>> 
>>> NEON_INPUT(bgr24);
>>> +NEON_INPUT(bgra32);
>>> NEON_INPUT(rgb24);
>>> +NEON_INPUT(rgba32);
>>> 
>>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>> {
>>> @@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>>            else
>>>                c->chrToYV12 = ff_bgr24ToUV_neon;
>>>            break;
>>> +        case AV_PIX_FMT_BGRA:
>>> +            c->lumToYV12 = ff_bgra32ToY_neon;
>>> +            if (c->chrSrcHSubSample)
>>> +                c->chrToYV12 = ff_bgra32ToUV_half_neon;
>>> +            else
>>> +                c->chrToYV12 = ff_bgra32ToUV_neon;
>>> +            break;
>>>        case AV_PIX_FMT_RGB24:
>>>            c->lumToYV12 = ff_rgb24ToY_neon;
>>>            if (c->chrSrcHSubSample)
>>> @@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>>            else
>>>                c->chrToYV12 = ff_rgb24ToUV_neon;
>>>            break;
>>> +        case AV_PIX_FMT_RGBA:
>>> +            c->lumToYV12 = ff_rgba32ToY_neon;
>>> +            if (c->chrSrcHSubSample)
>>> +                c->chrToYV12 = ff_rgba32ToUV_half_neon;
>>> +            else
>>> +                c->chrToYV12 = ff_rgba32ToUV_neon;
>>> +            break;
>>>        default:
>>>            break;
>>>        }
Zhao Zhili June 19, 2024, 5:15 p.m. UTC | #5
> On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote:
> 
> 
> 
> On 19 June 2024 at 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>> 
>> 
>>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote:
>>> 
>>> 
>>> 
>>> On 15 June 2024 at 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>>> From: Zhao Zhili <zhilizhao@tencent.com>
>>>> 
>>>> Test on Apple M1 with kperf
>>>> 
>>>> bgra_to_uv_8_c: 13.4
>>>> bgra_to_uv_8_neon: 37.4
>>>> bgra_to_uv_128_c: 155.9
>>>> bgra_to_uv_128_neon: 91.7
>>>> bgra_to_uv_1080_c: 1173.2
>>>> bgra_to_uv_1080_neon: 822.7
>>>> bgra_to_uv_1920_c: 2078.2
>>>> bgra_to_uv_1920_neon: 1437.7
>>>> bgra_to_uv_half_8_c: 17.9
>>>> bgra_to_uv_half_8_neon: 37.4
>>>> bgra_to_uv_half_128_c: 103.9
>>>> bgra_to_uv_half_128_neon: 73.9
>>>> bgra_to_uv_half_1080_c: 850.2
>>>> bgra_to_uv_half_1080_neon: 484.2
>>>> bgra_to_uv_half_1920_c: 1479.2
>>>> bgra_to_uv_half_1920_neon: 824.2
>>>> bgra_to_y_8_c: 8.2
>>>> bgra_to_y_8_neon: 18.2
>>>> bgra_to_y_128_c: 101.4
>>>> bgra_to_y_128_neon: 74.9
>>>> bgra_to_y_1080_c: 739.4
>>>> bgra_to_y_1080_neon: 613.4
>>>> bgra_to_y_1920_c: 1298.7
>>>> bgra_to_y_1920_neon: 918.7
>>>> ---
>>>> libswscale/aarch64/input.S   | 81 +++++++++++++++++++++++++++++++-----
>>>> libswscale/aarch64/swscale.c | 16 +++++++
>>>> 2 files changed, 86 insertions(+), 11 deletions(-)
>>>> 
>>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>>> index 2b956fe5c2..37f1158504 100644
>>>> --- a/libswscale/aarch64/input.S
>>>> +++ b/libswscale/aarch64/input.S
>>>> @@ -20,8 +20,12 @@
>>>> 
>>>> #include "libavutil/aarch64/asm.S"
>>>> 
>>>> -.macro rgb_to_yuv_load_rgb src
>>>> +.macro rgb_to_yuv_load_rgb src, element=3
>>>> +    .if \element == 3
>>>>       ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>>>> +    .else
>>>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>>>> +    .endif
>>>>       uxtl            v19.8h, v16.8b             // v19: r
>>>>       uxtl            v20.8h, v17.8b             // v20: g
>>>>       uxtl            v21.8h, v18.8b             // v21: b
>>>> @@ -43,7 +47,7 @@
>>>>       sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>>>> .endm
>>>> 
>>>> -.macro rgbToY bgr
>>>> +.macro rgbToY bgr, element=3
>>> 
>>> AFAICT, you don't need a macro parameter for the component order. Just swap the red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue, but that's mostly negligible compared to the loop.
>> 
>> I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following:
>> 
>> function ff_bgr24ToUV_half_neon, export=1
>>       ldr             w12, [x6, #12]
>>       ldr             w11, [x6, #16]
>>       ldr             w10, [x6, #20]
>>       ldr             w15, [x6, #24]
>>       ldr             w14, [x6, #28]
>>       ldr             w13, [x6, #32]
>>       rgbToUV_half
>> endfunc
> 
> Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last.
> 
> By the way, I think you can use LDP instead of LDR.

Patch v2 replaces LDR with LDP, so the "jump past the loading of red and blue coefficients" suggestion no longer applies.

> 
>> 
>>> 
>>>>       cmp             w4, #0                  // check width > 0
>>>>   .if \bgr
>>>>       ldr             w12, [x5]               // w12: ry
>>>> @@ -67,11 +71,15 @@
>>>>       dup             v2.8h, w12
>>>>       b.lt            2f
>>>> 1:
>>>> -        rgb_to_yuv_load_rgb x1
>>>> +        rgb_to_yuv_load_rgb x1, \element
>>>>       rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>>>       rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>>>       sub             w4, w4, #16             // width -= 16
>>>> +    .if \element == 3
>>>>       add             x1, x1, #48             // src += 48
>>>> +    .else
>>>> +        add             x1, x1, #64
>>>> +    .endif
>>>>       cmp             w4, #16                 // width >= 16 ?
>>>>       stp             q16, q17, [x0], #32     // store to dst
>>>>       b.ge            1b
>>>> @@ -86,7 +94,7 @@
>>>>       smaddl          x13, w15, w12, x13      // x13 += by * b
>>>>       asr             w13, w13, #9            // x13 >>= 9
>>>>       sub             w4, w4, #1              // width--
>>>> -        add             x1, x1, #3              // src += 3
>>>> +        add             x1, x1, \element
>>>>       strh            w13, [x0], #2           // store to dst
>>>>       cbnz            w4, 2b
>>>> 3:
>>>> @@ -101,6 +109,14 @@ function ff_bgr24ToY_neon, export=1
>>>>       rgbToY          bgr=1
>>>> endfunc
>>>> 
>>>> +function ff_rgba32ToY_neon, export=1
>>>> +        rgbToY          bgr=0, element=4
>>>> +endfunc
>>>> +
>>>> +function ff_bgra32ToY_neon, export=1
>>>> +        rgbToY          bgr=1, element=4
>>>> +endfunc
>>>> +
>>>> .macro rgb_load_uv_coeff half, bgr
>>>>   .if \bgr
>>>>       ldr             w12, [x6, #12]
>>>> @@ -130,7 +146,7 @@ endfunc
>>>>       dup             v6.4s, w9
>>>> .endm
>>>> 
>>>> -.macro rgbToUV_half bgr
>>>> +.macro rgbToUV_half bgr, element=3
>>>>       cmp             w5, #0          // check width > 0
>>>>       b.le            3f
>>>> 
>>>> @@ -139,7 +155,11 @@ endfunc
>>>>       b.lt            2f
>>>>       // The following comments assume RGB order. The logic for RGB and BGR is the same.
>>>> 1:
>>>> +    .if \element == 3
>>>>       ld3             { v16.16b, v17.16b, v18.16b }, [x3]
>>>> +    .else
>>>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
>>>> +    .endif
>>>>       uaddlp          v19.8h, v16.16b         // v19: r
>>>>       uaddlp          v20.8h, v17.16b         // v20: g
>>>>       uaddlp          v21.8h, v18.16b         // v21: b
>>>> @@ -147,7 +167,11 @@ endfunc
>>>>       rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
>>>>       rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
>>>>       sub             w5, w5, #8              // width -= 8
>>>> -        add             x3, x3, #48             // src += 48
>>>> +    .if \element == 3
>>>> +        add             x3, x3, #48
>>>> +    .else
>>>> +        add             x3, x3, #64
>>>> +    .endif
>>>>       cmp             w5, #8                  // width >= 8 ?
>>>>       str             q16, [x0], #16          // store dst_u
>>>>       str             q17, [x1], #16          // store dst_v
>>>> @@ -155,9 +179,10 @@ endfunc
>>>>       cbz             w5, 3f
>>>> 2:
>>>>       ldrb            w2, [x3]                // w2: r1
>>>> -        ldrb            w4, [x3, #3]            // w4: r2
>>>> +        ldrb            w4, [x3, \element]      // w4: r2
>>>>       add             w2, w2, w4              // w2 = r1 + r2
>>>> 
>>>> +    .if \element == 3
>>>>       ldrb            w4, [x3, #1]            // w4: g1
>>>>       ldrb            w7, [x3, #4]            // w7: g2
>>>>       add             w4, w4, w7              // w4 = g1 + g2
>>>> @@ -165,6 +190,15 @@ endfunc
>>>>       ldrb            w7, [x3, #2]            // w7: b1
>>>>       ldrb            w8, [x3, #5]            // w8: b2
>>>>       add             w7, w7, w8              // w7 = b1 + b2
>>>> +    .else
>>>> +        ldrb            w4, [x3, #1]            // w4: g1
>>>> +        ldrb            w7, [x3, #5]            // w7: g2
>>>> +        add             w4, w4, w7              // w4 = g1 + g2
>>>> +
>>>> +        ldrb            w7, [x3, #2]            // w7: b1
>>>> +        ldrb            w8, [x3, #6]            // w8: b2
>>>> +        add             w7, w7, w8              // w7 = b1 + b2
>>>> +    .endif
>>>> 
>>>>       smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
>>>>       smaddl          x8, w4, w11, x8         // dst_u += gu * g
>>>> @@ -177,7 +211,12 @@ endfunc
>>>>       smaddl          x8, w7, w15, x8         // dst_v += bv * b
>>>>       asr             x8, x8, #10             // dst_v >>= 10
>>>>       sub             w5, w5, #1
>>>> -        add             x3, x3, #6              // src += 6
>>>> +        ldrb            w4, [x3, #1]            // w4: g1
>>>> +    .if \element == 3
>>>> +        add             x3, x3, #6
>>>> +    .else
>>>> +        add             x3, x3, #8
>>>> +    .endif
>>>>       strh            w8, [x1], #2            // store dst_v
>>>>       cbnz            w5, 2b
>>>> 3:
>>>> @@ -192,7 +231,15 @@ function ff_bgr24ToUV_half_neon, export=1
>>>>       rgbToUV_half    bgr=1
>>>> endfunc
>>>> 
>>>> -.macro rgbToUV bgr
>>>> +function ff_rgba32ToUV_half_neon, export=1
>>>> +        rgbToUV_half    bgr=0, element=4
>>>> +endfunc
>>>> +
>>>> +function ff_bgra32ToUV_half_neon, export=1
>>>> +        rgbToUV_half    bgr=1, element=4
>>>> +endfunc
>>>> +
>>>> +.macro rgbToUV bgr, element=3
>>>>       cmp             w5, #0                  // check width > 0
>>>>       b.le            3f
>>>> 
>>>> @@ -201,13 +248,17 @@ endfunc
>>>>       b.lt            2f
>>>>       // The following comments assume RGB order. The logic for RGB and BGR is the same.
>>>> 1:
>>>> -        rgb_to_yuv_load_rgb x3
>>>> +        rgb_to_yuv_load_rgb x3, \element
>>>>       rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
>>>>       rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
>>>>       rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
>>>>       rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
>>>>       sub             w5, w5, #16
>>>> +    .if \element == 3
>>>>       add             x3, x3, #48             // src += 48
>>>> +    .else
>>>> +        add             x3, x3, #64
>>>> +    .endif
>>>>       cmp             w5, #16
>>>>       stp             q16, q17, [x0], #32     // store to dst_u
>>>>       stp             q18, q19, [x1], #32     // store to dst_v
>>>> @@ -229,7 +280,7 @@ endfunc
>>>>       smaddl          x8, w4, w15, x8         // x8 += bv * b
>>>>       asr             w8, w8, #9              // x8 >>= 9
>>>>       sub             w5, w5, #1              // width--
>>>> -        add             x3, x3, #3              // src += 3
>>>> +        add             x3, x3, \element
>>>>       strh            w8, [x1], #2            // store to dst_v
>>>>       cbnz            w5, 2b
>>>> 3:
>>>> @@ -243,3 +294,11 @@ endfunc
>>>> function ff_bgr24ToUV_neon, export=1
>>>>       rgbToUV         bgr=1
>>>> endfunc
>>>> +
>>>> +function ff_rgba32ToUV_neon, export=1
>>>> +        rgbToUV         bgr=0, element=4
>>>> +endfunc
>>>> +
>>>> +function ff_bgra32ToUV_neon, export=1
>>>> +        rgbToUV         bgr=1, element=4
>>>> +endfunc
>>>> diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
>>>> index ce70dbedcc..8fe9fb11ac 100644
>>>> --- a/libswscale/aarch64/swscale.c
>>>> +++ b/libswscale/aarch64/swscale.c
>>>> @@ -212,7 +212,9 @@ void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
>>>>                             uint32_t *coeffs, void *)
>>>> 
>>>> NEON_INPUT(bgr24);
>>>> +NEON_INPUT(bgra32);
>>>> NEON_INPUT(rgb24);
>>>> +NEON_INPUT(rgba32);
>>>> 
>>>> av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>>> {
>>>> @@ -233,6 +235,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>>>           else
>>>>               c->chrToYV12 = ff_bgr24ToUV_neon;
>>>>           break;
>>>> +        case AV_PIX_FMT_BGRA:
>>>> +            c->lumToYV12 = ff_bgra32ToY_neon;
>>>> +            if (c->chrSrcHSubSample)
>>>> +                c->chrToYV12 = ff_bgra32ToUV_half_neon;
>>>> +            else
>>>> +                c->chrToYV12 = ff_bgra32ToUV_neon;
>>>> +            break;
>>>>       case AV_PIX_FMT_RGB24:
>>>>           c->lumToYV12 = ff_rgb24ToY_neon;
>>>>           if (c->chrSrcHSubSample)
>>>> @@ -240,6 +249,13 @@ av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
>>>>           else
>>>>               c->chrToYV12 = ff_rgb24ToUV_neon;
>>>>           break;
>>>> +        case AV_PIX_FMT_RGBA:
>>>> +            c->lumToYV12 = ff_rgba32ToY_neon;
>>>> +            if (c->chrSrcHSubSample)
>>>> +                c->chrToYV12 = ff_rgba32ToUV_half_neon;
>>>> +            else
>>>> +                c->chrToYV12 = ff_rgba32ToUV_neon;
>>>> +            break;
>>>>       default:
>>>>           break;
>>>>       }
Martin Storsjö June 20, 2024, 12:49 p.m. UTC | #6
On Thu, 20 Jun 2024, Zhao Zhili wrote:

>> On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote:
>> 
>> On 19 June 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>> 
>>>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote:
>>>> 
>>>> 
>>>> 
>>>> On 15 June 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>>>> 
>>>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>>>> index 2b956fe5c2..37f1158504 100644
>>>>> --- a/libswscale/aarch64/input.S
>>>>> +++ b/libswscale/aarch64/input.S
>>>>> @@ -20,8 +20,12 @@
>>>>> 
>>>>> #include "libavutil/aarch64/asm.S"
>>>>> 
>>>>> -.macro rgb_to_yuv_load_rgb src
>>>>> +.macro rgb_to_yuv_load_rgb src, element=3
>>>>> +    .if \element == 3
>>>>>       ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>>>>> +    .else
>>>>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>>>>> +    .endif
>>>>>       uxtl            v19.8h, v16.8b             // v19: r
>>>>>       uxtl            v20.8h, v17.8b             // v20: g
>>>>>       uxtl            v21.8h, v18.8b             // v21: b
>>>>> @@ -43,7 +47,7 @@
>>>>>       sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>>>>> .endm
>>>>> 
>>>>> -.macro rgbToY bgr
>>>>> +.macro rgbToY bgr, element=3
>>>> 
>>>> AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop.
>>> 
>>> I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following:
>>> 
>>> function ff_bgr24ToUV_half_neon, export=1
>>>       ldr             w12, [x6, #12]
>>>       ldr             w11, [x6, #16]
>>>       ldr             w10, [x6, #20]
>>>       ldr             w15, [x6, #24]
>>>       ldr             w14, [x6, #28]
>>>       ldr             w13, [x6, #32]
>>>       rgbToUV_half
>>> endfunc
>> 
>> Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last.
>> 
>> By the way, I think you can use LDP instead of LDR.
>
> Patch v2 replace LDR by LDP, then the "jump past the loading of red and blue coefficients” doesn’t apply now.

Rémi's point is that you don't need to duplicate the whole function, when 
the only thing you're changing is a couple of instructions in the prologue 
of the function. By reusing the actual bulk of the function, you save on 
binary size.

One way of doing it looks like this:

diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S
index 1840f9fb01..eb870e4dca 100644
--- a/libavutil/aarch64/asm.S
+++ b/libavutil/aarch64/asm.S
@@ -256,5 +256,11 @@ ELF     .size   \name, . - \name
  #define JOIN(a, b) GLUE(a, b)
  #define X(s) JOIN(EXTERN_ASM, s)

+#ifdef __APPLE__
+#define L(x) L ## x
+#else
+#define L(x) .L ## x
+#endif
+
  #define x18 do_not_use_x18
  #define w18 do_not_use_w18
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 33afa34111..ce10d584c6 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -49,6 +49,7 @@ function ff_rgb24ToY_neon, export=1
          ldr             w12, [x5, #8]           // w12: by
          b.le            3f

+L(rgb24ToY_internal):
          mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
          movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
          dup             v6.4s, w9               // w9: const_offset
@@ -85,6 +86,14 @@ function ff_rgb24ToY_neon, export=1
          ret
  endfunc

+function ff_bgr24ToY_neon, export=1
+        cmp             w4, #0                  // check width > 0
+        ldp             w12, w11, [x5]          // w12: ry, w11: gy
+        ldr             w10, [x5, #8]           // w10: by
+        b.gt            L(rgb24ToY_internal)
+        ret
+endfunc
+
  .macro rgb24_load_uv_coeff half
          ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
          ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv


Another way looks like this:

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 33afa34111..5c4b7a41fd 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -43,12 +43,23 @@
          sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
  .endm

+function ff_bgr24ToY_neon, export=1
+        cmp             w4, #0                  // check width > 0
+        ldp             w12, w11, [x5]          // w12: ry, w11: gy
+        ldr             w10, [x5, #8]           // w10: by
+        b.gt            rgb24ToY_internal
+        ret
+endfunc
+
  function ff_rgb24ToY_neon, export=1
          cmp             w4, #0                  // check width > 0
          ldp             w10, w11, [x5]          // w10: ry, w11: gy
          ldr             w12, [x5, #8]           // w12: by
-        b.le            3f
+        b.gt            rgb24ToY_internal
+        ret
+endfunc

+function rgb24ToY_internal
          mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
          movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
          dup             v6.4s, w9               // w9: const_offset


Or if you want to be really adventurous, you can make a fallthrough:


diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 33afa34111..025a965b76 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -43,12 +43,22 @@
          sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
  .endm

+function ff_bgr24ToY_neon, export=1
+        cmp             w4, #0                  // check width > 0
+        ldp             w12, w11, [x5]          // w12: ry, w11: gy
+        ldr             w10, [x5, #8]           // w10: by
+        b.gt            rgb24ToY_internal
+        ret
+endfunc
+
  function ff_rgb24ToY_neon, export=1
          cmp             w4, #0                  // check width > 0
          ldp             w10, w11, [x5]          // w10: ry, w11: gy
          ldr             w12, [x5, #8]           // w12: by
          b.le            3f
+endfunc

+function rgb24ToY_internal
          mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
          movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
          dup             v6.4s, w9               // w9: const_offset


// Martin
Zhao Zhili June 20, 2024, 4:02 p.m. UTC | #7
> On Jun 20, 2024, at 20:49, Martin Storsjö <martin@martin.st> wrote:
> 
> On Thu, 20 Jun 2024, Zhao Zhili wrote:
> 
>>> On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote:
>>> On 19 June 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>>>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote:
>>>>> On 15 June 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>>>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>>>>> index 2b956fe5c2..37f1158504 100644
>>>>>> --- a/libswscale/aarch64/input.S
>>>>>> +++ b/libswscale/aarch64/input.S
>>>>>> @@ -20,8 +20,12 @@
>>>>>> #include "libavutil/aarch64/asm.S"
>>>>>> -.macro rgb_to_yuv_load_rgb src
>>>>>> +.macro rgb_to_yuv_load_rgb src, element=3
>>>>>> +    .if \element == 3
>>>>>>      ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>>>>>> +    .else
>>>>>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>>>>>> +    .endif
>>>>>>      uxtl            v19.8h, v16.8b             // v19: r
>>>>>>      uxtl            v20.8h, v17.8b             // v20: g
>>>>>>      uxtl            v21.8h, v18.8b             // v21: b
>>>>>> @@ -43,7 +47,7 @@
>>>>>>      sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>>>>>> .endm
>>>>>> -.macro rgbToY bgr
>>>>>> +.macro rgbToY bgr, element=3
>>>>> AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop.
>>>> I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following:
>>>> function ff_bgr24ToUV_half_neon, export=1
>>>>      ldr             w12, [x6, #12]
>>>>      ldr             w11, [x6, #16]
>>>>      ldr             w10, [x6, #20]
>>>>      ldr             w15, [x6, #24]
>>>>      ldr             w14, [x6, #28]
>>>>      ldr             w13, [x6, #32]
>>>>      rgbToUV_half
>>>> endfunc
>>> Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last.
>>> By the way, I think you can use LDP instead of LDR.
>> 
>> Patch v2 replace LDR by LDP, then the "jump past the loading of red and blue coefficients” doesn’t apply now.
> 
> Rémi's point is that you don't need to duplicate the whole function, when the only thing you're changing is a couple of instructions in the prologue of the function. By reusing the actual bulk of the function, you save on binary size.

Thank you for the detailed examples. I missed that the key point here is to save binary size.

I have seen a similar example of falling through in riscv/input_rvv.S. Is it well defined to jump to a local label in another function?

> 
> One way of doing it looks like this:
> 
> diff --git a/libavutil/aarch64/asm.S b/libavutil/aarch64/asm.S
> index 1840f9fb01..eb870e4dca 100644
> --- a/libavutil/aarch64/asm.S
> +++ b/libavutil/aarch64/asm.S
> @@ -256,5 +256,11 @@ ELF     .size   \name, . - \name
> #define JOIN(a, b) GLUE(a, b)
> #define X(s) JOIN(EXTERN_ASM, s)
> 
> +#ifdef __APPLE__
> +#define L(x) L ## x
> +#else
> +#define L(x) .L ## x
> +#endif
> +
> #define x18 do_not_use_x18
> #define w18 do_not_use_w18
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index 33afa34111..ce10d584c6 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -49,6 +49,7 @@ function ff_rgb24ToY_neon, export=1
>         ldr             w12, [x5, #8]           // w12: by
>         b.le            3f
> 
> +L(rgb24ToY_internal):
>         mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>         movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>         dup             v6.4s, w9               // w9: const_offset
> @@ -85,6 +86,14 @@ function ff_rgb24ToY_neon, export=1
>         ret
> endfunc
> 
> +function ff_bgr24ToY_neon, export=1
> +        cmp             w4, #0                  // check width > 0
> +        ldp             w12, w11, [x5]          // w12: ry, w11: gy
> +        ldr             w10, [x5, #8]           // w10: by
> +        b.gt            L(rgb24ToY_internal)
> +        ret
> +endfunc
> +
> .macro rgb24_load_uv_coeff half
>         ldp             w10, w11, [x6, #12]     // w10: ru, w11: gu
>         ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
> 
> 
> Another way looks like this:
> 
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index 33afa34111..5c4b7a41fd 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -43,12 +43,23 @@
>         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
> .endm
> 
> +function ff_bgr24ToY_neon, export=1
> +        cmp             w4, #0                  // check width > 0
> +        ldp             w12, w11, [x5]          // w12: ry, w11: gy
> +        ldr             w10, [x5, #8]           // w10: by
> +        b.gt            rgb24ToY_internal
> +        ret
> +endfunc
> +
> function ff_rgb24ToY_neon, export=1
>         cmp             w4, #0                  // check width > 0
>         ldp             w10, w11, [x5]          // w10: ry, w11: gy
>         ldr             w12, [x5, #8]           // w12: by
> -        b.le            3f
> +        b.gt            rgb24ToY_internal
> +        ret
> +endfunc
> 
> +function rgb24ToY_internal
>         mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>         movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>         dup             v6.4s, w9               // w9: const_offset
> 
> 
> Or if you want to be really adventurous, you can make a fallthrough:
> 
> 
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index 33afa34111..025a965b76 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -43,12 +43,22 @@
>         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
> .endm
> 
> +function ff_bgr24ToY_neon, export=1
> +        cmp             w4, #0                  // check width > 0
> +        ldp             w12, w11, [x5]          // w12: ry, w11: gy
> +        ldr             w10, [x5, #8]           // w10: by
> +        b.gt            rgb24ToY_internal
> +        ret
> +endfunc
> +
> function ff_rgb24ToY_neon, export=1
>         cmp             w4, #0                  // check width > 0
>         ldp             w10, w11, [x5]          // w10: ry, w11: gy
>         ldr             w12, [x5, #8]           // w12: by
>         b.le            3f
> +endfunc
> 
> +function rgb24ToY_internal
>         mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 7)
>         movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT - 1)
>         dup             v6.4s, w9               // w9: const_offset
> 
> 
> // Martin
Rémi Denis-Courmont June 20, 2024, 4:25 p.m. UTC | #8
On 20 June 2024 18:02:31 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>
>
>> On Jun 20, 2024, at 20:49, Martin Storsjö <martin@martin.st> wrote:
>> 
>> On Thu, 20 Jun 2024, Zhao Zhili wrote:
>> 
>>>> On Jun 19, 2024, at 20:05, Rémi Denis-Courmont <remi@remlab.net> wrote:
>>>> On 19 June 2024 11:24:28 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>>>>> On Jun 19, 2024, at 15:07, Rémi Denis-Courmont <remi@remlab.net> wrote:
>>>>>> On 15 June 2024 11:57:18 GMT+02:00, Zhao Zhili <quinkblack@foxmail.com> wrote:
>>>>>>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>>>>>>> index 2b956fe5c2..37f1158504 100644
>>>>>>> --- a/libswscale/aarch64/input.S
>>>>>>> +++ b/libswscale/aarch64/input.S
>>>>>>> @@ -20,8 +20,12 @@
>>>>>>> #include "libavutil/aarch64/asm.S"
>>>>>>> -.macro rgb_to_yuv_load_rgb src
>>>>>>> +.macro rgb_to_yuv_load_rgb src, element=3
>>>>>>> +    .if \element == 3
>>>>>>>      ld3             { v16.16b, v17.16b, v18.16b }, [\src]
>>>>>>> +    .else
>>>>>>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
>>>>>>> +    .endif
>>>>>>>      uxtl            v19.8h, v16.8b             // v19: r
>>>>>>>      uxtl            v20.8h, v17.8b             // v20: g
>>>>>>>      uxtl            v21.8h, v18.8b             // v21: b
>>>>>>> @@ -43,7 +47,7 @@
>>>>>>>      sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
>>>>>>> .endm
>>>>>>> -.macro rgbToY bgr
>>>>>>> +.macro rgbToY bgr, element=3
>>>>>> AFAICT, you don't need to a macro parameter for component order. Just swap red and blue coefficients in the prologue and then run the bit-exact same loops for bgr/rgb, rgba/bgra and argb/abgr. This adds one branch in the prologue but that's mostly negligible compared to the loop.
>>>>> I’m not sure where to add the branch. Could you elaborate? Do you mean load coefficients first like the following:
>>>>> function ff_bgr24ToUV_half_neon, export=1
>>>>>      ldr             w12, [x6, #12]
>>>>>      ldr             w11, [x6, #16]
>>>>>      ldr             w10, [x6, #20]
>>>>>      ldr             w15, [x6, #24]
>>>>>      ldr             w14, [x6, #28]
>>>>>      ldr             w13, [x6, #32]
>>>>>      rgbToUV_half
>>>>> endfunc
>>>> Hmm, no. You need to jump past the loading of red and blue coefficients. It might help to load green coefficients last.
>>>> By the way, I think you can use LDP instead of LDR.
>>> 
>>> Patch v2 replace LDR by LDP, then the "jump past the loading of red and blue coefficients” doesn’t apply now.
>> 
>> Rémi's point is that you don't need to duplicate the whole function, when the only thing you're changing is a couple of instructions in the prologue of the function. By reusing the actual bulk of the function, you save on binary size.
>
>Thank you for the detailed examples. I missed that the key point here is to save binary size.
>
>I have seen a similar example of falling through in riscv/input_rvv.S. Is it well defined to jump to a local label in another function?

Falling through is well defined so long as we don't use function-sections. Jumping to a label inside another function is well defined, as the assembler has no notion of what a function is.

`func` and `endfunc` are just FFmpeg macros for defining symbols.
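
To make the two idioms concrete, here is a minimal sketch in the spirit of Martin's examples above — two entry points sharing one body. All symbol names are hypothetical; this is illustrative only, not the posted patch:

function ff_example_a_neon, export=1
        cmp             w4, #0                  // width > 0 ?
        ldp             w12, w11, [x5]          // coefficients loaded in swapped order
        ldr             w10, [x5, #8]
        b.gt            example_body            // branch to a label defined in another function
        ret
endfunc

function ff_example_b_neon, export=1
        cmp             w4, #0                  // width > 0 ?
        ldp             w10, w11, [x5]          // coefficients loaded in natural order
        ldr             w12, [x5, #8]
        b.le            9f                      // skip the shared body when width <= 0
endfunc                                         // no trailing branch: execution falls through into example_body

function example_body
1:
        // ... shared processing loop, consuming w4 ...
        subs            w4, w4, #1
        b.gt            1b
9:
        ret
endfunc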
diff mbox series

Patch

diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 2b956fe5c2..37f1158504 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -20,8 +20,12 @@ 
 
 #include "libavutil/aarch64/asm.S"
 
-.macro rgb_to_yuv_load_rgb src
+.macro rgb_to_yuv_load_rgb src, element=3
+    .if \element == 3
         ld3             { v16.16b, v17.16b, v18.16b }, [\src]
+    .else
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [\src]
+    .endif
         uxtl            v19.8h, v16.8b             // v19: r
         uxtl            v20.8h, v17.8b             // v20: g
         uxtl            v21.8h, v18.8b             // v21: b
@@ -43,7 +47,7 @@ 
         sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = dst2 >> right_shift
 .endm
 
-.macro rgbToY bgr
+.macro rgbToY bgr, element=3
         cmp             w4, #0                  // check width > 0
     .if \bgr
         ldr             w12, [x5]               // w12: ry
@@ -67,11 +71,15 @@ 
         dup             v2.8h, w12
         b.lt            2f
 1:
-        rgb_to_yuv_load_rgb x1
+        rgb_to_yuv_load_rgb x1, \element
         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
         sub             w4, w4, #16             // width -= 16
+    .if \element == 3
         add             x1, x1, #48             // src += 48
+    .else
+        add             x1, x1, #64
+    .endif
         cmp             w4, #16                 // width >= 16 ?
         stp             q16, q17, [x0], #32     // store to dst
         b.ge            1b
@@ -86,7 +94,7 @@ 
         smaddl          x13, w15, w12, x13      // x13 += by * b
         asr             w13, w13, #9            // x13 >>= 9
         sub             w4, w4, #1              // width--
-        add             x1, x1, #3              // src += 3
+        add             x1, x1, \element
         strh            w13, [x0], #2           // store to dst
         cbnz            w4, 2b
 3:
@@ -101,6 +109,14 @@  function ff_bgr24ToY_neon, export=1
         rgbToY          bgr=1
 endfunc
 
+function ff_rgba32ToY_neon, export=1
+        rgbToY          bgr=0, element=4
+endfunc
+
+function ff_bgra32ToY_neon, export=1
+        rgbToY          bgr=1, element=4
+endfunc
+
 .macro rgb_load_uv_coeff half, bgr
     .if \bgr
         ldr             w12, [x6, #12]
@@ -130,7 +146,7 @@  endfunc
         dup             v6.4s, w9
 .endm
 
-.macro rgbToUV_half bgr
+.macro rgbToUV_half bgr, element=3
         cmp             w5, #0          // check width > 0
         b.le            3f
 
@@ -139,7 +155,11 @@  endfunc
         b.lt            2f
         // The following comments assume RGB order. The logic for RGB and BGR is the same.
 1:
+    .if \element == 3
         ld3             { v16.16b, v17.16b, v18.16b }, [x3]
+    .else
+        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
+    .endif
         uaddlp          v19.8h, v16.16b         // v19: r
         uaddlp          v20.8h, v17.16b         // v20: g
         uaddlp          v21.8h, v18.16b         // v21: b
@@ -147,7 +167,11 @@  endfunc
         rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2, #10
         rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
         sub             w5, w5, #8              // width -= 8
-        add             x3, x3, #48             // src += 48
+    .if \element == 3
+        add             x3, x3, #48
+    .else
+        add             x3, x3, #64
+    .endif
         cmp             w5, #8                  // width >= 8 ?
         str             q16, [x0], #16          // store dst_u
         str             q17, [x1], #16          // store dst_v
@@ -155,9 +179,10 @@  endfunc
         cbz             w5, 3f
 2:
         ldrb            w2, [x3]                // w2: r1
-        ldrb            w4, [x3, #3]            // w4: r2
+        ldrb            w4, [x3, \element]      // w4: r2
         add             w2, w2, w4              // w2 = r1 + r2
 
+    .if \element == 3
         ldrb            w4, [x3, #1]            // w4: g1
         ldrb            w7, [x3, #4]            // w7: g2
         add             w4, w4, w7              // w4 = g1 + g2
@@ -165,6 +190,15 @@  endfunc
         ldrb            w7, [x3, #2]            // w7: b1
         ldrb            w8, [x3, #5]            // w8: b2
         add             w7, w7, w8              // w7 = b1 + b2
+    .else
+        ldrb            w4, [x3, #1]            // w4: g1
+        ldrb            w7, [x3, #5]            // w7: g2
+        add             w4, w4, w7              // w4 = g1 + g2
+
+        ldrb            w7, [x3, #2]            // w7: b1
+        ldrb            w8, [x3, #6]            // w8: b2
+        add             w7, w7, w8              // w7 = b1 + b2
+    .endif
 
         smaddl          x8, w2, w10, x9         // dst_u = ru * r + const_offset
         smaddl          x8, w4, w11, x8         // dst_u += gu * g
@@ -177,7 +211,12 @@  endfunc
         smaddl          x8, w7, w15, x8         // dst_v += bv * b
         asr             x8, x8, #10             // dst_v >>= 10
         sub             w5, w5, #1
-        add             x3, x3, #6              // src += 6
+        ldrb            w4, [x3, #1]            // w4: g1
+    .if \element == 3
+        add             x3, x3, #6
+    .else
+        add             x3, x3, #8
+    .endif
         strh            w8, [x1], #2            // store dst_v
         cbnz            w5, 2b
 3:
@@ -192,7 +231,15 @@  function ff_bgr24ToUV_half_neon, export=1
         rgbToUV_half    bgr=1
 endfunc
 
-.macro rgbToUV bgr
+function ff_rgba32ToUV_half_neon, export=1
+        rgbToUV_half    bgr=0, element=4
+endfunc
+
+function ff_bgra32ToUV_half_neon, export=1
+        rgbToUV_half    bgr=1, element=4
+endfunc
+
+.macro rgbToUV bgr, element=3
         cmp             w5, #0                  // check width > 0
         b.le            3f
 
@@ -201,13 +248,17 @@  endfunc
         b.lt            2f
         // The following comments assume RGB order. The logic for RGB and BGR is the same.
 1:
-        rgb_to_yuv_load_rgb x3
+        rgb_to_yuv_load_rgb x3, \element
         rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
         rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
         rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
         rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
         sub             w5, w5, #16
+    .if \element == 3
         add             x3, x3, #48             // src += 48
+    .else
+        add             x3, x3, #64
+    .endif
         cmp             w5, #16
         stp             q16, q17, [x0], #32     // store to dst_u
         stp             q18, q19, [x1], #32     // store to dst_v
@@ -229,7 +280,7 @@  endfunc
         smaddl          x8, w4, w15, x8         // x8 += bv * b
         asr             w8, w8, #9              // x8 >>= 9
         sub             w5, w5, #1              // width--
-        add             x3, x3, #3              // src += 3
+        add             x3, x3, \element
         strh            w8, [x1], #2            // store to dst_v
         cbnz            w5, 2b
 3:
@@ -243,3 +294,11 @@  endfunc
 function ff_bgr24ToUV_neon, export=1
         rgbToUV         bgr=1
 endfunc
+
+function ff_rgba32ToUV_neon, export=1
+        rgbToUV         bgr=0, element=4
+endfunc
+
+function ff_bgra32ToUV_neon, export=1
+        rgbToUV         bgr=1, element=4
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index ce70dbedcc..8fe9fb11ac 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -212,7 +212,9 @@  void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
                               uint32_t *coeffs, void *)
 
 NEON_INPUT(bgr24);
+NEON_INPUT(bgra32);
 NEON_INPUT(rgb24);
+NEON_INPUT(rgba32);
 
 av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
 {
@@ -233,6 +235,13 @@  av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
             else
                 c->chrToYV12 = ff_bgr24ToUV_neon;
             break;
+        case AV_PIX_FMT_BGRA:
+            c->lumToYV12 = ff_bgra32ToY_neon;
+            if (c->chrSrcHSubSample)
+                c->chrToYV12 = ff_bgra32ToUV_half_neon;
+            else
+                c->chrToYV12 = ff_bgra32ToUV_neon;
+            break;
         case AV_PIX_FMT_RGB24:
             c->lumToYV12 = ff_rgb24ToY_neon;
             if (c->chrSrcHSubSample)
@@ -240,6 +249,13 @@  av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)
             else
                 c->chrToYV12 = ff_rgb24ToUV_neon;
             break;
+        case AV_PIX_FMT_RGBA:
+            c->lumToYV12 = ff_rgba32ToY_neon;
+            if (c->chrSrcHSubSample)
+                c->chrToYV12 = ff_rgba32ToUV_half_neon;
+            else
+                c->chrToYV12 = ff_rgba32ToUV_neon;
+            break;
         default:
             break;
         }