[FFmpeg-devel,v1,6/6] swscale: Add aarch64 functions for RGB24->YUV420P

Message ID 20230820151022.2204421-7-jc@kynesim.co.uk
State New
Series swscale: Add dedicated RGB->YUV unscaled functions & aarch64 asm

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

John Cox Aug. 20, 2023, 3:10 p.m. UTC
Neon RGB24->YUV420P and BGR24->YUV420P functions. They work on 16-pixel
blocks and can handle any width or height, though for widths below 32 or so
the C code is likely faster.
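
For reference, the per-pixel arithmetic the asm performs is roughly the
scalar C below (a sketch only: NEON rounding/saturation details are omitted,
the pointers are assumed already advanced to the current row, x/y are
hypothetical pixel/line indices, and the rgb2yuv coefficient order
RY,GY,BY, RU,GU,BU, RV,GV,BV is assumed, which is the layout the ld3 in the
asm relies on). Chroma is taken from every second pixel of every second
line, with no averaging:

    const int ry = rgb2yuv[0], gy = rgb2yuv[1], by = rgb2yuv[2];
    const int ru = rgb2yuv[3], gu = rgb2yuv[4], bu = rgb2yuv[5];
    const int rv = rgb2yuv[6], gv = rgb2yuv[7], bv = rgb2yuv[8];

    const uint8_t *p = src + x * 3;
    int r = p[0], g = p[1], b = p[2];   // byte order reversed for BGR24

    ydst[x] = ((ry * r + gy * g + by * b) >> 15) + 16;   // 15 == RGB2YUV_SHIFT
    if (!(y & 1) && !(x & 1)) {
        udst[x >> 1] = ((ru * r + gu * g + bu * b) >> 15) + 128;
        vdst[x >> 1] = ((rv * r + gv * g + bv * b) >> 15) + 128;
    }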

Signed-off-by: John Cox <jc@kynesim.co.uk>
---
 libswscale/aarch64/rgb2rgb.c      |   8 +
 libswscale/aarch64/rgb2rgb_neon.S | 356 ++++++++++++++++++++++++++++++
 2 files changed, 364 insertions(+)
Patch

diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
index a9bf6ff9e0..b2d68c1df3 100644
--- a/libswscale/aarch64/rgb2rgb.c
+++ b/libswscale/aarch64/rgb2rgb.c
@@ -30,6 +30,12 @@ 
 void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
                               uint8_t *dest, int width, int height,
                               int src1Stride, int src2Stride, int dstStride);
+void ff_bgr24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                         uint8_t *vdst, int width, int height, int lumStride,
+                         int chromStride, int srcStride, int32_t *rgb2yuv);
+void ff_rgb24toyv12_neon(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                         uint8_t *vdst, int width, int height, int lumStride,
+                         int chromStride, int srcStride, int32_t *rgb2yuv);
 
 av_cold void rgb2rgb_init_aarch64(void)
 {
@@ -37,5 +43,7 @@  av_cold void rgb2rgb_init_aarch64(void)
 
     if (have_neon(cpu_flags)) {
         interleaveBytes = ff_interleave_bytes_neon;
+        ff_rgb24toyv12 = ff_rgb24toyv12_neon;
+        ff_bgr24toyv12 = ff_bgr24toyv12_neon;
     }
 }
diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
index d81110ec57..b15e69a3bd 100644
--- a/libswscale/aarch64/rgb2rgb_neon.S
+++ b/libswscale/aarch64/rgb2rgb_neon.S
@@ -77,3 +77,359 @@  function ff_interleave_bytes_neon, export=1
 0:
         ret
 endfunc
+
+// Expand r2/g2/b2 into r0+r1 / g0+g1 / b0+b1
+.macro XRGB3Y r0, g0, b0, r1, g1, b1, r2, g2, b2
+        uxtl            \r0\().8h, \r2\().8b
+        uxtl            \g0\().8h, \g2\().8b
+        uxtl            \b0\().8h, \b2\().8b
+
+        uxtl2           \r1\().8h, \r2\().16b
+        uxtl2           \g1\().8h, \g2\().16b
+        uxtl2           \b1\().8h, \b2\().16b
+.endm
+
+// Expand r2/g2/b2 into r0+r1 / g0+g1 / b0+b1
+// and keep only every other element (zero-extended) in r2/g2/b2 for chroma
+.macro XRGB3YC r0, g0, b0, r1, g1, b1, r2, g2, b2
+        XRGB3Y          \r0, \g0, \b0, \r1, \g1, \b1, \r2, \g2, \b2
+
+        bic             \r2\().8h, #0xff, LSL #8
+        bic             \g2\().8h, #0xff, LSL #8
+        bic             \b2\().8h, #0xff, LSL #8
+.endm
+
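+// d0:d1 = s0*c0 + s1*c1 + s2*c2 as signed 32-bit (lo half in d0, hi half in d1)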
+.macro SMLAL3 d0, d1, s0, s1, s2, c0, c1, c2
+        smull           \d0\().4s, \s0\().4h, \c0
+        smlal           \d0\().4s, \s1\().4h, \c1
+        smlal           \d0\().4s, \s2\().4h, \c2
+        smull2          \d1\().4s, \s0\().8h, \c0
+        smlal2          \d1\().4s, \s1\().8h, \c1
+        smlal2          \d1\().4s, \s2\().8h, \c2
+.endm
+
+// d0 may be s0
+// s0, s2 corrupted
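+// Result: ((acc >> 12) + 128) >> 3 with rounding and unsigned saturation,
+// i.e. (acc >> 15) + 16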
+.macro SHRN_Y d0, s0, s1, s2, s3, k128h
+        shrn            \s0\().4h, \s0\().4s, #12
+        shrn2           \s0\().8h, \s1\().4s, #12
+        add             \s0\().8h, \s0\().8h, \k128h\().8h     // +128 (>> 3 = 16)
+        sqrshrun        \d0\().8b, \s0\().8h, #3
+        shrn            \s2\().4h, \s2\().4s, #12
+        shrn2           \s2\().8h, \s3\().4s, #12
+        add             \s2\().8h, \s2\().8h, \k128h\().8h
+        sqrshrun2       \d0\().16b, \s2\().8h, #3
+.endm
+
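+// Narrow a chroma accumulator pair: (acc >> 15) with rounding and signed
+// saturation, then biased by +128 into the unsigned range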
+.macro SHRN_C d0, s0, s1, k128b
+        shrn            \s0\().4h, \s0\().4s, #14
+        shrn2           \s0\().8h, \s1\().4s, #14
+        sqrshrn         \s0\().8b, \s0\().8h, #1
+        add             \d0\().8b, \s0\().8b, \k128b\().8b     // +128
+.endm
+
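+// Store 2 (STB2V) / 4 (STB4V) consecutive byte lanes of \s0, starting at
+// lane \n, to the address in \a (post-incremented)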
+.macro STB2V s0, n, a
+        st1             {\s0\().b}[(\n+0)], [\a], #1
+        st1             {\s0\().b}[(\n+1)], [\a], #1
+.endm
+
+.macro STB4V s0, n, a
+        STB2V           \s0, (\n+0), \a
+        STB2V           \s0, (\n+2), \a
+.endm
+
+
+// void ff_bgr24toyv12_neon(
+//              const uint8_t *src,             // x0
+//              uint8_t *ydst,                  // x1
+//              uint8_t *udst,                  // x2
+//              uint8_t *vdst,                  // x3
+//              int width,                      // w4
+//              int height,                     // w5
+//              int lumStride,                  // w6
+//              int chromStride,                // w7
+//              int srcStride,                  // [sp, #0]
+//              int32_t *rgb2yuv);              // [sp, #8]
+
+function ff_bgr24toyv12_neon, export=1
+        ldr             x15, [sp, #8]
+        ld3             {v3.s, v4.s, v5.s}[0], [x15], #12
+        ld3             {v3.s, v4.s, v5.s}[1], [x15], #12
+        ld3             {v3.s, v4.s, v5.s}[2], [x15]
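+        // Swap the R and B coefficient registers (v3 <-> v5) so the RGB24
+        // body below can be shared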
+        mov             v6.16b, v3.16b
+        mov             v3.16b, v5.16b
+        mov             v5.16b, v6.16b
+        b               99f
+endfunc
+
+// void ff_rgb24toyv12_neon(
+//              const uint8_t *src,             // x0
+//              uint8_t *ydst,                  // x1
+//              uint8_t *udst,                  // x2
+//              uint8_t *vdst,                  // x3
+//              int width,                      // w4
+//              int height,                     // w5
+//              int lumStride,                  // w6
+//              int chromStride,                // w7
+//              int srcStride,                  // [sp, #0]
+//              int32_t *rgb2yuv);              // [sp, #8] (same offset on Darwin/Mac)
+
+// regs
+// v0-2         Src bytes - reused as chroma src
+// v3-5         Coeffs (packed very inefficiently - could be squashed)
+// v6           128h
+// v7           128b
+// v8-15        Reserved
+// v16-18       Lo Src expanded as H
+// v19          -
+// v20-22       Hi Src expanded as H
+// v23          -
+// v24          U out
+// v25          U tmp
+// v26          Y out
+// v27-29       Y tmp
+// v30          V out
+// v31          V tmp
+
+function ff_rgb24toyv12_neon, export=1
+        ldr             x15, [sp, #8]
+        ld3             {v3.s, v4.s, v5.s}[0], [x15], #12
+        ld3             {v3.s, v4.s, v5.s}[1], [x15], #12
+        ld3             {v3.s, v4.s, v5.s}[2], [x15]
+
+99:
+        ldr             w14, [sp, #0]
+        movi            v7.8b, #128
+        uxtl            v6.8h, v7.8b
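+        // v7 = 128 as bytes (chroma bias), v6 = 128 as halfwords (luma bias
+        // applied before the final >> 3)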
+        // Bail out immediately if there is nothing to do
+        cmp             w4, #0
+        b.le            90f
+        cmp             w5, #0
+        b.le            90f
+        // If width % 16 != 0, subtract 16 so the main loop runs one fewer
+        // time and the remainder is handled in the tail
+        tst             w4, #15
+        b.eq            1f
+        sub             w4, w4, #16
+1:
+
+// -------------------- Even line body - YUV
+11:
+        subs            w9,  w4, #0
+        mov             x10, x0
+        mov             x11, x1
+        mov             x12, x2
+        mov             x13, x3
+        b.lt            12f
+
+        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
+        subs            w9, w9, #16
+        b.le            13f
+
+10:
+        XRGB3YC         v16, v17, v18,  v20, v21, v22,  v0, v1, v2
+
+        // Testing shows it is faster to group the smull/smlal ops together
+        // rather than interleave them between channels; the shift/add
+        // sections also seem to perform better when not interleaved
+
+        // Y0
+        SMLAL3          v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0]
+        // Y1
+        SMLAL3          v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0]
+        SHRN_Y          v26, v26, v27, v28, v29, v6
+
+        // U
+        // Vector subscript *2 as we loaded into S but are only using H
+        SMLAL3          v24, v25, v0, v1, v2, v3.h[2], v4.h[2], v5.h[2]
+
+        // V
+        SMLAL3          v30, v31, v0, v1, v2, v3.h[4], v4.h[4], v5.h[4]
+
+        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
+
+        SHRN_C          v24, v24, v25, v7
+        SHRN_C          v30, v30, v31, v7
+
+        subs            w9, w9, #16
+
+        st1             {v26.16b}, [x11], #16
+        st1             {v24.8b}, [x12], #8
+        st1             {v30.8b}, [x13], #8
+
+        b.gt            10b
+
+// -------------------- Even line tail - YUV
+// If width % 16 == 0 this simply runs once with the preloaded RGB
+// Otherwise it consumes the preloaded block and then handles the remaining pixels
+
+13:
+        // Body is simple copy of main loop body minus preload
+
+        XRGB3YC         v16, v17, v18,  v20, v21, v22,  v0, v1, v2
+        // Y0
+        SMLAL3          v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0]
+        // Y1
+        SMLAL3          v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0]
+        SHRN_Y          v26, v26, v27, v28, v29, v6
+        // U
+        SMLAL3          v24, v25, v0, v1, v2, v3.h[2], v4.h[2], v5.h[2]
+        // V
+        SMLAL3          v30, v31, v0, v1, v2, v3.h[4], v4.h[4], v5.h[4]
+
+        cmp             w9, #-16
+
+        SHRN_C          v24, v24, v25, v7
+        SHRN_C          v30, v30, v31, v7
+
+        // Here:
+        // w9 == 0      width % 16 == 0, tail done
+        // w9 > -16     1st tail done (16 pels), remainder still to go
+        // w9 == -16    shouldn't happen
+        // w9 > -32     2nd tail done
+        // w9 <= -32    shouldn't happen
+
+        b.lt            2f
+        st1             {v26.16b}, [x11], #16
+        st1             {v24.8b}, [x12], #8
+        st1             {v30.8b}, [x13], #8
+        cbz             w9, 3f
+
+12:
+        sub             w9, w9, #16
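+        // The low 4 bits of w9 now hold the remaining pixel count (1-15);
+        // load just those lanes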
+
+        tbz             w9, #3, 1f
+        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
+1:      tbz             w9, #2, 1f
+        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
+        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
+        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
+        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
+1:      tbz             w9, #1, 1f
+        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
+        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
+1:      tbz             w9, #0, 13b
+        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
+        b               13b
+
+2:
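+        // Store the remaining 1-15 luma bytes and the matching chroma bytes,
+        // again steered by the low bits of w9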
+        tbz             w9, #3, 1f
+        st1             {v26.8b},    [x11], #8
+        STB4V           v24, 0, x12
+        STB4V           v30, 0, x13
+1:      tbz             w9, #2, 1f
+        STB4V           v26, 8,  x11
+        STB2V           v24, 4, x12
+        STB2V           v30, 4, x13
+1:      tbz             w9, #1, 1f
+        STB2V           v26, 12, x11
+        st1             {v24.b}[6],  [x12], #1
+        st1             {v30.b}[6],  [x13], #1
+1:      tbz             w9, #0, 1f
+        st1             {v26.b}[14], [x11]
+        st1             {v24.b}[7],  [x12]
+        st1             {v30.b}[7],  [x13]
+1:
+3:
+
+// -------------------- Odd line body - Y only
+
+        subs            w5, w5, #1
+        b.eq            90f
+
+        subs            w9,  w4, #0
+        add             x0, x0, w14, sxtx
+        add             x1, x1, w6, sxtx
+        mov             x10, x0
+        mov             x11, x1
+        b.lt            12f
+
+        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
+        subs            w9, w9, #16
+        b.le            13f
+
+10:
+        XRGB3Y          v16, v17, v18,  v20, v21, v22,  v0, v1, v2
+        // Y0
+        SMLAL3          v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0]
+        // Y1
+        SMLAL3          v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0]
+
+        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
+
+        SHRN_Y          v26, v26, v27, v28, v29, v6
+
+        subs            w9, w9, #16
+
+        st1             {v26.16b}, [x11], #16
+
+        b.gt            10b
+
+// -------------------- Odd line tail - Y
+// If width % 16 == 0 this simply runs once with the preloaded RGB
+// Otherwise it consumes the preloaded block and then handles the remaining pixels
+
+13:
+        // Body is simple copy of main loop body minus preload
+
+        XRGB3Y          v16, v17, v18,  v20, v21, v22,  v0, v1, v2
+        // Y0
+        SMLAL3          v26, v27, v16, v17, v18, v3.h[0], v4.h[0], v5.h[0]
+        // Y1
+        SMLAL3          v28, v29, v20, v21, v22, v3.h[0], v4.h[0], v5.h[0]
+
+        cmp             w9, #-16
+
+        SHRN_Y          v26, v26, v27, v28, v29, v6
+
+        // Here:
+        // w9 == 0      width % 16 == 0, tail done
+        // w9 > -16     1st tail done (16 pels), remainder still to go
+        // w9 == -16    shouldn't happen
+        // w9 > -32     2nd tail done
+        // w9 <= -32    shouldn't happen
+
+        b.lt            2f
+        st1             {v26.16b}, [x11], #16
+        cbz             w9, 3f
+
+12:
+        sub             w9, w9, #16
+
+        tbz             w9, #3, 1f
+        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
+1:      tbz             w9, #2, 1f
+        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
+        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
+        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
+        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
+1:      tbz             w9, #1, 1f
+        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
+        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
+1:      tbz             w9, #0, 13b
+        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
+        b               13b
+
+2:
+        tbz             w9, #3, 1f
+        st1             {v26.8b},    [x11], #8
+1:      tbz             w9, #2, 1f
+        STB4V           v26, 8,  x11
+1:      tbz             w9, #1, 1f
+        STB2V           v26, 12, x11
+1:      tbz             w9, #0, 1f
+        st1             {v26.b}[14], [x11]
+1:
+3:
+
+// ------------------- Loop to start
+
+        add             x0, x0, w14, sxtx
+        add             x1, x1, w6, sxtx
+        add             x2, x2, w7, sxtx
+        add             x3, x3, w7, sxtx
+        subs            w5, w5, #1
+        b.gt            11b
+90:
+        ret
+endfunc