Message ID | 20210816094545.448283-2-mnitenko@gmail.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/2] lavc/aarch64: move transpose_4x8H to neon.S | expand |
Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
On Mon, 16 Aug 2021, Mikhail Nitenko wrote: > Benchmarks: A53 A72 > h264_h_loop_filter_chroma422_10bpp_c: 277.5 114.2 > h264_h_loop_filter_chroma422_10bpp_neon: 109.7 81.7 > h264_h_loop_filter_chroma_10bpp_c: 165.0 75.5 > h264_h_loop_filter_chroma_10bpp_neon: 121.2 74.7 > h264_h_loop_filter_chroma_intra422_10bpp_c: 324.2 124.2 > h264_h_loop_filter_chroma_intra422_10bpp_neon: 155.2 99.5 > h264_h_loop_filter_chroma_intra_10bpp_c: 121.0 48.5 > h264_h_loop_filter_chroma_intra_10bpp_neon: 79.5 52.7 > h264_h_loop_filter_chroma_mbaff422_10bpp_c: 191.0 73.5 > h264_h_loop_filter_chroma_mbaff422_10bpp_neon: 121.2 75.5 > h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c: 117.0 51.5 > h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon: 79.5 53.7 > h264_h_loop_filter_chroma_mbaff_intra_10bpp_c: 63.0 28.5 > h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon: 48.7 33.2 > h264_v_loop_filter_chroma_10bpp_c: 260.2 135.5 > h264_v_loop_filter_chroma_10bpp_neon: 72.2 49.2 > h264_v_loop_filter_chroma_intra_10bpp_c: 158.0 70.7 > h264_v_loop_filter_chroma_intra_10bpp_neon: 48.7 32.0 > > Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com> > --- > > removed leftover code, moved from 32bit and started loading with two > alternating registers, code became quite a bit faster! > > libavcodec/aarch64/h264dsp_init_aarch64.c | 37 ++++ > libavcodec/aarch64/h264dsp_neon.S | 255 ++++++++++++++++++++++ > 2 files changed, 292 insertions(+) > > diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S > index 997082498f..80b7ed5ce1 100644 > --- a/libavcodec/aarch64/h264dsp_neon.S > +++ b/libavcodec/aarch64/h264dsp_neon.S > @@ -819,3 +819,258 @@ endfunc > weight_func 16 > weight_func 8 > weight_func 4 > + > +.macro h264_loop_filter_start_10 > + cmp w2, #0 > + ldr w6, [x4] > + ccmp w3, #0, #0, ne > + lsl w2, w2, #2 > + mov v24.S[0], w6 > + lsl w3, w3, #2 > + and w8, w6, w6, lsl #16 Nitpick: Align the third operand column on ccmp/lsl above with how it's done here for the 'and'. (Yes the existing code here seems to have the same misalignment.) > + b.eq 1f > + ands w8, w8, w8, lsl #8 > + b.ge 2f > +1: > + ret > +2: > +.endm > + > +.macro h264_loop_filter_start_intra_10 > + orr w4, w2, w3 > + cbnz w4, 1f > + ret > +1: > + lsl w2, w2, #2 > + lsl w3, w3, #2 > + dup v30.8h, w2 // alpha > + dup v31.8h, w3 // beta > +.endm > + > +.macro h264_loop_filter_chroma_10 > + dup v22.8h, w2 // alpha > + dup v23.8h, w3 // beta > + uxtl v24.8h, v24.8b // tc0 > + > + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0) > + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0) > + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0) > + cmhi v26.8h, v22.8h, v26.8h // < alpha > + cmhi v28.8h, v23.8h, v28.8h // < beta > + cmhi v30.8h, v23.8h, v30.8h // < beta > + > + and v26.16b, v26.16b, v28.16b > + mov v4.16b, v0.16b > + sub v4.8h, v4.8h, v16.8h > + and v26.16b, v26.16b, v30.16b > + shl v4.8h, v4.8h, #2 > + mov x8, v26.d[0] > + mov x9, v26.d[1] > + sli v24.8H, v24.8H, #8 > + uxtl v24.8H, v24.8B > + add v4.8h, v4.8h, v18.8h > + shl v24.8h, v24.8h, #2 > + > + adds x8, x8, x9 I think it would be better for in-order cores to do this 'adds' maybe a couple instructions earlier (but the 'mov' from SIMD to GPR probably takes a couple cycles, so not too far earlier), maybe one instruction earlier? > + b.eq 9f > + > + movi v31.8h, #3 // (tc0 - 1) << (BIT_DEPTH - 8)) + 1 I guess this 'movi' could be done before the 'b.eq' too? If we branch out, we'd have run it in vain, but it's probably essentially free in that case anyway, and avoids having the next 'uqsub' stalling, waiting for it. > + uqsub v24.8h, v24.8h, v31.8h > + sub v4.8h , v4.8h, v2.8h > + srshr v4.8h, v4.8h, #3 > + smin v4.8h, v4.8h, v24.8h > + neg v25.8h, v24.8h > + smax v4.8h, v4.8h, v25.8h > + and v4.16B, v4.16B, v26.16B > + add v16.8h, v16.8h, v4.8h > + sub v0.8h, v0.8h, v4.8h Nit: The vertical alignment is quite wobbly here, please try to make nice vertical lines if possible. > + > + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping > + movi v5.8h, #0 > + smin v0.8h, v0.8h, v4.8h > + smax v16.8h, v16.8h, v5.8h > + smax v0.8h, v0.8h, v5.8h > + smin v16.8h, v16.8h, v4.8h I think it'd be (marginally) better pipelining, and more consistent, if you'd do first two smin v4, then two smax v5. > +.endm > + > +function ff_h264_v_loop_filter_chroma_neon_10, export=1 > + h264_loop_filter_start_10 > + > + mov x10, x0 > + sub x0, x0, x1, lsl #1 > + ld1 {v18.8h}, [x0 ], x1 > + ld1 {v0.8h}, [x10], x1 > + ld1 {v16.8h}, [x0 ], x1 > + ld1 {v2.8h}, [x10] > + > + h264_loop_filter_chroma_10 > + > + sub x0, x10, x1, lsl #1 > + st1 {v16.8h}, [x0], x1 > + st1 {v0.8h}, [x0], x1 > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma_neon_10, export=1 > + h264_loop_filter_start_10 > + > + sub x0, x0, #4 // access the 2nd left pixel > +h_loop_filter_chroma420_10: > + add x10, x0, x1, lsl #2 > + ld1 {v18.d}[0], [x0 ], x1 > + ld1 {v18.d}[1], [x10], x1 > + ld1 {v16.d}[0], [x0 ], x1 > + ld1 {v16.d}[1], [x10], x1 > + ld1 {v0.d}[0], [x0 ], x1 > + ld1 {v0.d}[1], [x10], x1 > + ld1 {v2.d}[0], [x0 ], x1 > + ld1 {v2.d}[1], [x10], x1 > + > + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 > + > + h264_loop_filter_chroma_10 > + > + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 > + > + sub x0, x10, x1, lsl #3 > + st1 {v18.d}[0], [x0], x1 > + st1 {v16.d}[0], [x0], x1 > + st1 {v0.d}[0], [x0], x1 > + st1 {v2.d}[0], [x0], x1 > + st1 {v18.d}[1], [x0], x1 > + st1 {v16.d}[1], [x0], x1 > + st1 {v0.d}[1], [x0], x1 > + st1 {v2.d}[1], [x0], x1 > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma422_neon_10, export=1 > + h264_loop_filter_start_10 > + add x5, x0, x1 > + sub x0, x0, #4 > + add x1, x1, x1 > + mov x7, x30 > + bl h_loop_filter_chroma420_10 > + mov x30, x7 > + sub x0, x5, #4 > + mov v24.s[0], w6 > + b h_loop_filter_chroma420_10 > +endfunc > + > +.macro h264_loop_filter_chroma_intra_10 > + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0) > + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0) > + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0) > + cmhi v26.8h, v30.8h, v26.8h // < alpha > + cmhi v27.8h, v31.8h, v27.8h // < beta > + cmhi v28.8h, v31.8h, v28.8h // < beta > + and v26.16b, v26.16b, v27.16b > + and v26.16b, v26.16b, v28.16b > + mov x2, v26.d[0] > + mov x3, v26.d[1] > + > + shl v4.8h, v18.8h, #1 > + shl v6.8h, v19.8h, #1 > + > + adds x2, x2, x3 > + b.eq 9f > + > + add v20.8h, v16.8h, v19.8h > + add v22.8h, v17.8h, v18.8h > + add v20.8h, v20.8h, v4.8h > + add v22.8h, v22.8h, v6.8h > + urshr v24.8h, v20.8h, #2 > + urshr v25.8h, v22.8h, #2 > + bit v16.16b, v24.16b, v26.16b > + bit v17.16b, v25.16b, v26.16b > +.endm > + > +function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 > + h264_loop_filter_start_intra_10 > + mov x9, x0 > + sub x0, x0, x1, lsl #1 > + ld1 {v18.8h}, [x0], x1 > + ld1 {v17.8h}, [x9], x1 > + ld1 {v16.8h}, [x0], x1 > + ld1 {v19.8h}, [x9] > + > + h264_loop_filter_chroma_intra_10 > + > + sub x0, x9, x1, lsl #1 > + st1 {v16.8h}, [x0], x1 > + st1 {v17.8h}, [x0], x1 > + > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 > + h264_loop_filter_start_intra_10 > + > + sub x4, x0, #4 > + sub x0, x0, #2 > + add x9, x4, x1, lsl #1 > + ld1 {v18.8h}, [x4], x1 > + ld1 {v17.8h}, [x9], x1 > + ld1 {v16.8h}, [x4], x1 > + ld1 {v19.8h}, [x9], x1 > + > + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 > + > + h264_loop_filter_chroma_intra_10 > + > + st2 {v16.h,v17.h}[0], [x0], x1 > + st2 {v16.h,v17.h}[1], [x0], x1 > + st2 {v16.h,v17.h}[2], [x0], x1 > + st2 {v16.h,v17.h}[3], [x0], x1 > + > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 > + h264_loop_filter_start_intra_10 > + sub x4, x0, #4 > + sub x0, x0, #2 > +h_loop_filter_chroma420_intra_10: > + add x9, x4, x1, lsl #2 > + ld1 {v18.4h}, [x4], x1 > + ld1 {v18.d}[1], [x9], x1 > + ld1 {v16.4h}, [x4], x1 > + ld1 {v16.d}[1], [x9], x1 > + ld1 {v17.4h}, [x4], x1 > + ld1 {v17.d}[1], [x9], x1 > + ld1 {v19.4h}, [x4], x1 > + ld1 {v19.d}[1], [x9], x1 Try to align things here to make this bit at least a bit more readable. This looks pretty good now overall, just a bit minor cosmetic cleanup and maybe some minor instruction scheduling left. If you have access to run and benchmark things on an in-order core (like Cortex A53), that'd be good to do while trying to do the instruction scheduling tweaks to avoid stalls. // Martin
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c index cbaf8d31eb..6bf3ecb8a1 100644 --- a/libavcodec/aarch64/h264dsp_init_aarch64.c +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -83,6 +83,29 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[5 * 8]); +void ff_h264_v_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); +void ff_h264_h_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); +void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); + av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { @@ -125,5 +148,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, c->h264_idct8_add = ff_h264_idct8_add_neon; c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; c->h264_idct8_add4 = ff_h264_idct8_add4_neon; + } else if (have_neon(cpu_flags) && bit_depth == 10) { + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10; + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10; + + if (chroma_format_idc <= 1) { + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10; + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10; + } else { + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10; + c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10; + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10; + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; + } } } diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index 997082498f..80b7ed5ce1 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -819,3 +819,258 @@ endfunc weight_func 16 weight_func 8 weight_func 4 + +.macro h264_loop_filter_start_10 + cmp w2, #0 + ldr w6, [x4] + ccmp w3, #0, #0, ne + lsl w2, w2, #2 + mov v24.S[0], w6 + lsl w3, w3, #2 + and w8, w6, w6, lsl #16 + b.eq 1f + ands w8, w8, w8, lsl #8 + b.ge 2f +1: + ret +2: +.endm + +.macro h264_loop_filter_start_intra_10 + orr w4, w2, w3 + cbnz w4, 1f + ret +1: + lsl w2, w2, #2 + lsl w3, w3, #2 + dup v30.8h, w2 // alpha + dup v31.8h, w3 // beta +.endm + +.macro h264_loop_filter_chroma_10 + dup v22.8h, w2 // alpha + dup v23.8h, w3 // beta + uxtl v24.8h, v24.8b // tc0 + + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0) + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0) + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0) + cmhi v26.8h, v22.8h, v26.8h // < alpha + cmhi v28.8h, v23.8h, v28.8h // < beta + cmhi v30.8h, v23.8h, v30.8h // < beta + + and v26.16b, v26.16b, v28.16b + mov v4.16b, v0.16b + sub v4.8h, v4.8h, v16.8h + and v26.16b, v26.16b, v30.16b + shl v4.8h, v4.8h, #2 + mov x8, v26.d[0] + mov x9, v26.d[1] + sli v24.8H, v24.8H, #8 + uxtl v24.8H, v24.8B + add v4.8h, v4.8h, v18.8h + shl v24.8h, v24.8h, #2 + + adds x8, x8, x9 + b.eq 9f + + movi v31.8h, #3 // (tc0 - 1) << (BIT_DEPTH - 8)) + 1 + uqsub v24.8h, v24.8h, v31.8h + sub v4.8h , v4.8h, v2.8h + srshr v4.8h, v4.8h, #3 + smin v4.8h, v4.8h, v24.8h + neg v25.8h, v24.8h + smax v4.8h, v4.8h, v25.8h + and v4.16B, v4.16B, v26.16B + add v16.8h, v16.8h, v4.8h + sub v0.8h, v0.8h, v4.8h + + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping + movi v5.8h, #0 + smin v0.8h, v0.8h, v4.8h + smax v16.8h, v16.8h, v5.8h + smax v0.8h, v0.8h, v5.8h + smin v16.8h, v16.8h, v4.8h +.endm + +function ff_h264_v_loop_filter_chroma_neon_10, export=1 + h264_loop_filter_start_10 + + mov x10, x0 + sub x0, x0, x1, lsl #1 + ld1 {v18.8h}, [x0 ], x1 + ld1 {v0.8h}, [x10], x1 + ld1 {v16.8h}, [x0 ], x1 + ld1 {v2.8h}, [x10] + + h264_loop_filter_chroma_10 + + sub x0, x10, x1, lsl #1 + st1 {v16.8h}, [x0], x1 + st1 {v0.8h}, [x0], x1 +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_neon_10, export=1 + h264_loop_filter_start_10 + + sub x0, x0, #4 // access the 2nd left pixel +h_loop_filter_chroma420_10: + add x10, x0, x1, lsl #2 + ld1 {v18.d}[0], [x0 ], x1 + ld1 {v18.d}[1], [x10], x1 + ld1 {v16.d}[0], [x0 ], x1 + ld1 {v16.d}[1], [x10], x1 + ld1 {v0.d}[0], [x0 ], x1 + ld1 {v0.d}[1], [x10], x1 + ld1 {v2.d}[0], [x0 ], x1 + ld1 {v2.d}[1], [x10], x1 + + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 + + h264_loop_filter_chroma_10 + + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 + + sub x0, x10, x1, lsl #3 + st1 {v18.d}[0], [x0], x1 + st1 {v16.d}[0], [x0], x1 + st1 {v0.d}[0], [x0], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v18.d}[1], [x0], x1 + st1 {v16.d}[1], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma422_neon_10, export=1 + h264_loop_filter_start_10 + add x5, x0, x1 + sub x0, x0, #4 + add x1, x1, x1 + mov x7, x30 + bl h_loop_filter_chroma420_10 + mov x30, x7 + sub x0, x5, #4 + mov v24.s[0], w6 + b h_loop_filter_chroma420_10 +endfunc + +.macro h264_loop_filter_chroma_intra_10 + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0) + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0) + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0) + cmhi v26.8h, v30.8h, v26.8h // < alpha + cmhi v27.8h, v31.8h, v27.8h // < beta + cmhi v28.8h, v31.8h, v28.8h // < beta + and v26.16b, v26.16b, v27.16b + and v26.16b, v26.16b, v28.16b + mov x2, v26.d[0] + mov x3, v26.d[1] + + shl v4.8h, v18.8h, #1 + shl v6.8h, v19.8h, #1 + + adds x2, x2, x3 + b.eq 9f + + add v20.8h, v16.8h, v19.8h + add v22.8h, v17.8h, v18.8h + add v20.8h, v20.8h, v4.8h + add v22.8h, v22.8h, v6.8h + urshr v24.8h, v20.8h, #2 + urshr v25.8h, v22.8h, #2 + bit v16.16b, v24.16b, v26.16b + bit v17.16b, v25.16b, v26.16b +.endm + +function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + mov x9, x0 + sub x0, x0, x1, lsl #1 + ld1 {v18.8h}, [x0], x1 + ld1 {v17.8h}, [x9], x1 + ld1 {v16.8h}, [x0], x1 + ld1 {v19.8h}, [x9] + + h264_loop_filter_chroma_intra_10 + + sub x0, x9, x1, lsl #1 + st1 {v16.8h}, [x0], x1 + st1 {v17.8h}, [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + + sub x4, x0, #4 + sub x0, x0, #2 + add x9, x4, x1, lsl #1 + ld1 {v18.8h}, [x4], x1 + ld1 {v17.8h}, [x9], x1 + ld1 {v16.8h}, [x4], x1 + ld1 {v19.8h}, [x9], x1 + + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra_10 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + sub x4, x0, #4 + sub x0, x0, #2 +h_loop_filter_chroma420_intra_10: + add x9, x4, x1, lsl #2 + ld1 {v18.4h}, [x4], x1 + ld1 {v18.d}[1], [x9], x1 + ld1 {v16.4h}, [x4], x1 + ld1 {v16.d}[1], [x9], x1 + ld1 {v17.4h}, [x4], x1 + ld1 {v17.d}[1], [x9], x1 + ld1 {v19.4h}, [x4], x1 + ld1 {v19.d}[1], [x9], x1 + + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra_10 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + sub x4, x0, #4 + add x5, x0, x1, lsl #3 + sub x0, x0, #2 + mov x7, x30 + bl h_loop_filter_chroma420_intra_10 + mov x4, x9 + sub x0, x5, #2 + mov x30, x7 + b h_loop_filter_chroma420_intra_10 +endfunc
Benchmarks: A53 A72 h264_h_loop_filter_chroma422_10bpp_c: 277.5 114.2 h264_h_loop_filter_chroma422_10bpp_neon: 109.7 81.7 h264_h_loop_filter_chroma_10bpp_c: 165.0 75.5 h264_h_loop_filter_chroma_10bpp_neon: 121.2 74.7 h264_h_loop_filter_chroma_intra422_10bpp_c: 324.2 124.2 h264_h_loop_filter_chroma_intra422_10bpp_neon: 155.2 99.5 h264_h_loop_filter_chroma_intra_10bpp_c: 121.0 48.5 h264_h_loop_filter_chroma_intra_10bpp_neon: 79.5 52.7 h264_h_loop_filter_chroma_mbaff422_10bpp_c: 191.0 73.5 h264_h_loop_filter_chroma_mbaff422_10bpp_neon: 121.2 75.5 h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c: 117.0 51.5 h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon: 79.5 53.7 h264_h_loop_filter_chroma_mbaff_intra_10bpp_c: 63.0 28.5 h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon: 48.7 33.2 h264_v_loop_filter_chroma_10bpp_c: 260.2 135.5 h264_v_loop_filter_chroma_10bpp_neon: 72.2 49.2 h264_v_loop_filter_chroma_intra_10bpp_c: 158.0 70.7 h264_v_loop_filter_chroma_intra_10bpp_neon: 48.7 32.0 Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com> --- removed leftover code, moved from 32bit and started loading with two alternating registers, code became quite a bit faster! libavcodec/aarch64/h264dsp_init_aarch64.c | 37 ++++ libavcodec/aarch64/h264dsp_neon.S | 255 ++++++++++++++++++++++ 2 files changed, 292 insertions(+)