Message ID | 20210716175639.313513-2-mnitenko@gmail.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/2] lavc/aarch64: move transpose_4x8H to neon.S | expand |
Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
16 Jul 2021, 19:56 by mnitenko@gmail.com: > Benchmarks: A53 A72 > h264_h_loop_filter_chroma422_10bpp_c: 293.0 116.7 > h264_h_loop_filter_chroma422_10bpp_neon: 283.7 126.2 > h264_h_loop_filter_chroma_10bpp_c: 165.2 58.5 > h264_h_loop_filter_chroma_10bpp_neon: 74.7 87.2 > h264_h_loop_filter_chroma_intra422_10bpp_c: 246.2 124.5 > h264_h_loop_filter_chroma_intra422_10bpp_neon: 178.7 70.0 > h264_h_loop_filter_chroma_intra_10bpp_c: 121.0 40.5 > h264_h_loop_filter_chroma_intra_10bpp_neon: 73.7 59.2 > h264_h_loop_filter_chroma_mbaff422_10bpp_c: 145.7 72.7 > h264_h_loop_filter_chroma_mbaff422_10bpp_neon: 151.7 87.2 > h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c: 117.5 48.0 > h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon: 73.7 37.7 > h264_h_loop_filter_chroma_mbaff_intra_10bpp_c: 57.0 27.7 > h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon: 81.7 50.7 > h264_h_loop_filter_luma_intra_8bpp_c: 242.7 134.0 > h264_h_loop_filter_luma_intra_8bpp_neon: 100.7 53.5 > h264_v_loop_filter_chroma_10bpp_c: 257.2 138.5 > h264_v_loop_filter_chroma_10bpp_neon: 98.2 67.5 > h264_v_loop_filter_chroma_intra_10bpp_c: 158.0 76.2 > h264_v_loop_filter_chroma_intra_10bpp_neon: 62.7 36.5 > > Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com> > --- > > this code is a bit slow, particularly the horizontal versions, so any > suggestions are greatly appreciated! > > libavcodec/aarch64/h264dsp_init_aarch64.c | 29 +++ > libavcodec/aarch64/h264dsp_neon.S | 299 ++++++++++++++++++++++ > 2 files changed, 328 insertions(+) > > diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c > index d5baccf235..9ee9c11e15 100644 > --- a/libavcodec/aarch64/h264dsp_init_aarch64.c > +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c > @@ -83,6 +83,21 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, > int16_t *block, int stride, > const uint8_t nnzc[6*8]); > > +void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, > + int beta, int8_t *tc0); > +void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, > + int beta, int8_t *tc0); > +void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, > + int beta, int8_t *tc0); > +void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, > + int alpha, int beta); > +void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, > + int alpha, int beta); > +void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride, > + int alpha, int beta); > +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride, > + int alpha, int beta); > + > av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, > const int chroma_format_idc) > { > @@ -125,5 +140,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, > c->h264_idct8_add = ff_h264_idct8_add_neon; > c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; > c->h264_idct8_add4 = ff_h264_idct8_add4_neon; > + } else if (have_neon(cpu_flags) && bit_depth == 10) { > + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10; > + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10; > + > + if (chroma_format_idc <= 1) { > + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10; > + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; > + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10; > + } else { > + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10; > + c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10; > + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10; > + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; > + } > } > } > diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S > index fbb8ecc463..92e5afa524 100644 > --- a/libavcodec/aarch64/h264dsp_neon.S > +++ b/libavcodec/aarch64/h264dsp_neon.S > @@ -827,3 +827,302 @@ endfunc > weight_func 16 > weight_func 8 > weight_func 4 > + > +.macro h264_loop_filter_start_10 > + cmp w2, #0 > + ldr w6, [x4] > + ccmp w3, #0, #0, ne > + lsl w2, w2, #2 // shift needed for 10bit > + mov v24.S[0], w6 > + lsl w3, w3, #2 > + and w8, w6, w6, lsl #16 > + b.eq 1f > + cmp w6, #0 > + b.eq 1f > + ands w8, w8, w8, lsl #8 > + b.ge 2f > +1: > + ret > +2: > +.endm > + > +.macro h264_loop_filter_start_intra_10 > + orr w4, w2, w3 > + cbnz w4, 1f > + ret > +1: > + sxtw x1, w1 > + lsl w2, w2, #2 // shift needed for 10bit > + lsl w3, w3, #2 // shift needed for 10bit > + dup v30.8h, w2 // alpha > + dup v31.8h, w3 // beta > +.endm > + > +.macro h264_loop_filter_chroma_10 > + dup v22.8h, w2 // alpha > + dup v23.8h, w3 // beta > + uxtl v24.8h, v24.8b // tc0 > + > + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0) > + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0) > + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0) > + > + cmhi v26.8h, v22.8h, v26.8h // < alpha > + cmhi v28.8h, v23.8h, v28.8h // < beta > + cmhi v30.8h, v23.8h, v30.8h // < beta > + > + uxtl v4.4s, v0.4h > + uxtl2 v5.4s, v0.8h > + > + and v26.16b, v26.16b, v28.16b > + > + usubw v4.4s, v4.4s, v16.4h > + usubw2 v5.4s, v5.4s, v16.8h > + > + and v26.16b, v26.16b, v30.16b > + > + shl v4.4s, v4.4s, #2 > + shl v5.4s, v5.4s, #2 > + > + mov x8, v26.d[0] > + mov x9, v26.d[1] > + orr x8, x8, x9 > + > + sli v24.8H, v24.8H, #8 > + uxtl v24.8H, v24.8B > + uaddw v4.4s, v4.4s, v18.4h > + uaddw2 v5.4s, v5.4s, v18.8h // add p1 > + > + cbz x8, 9f > + > + usubw v4.4s, v4.4s, v2.4h > + usubw2 v5.4s, v5.4s, v2.8h // sub q1 > + rshrn v4.4h, v4.4s, #3 > + rshrn2 v4.8h, v5.4s, #3 > + > + mov w8, #1 > + dup v31.8h, w8 // this is actually important for higher depths, but not needed in 8 bit > + sub v24.8h, v24.8h, v31.8h > + shl v24.8h, v24.8h, #2 > + add v24.8h, v24.8h, v31.8h > + mov w8, #0 > + dup v31.8h, w8 > + smax v24.8h, v24.8h, v31.8h // this all feels like a huge hack (needed to exclude neg values) > + > + smin v4.8h, v4.8h, v24.8h > + neg v25.8h, v24.8h > + smax v4.8h, v4.8h, v25.8h > + > + uxtl v22.4s, v0.4h > + uxtl2 v23.4s, v0.8h > + > + and v4.16B, v4.16B, v26.16B > + > + uxtl v28.4s, v16.4h > + uxtl2 v29.4s, v16.8h > + > + saddw v28.4s, v28.4s, v4.4h > + saddw2 v29.4s, v29.4s, v4.8h > + > + ssubw v22.4s, v22.4s, v4.4h > + ssubw2 v23.4s, v23.4s, v4.8h > + > + sqxtun v16.4h, v28.4s > + sqxtun2 v16.8h, v29.4s > + > + sqxtun v0.4h, v22.4s > + sqxtun2 v0.8h, v23.4s > + > + mov w2, #1023 // for clipping > + dup v4.8h, w2 > + smin v0.8h, v0.8h, v4.8h > + smin v16.8h, v16.8h, v4.8h > +.endm > + > +function ff_h264_v_loop_filter_chroma_neon_10, export=1 > + h264_loop_filter_start_10 > + sxtw x1, w1 > + > + sub x0, x0, x1, lsl #1 > + ld1 {v18.8h}, [x0], x1 > + ld1 {v16.8h}, [x0], x1 > + ld1 {v0.8h}, [x0], x1 > + ld1 {v2.8h}, [x0] > + > + h264_loop_filter_chroma_10 > + > + sub x0, x0, x1, lsl #1 > + st1 {v16.8h}, [x0], x1 > + st1 {v0.8h}, [x0], x1 > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma_neon_10, export=1 > + h264_loop_filter_start_10 > + sxtw x1, w1 > + > + sub x0, x0, #4 > +h_loop_filter_chroma420_10: > + ld1 {v18.d}[0], [x0], x1 > + ld1 {v16.d}[0], [x0], x1 > + ld1 {v0.d}[0], [x0], x1 > + ld1 {v2.d}[0], [x0], x1 > + ld1 {v18.d}[1], [x0], x1 > + ld1 {v16.d}[1], [x0], x1 > + ld1 {v0.d}[1], [x0], x1 > + ld1 {v2.d}[1], [x0], x1 > + > + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 > + > + h264_loop_filter_chroma_10 > + > + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 > + > + sub x0, x0, x1, lsl #3 > + st1 {v18.d}[0], [x0], x1 > + st1 {v16.d}[0], [x0], x1 > + st1 {v0.d}[0], [x0], x1 > + st1 {v2.d}[0], [x0], x1 > + st1 {v18.d}[1], [x0], x1 > + st1 {v16.d}[1], [x0], x1 > + st1 {v0.d}[1], [x0], x1 > + st1 {v2.d}[1], [x0], x1 > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma422_neon_10, export=1 > + sxtw x1, w1 > + h264_loop_filter_start_10 > + add x5, x0, x1 > + sub x0, x0, #4 > + add x1, x1, x1 > + mov x7, x30 > + bl h_loop_filter_chroma420_10 > + mov x30, x7 > + sub x0, x5, #4 > + mov v24.s[0], w6 > + b h_loop_filter_chroma420_10 > +endfunc > + > +.macro h264_loop_filter_chroma_intra_10 > + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0) > + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0) > + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0) > + cmhi v26.8h, v30.8h, v26.8h // < alpha > + cmhi v27.8h, v31.8h, v27.8h // < beta > + cmhi v28.8h, v31.8h, v28.8h // < beta > + and v26.16b, v26.16b, v27.16b > + and v26.16b, v26.16b, v28.16b > + mov x2, v26.d[0] > + mov x3, v26.d[1] > + orr x2, x2, x3 > + > + ushll v4.4s, v18.4h, #1 > + ushll2 v5.4s, v18.8h, #1 > + ushll v6.4s, v19.4h, #1 > + ushll2 v7.4s, v19.8h, #1 > + > + cbz x2, 9f > + > + uaddl v20.4s, v16.4h, v19.4h > + uaddl2 v21.4s, v16.8h, v19.8h > + uaddl v22.4s, v17.4h, v18.4h > + uaddl2 v23.4s, v17.8h, v18.8h > + add v20.4s, v20.4s, v4.4s > + add v21.4s, v21.4s, v5.4s > + add v22.4s, v22.4s, v6.4s > + add v23.4s, v23.4s, v7.4s > + uqrshrn v24.4h, v20.4s, #2 > + uqrshrn2 v24.8h, v21.4s, #2 > + uqrshrn v25.4h, v22.4s, #2 > + uqrshrn2 v25.8h, v23.4s, #2 > + bit v16.16b, v24.16b, v26.16b > + bit v17.16b, v25.16b, v26.16b > +.endm > + > +function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 > + h264_loop_filter_start_intra_10 > + sub x0, x0, x1, lsl #1 > + ld1 {v18.8h}, [x0], x1 > + ld1 {v16.8h}, [x0], x1 > + ld1 {v17.8h}, [x0], x1 > + ld1 {v19.8h}, [x0] > + > + h264_loop_filter_chroma_intra_10 > + > + sub x0, x0, x1, lsl #1 > + st1 {v16.8h}, [x0], x1 > + st1 {v17.8h}, [x0], x1 > + > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 > + h264_loop_filter_start_intra_10 > + > + sub x4, x0, #4 > + sub x0, x0, #2 > + ld1 {v18.8h}, [x4], x1 > + ld1 {v16.8h}, [x4], x1 > + ld1 {v17.8h}, [x4], x1 > + ld1 {v19.8h}, [x4], x1 > + > + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 > + > + h264_loop_filter_chroma_intra_10 > + > + st2 {v16.h,v17.h}[0], [x0], x1 > + st2 {v16.h,v17.h}[1], [x0], x1 > + st2 {v16.h,v17.h}[2], [x0], x1 > + st2 {v16.h,v17.h}[3], [x0], x1 > + > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 > + h264_loop_filter_start_intra_10 > + > + sub x4, x0, #4 > + sub x0, x0, #2 > +h_loop_filter_chroma420_intra_10: > + ld1 {v18.8h}, [x4], x1 > + ld1 {v16.8h}, [x4], x1 > + ld1 {v17.8h}, [x4], x1 > + ld1 {v19.8h}, [x4], x1 > + ld1 {v18.d}[1], [x4], x1 > + ld1 {v16.d}[1], [x4], x1 > + ld1 {v17.d}[1], [x4], x1 > + ld1 {v19.d}[1], [x4], x1 > + > + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 > + > + h264_loop_filter_chroma_intra_10 > + > + st2 {v16.h,v17.h}[0], [x0], x1 > + st2 {v16.h,v17.h}[1], [x0], x1 > + st2 {v16.h,v17.h}[2], [x0], x1 > + st2 {v16.h,v17.h}[3], [x0], x1 > + st2 {v16.h,v17.h}[4], [x0], x1 > + st2 {v16.h,v17.h}[5], [x0], x1 > + st2 {v16.h,v17.h}[6], [x0], x1 > + st2 {v16.h,v17.h}[7], [x0], x1 > + > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1 > + h264_loop_filter_start_intra_10 > + sub x4, x0, #4 > + add x5, x0, x1, lsl #3 > + sub x0, x0, #2 > + mov x7, x30 > + bl h_loop_filter_chroma420_intra_10 > + sub x0, x5, #2 > + mov x30, x7 > + b h_loop_filter_chroma420_intra_10 > +endfunc > I think you have enough spare registers to unroll h_loop_filter_chroma420_intra_10 twice. That should speed up ff_h264_h_loop_filter_chroma422_intra_neon_10 by a lot, since with aarch64 an unused register is a wasted register.
On Fri, 16 Jul 2021, Mikhail Nitenko wrote: > Benchmarks: A53 A72 > h264_h_loop_filter_chroma422_10bpp_c: 293.0 116.7 > h264_h_loop_filter_chroma422_10bpp_neon: 283.7 126.2 > h264_h_loop_filter_chroma_10bpp_c: 165.2 58.5 > h264_h_loop_filter_chroma_10bpp_neon: 74.7 87.2 > h264_h_loop_filter_chroma_intra422_10bpp_c: 246.2 124.5 > h264_h_loop_filter_chroma_intra422_10bpp_neon: 178.7 70.0 > h264_h_loop_filter_chroma_intra_10bpp_c: 121.0 40.5 > h264_h_loop_filter_chroma_intra_10bpp_neon: 73.7 59.2 > h264_h_loop_filter_chroma_mbaff422_10bpp_c: 145.7 72.7 > h264_h_loop_filter_chroma_mbaff422_10bpp_neon: 151.7 87.2 > h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c: 117.5 48.0 > h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon: 73.7 37.7 > h264_h_loop_filter_chroma_mbaff_intra_10bpp_c: 57.0 27.7 > h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon: 81.7 50.7 > h264_h_loop_filter_luma_intra_8bpp_c: 242.7 134.0 > h264_h_loop_filter_luma_intra_8bpp_neon: 100.7 53.5 > h264_v_loop_filter_chroma_10bpp_c: 257.2 138.5 > h264_v_loop_filter_chroma_10bpp_neon: 98.2 67.5 > h264_v_loop_filter_chroma_intra_10bpp_c: 158.0 76.2 > h264_v_loop_filter_chroma_intra_10bpp_neon: 62.7 36.5 > > Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com> > --- > > this code is a bit slow, particularly the horizontal versions, so any > suggestions are greatly appreciated! > > libavcodec/aarch64/h264dsp_init_aarch64.c | 29 +++ > libavcodec/aarch64/h264dsp_neon.S | 299 ++++++++++++++++++++++ > 2 files changed, 328 insertions(+) > > diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c > index d5baccf235..9ee9c11e15 100644 > --- a/libavcodec/aarch64/h264dsp_init_aarch64.c > +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c > @@ -83,6 +83,21 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, > int16_t *block, int stride, > const uint8_t nnzc[6*8]); > > +void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, > + int beta, int8_t *tc0); > +void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, > + int beta, int8_t *tc0); > +void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, > + int beta, int8_t *tc0); > +void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, > + int alpha, int beta); > +void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, > + int alpha, int beta); > +void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride, > + int alpha, int beta); > +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride, > + int alpha, int beta); > + > av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, > const int chroma_format_idc) > { > @@ -125,5 +140,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, > c->h264_idct8_add = ff_h264_idct8_add_neon; > c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; > c->h264_idct8_add4 = ff_h264_idct8_add4_neon; > + } else if (have_neon(cpu_flags) && bit_depth == 10) { > + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10; > + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10; > + > + if (chroma_format_idc <= 1) { > + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10; > + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; > + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10; > + } else { > + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10; > + c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10; > + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10; > + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; > + } > } > } > diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S > index fbb8ecc463..92e5afa524 100644 > --- a/libavcodec/aarch64/h264dsp_neon.S > +++ b/libavcodec/aarch64/h264dsp_neon.S > @@ -827,3 +827,302 @@ endfunc > weight_func 16 > weight_func 8 > weight_func 4 > + > +.macro h264_loop_filter_start_10 > + cmp w2, #0 > + ldr w6, [x4] > + ccmp w3, #0, #0, ne > + lsl w2, w2, #2 // shift needed for 10bit > + mov v24.S[0], w6 > + lsl w3, w3, #2 Please indent the operand columns in the lsl instructions similarly to the rest > + and w8, w6, w6, lsl #16 > + b.eq 1f > + cmp w6, #0 > + b.eq 1f This extra cmp and b.eq isn't present in the 8 bit version. Why is it needed here? I.e. is this a typo or legitimate? > + ands w8, w8, w8, lsl #8 > + b.ge 2f > +1: > + ret > +2: > +.endm > + > +.macro h264_loop_filter_start_intra_10 > + orr w4, w2, w3 > + cbnz w4, 1f > + ret > +1: > + sxtw x1, w1 This sxtw (and all the other ones) are unneeded, as the stride argument is ptrdiff_t these days (it was int originally); I sent a separate patch to fix this for the existing 8 bit functions. > + lsl w2, w2, #2 // shift needed for 10bit > + lsl w3, w3, #2 // shift needed for 10bit > + dup v30.8h, w2 // alpha > + dup v31.8h, w3 // beta > +.endm > + > +.macro h264_loop_filter_chroma_10 > + dup v22.8h, w2 // alpha > + dup v23.8h, w3 // beta > + uxtl v24.8h, v24.8b // tc0 > + > + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0) > + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0) > + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0) > + > + cmhi v26.8h, v22.8h, v26.8h // < alpha > + cmhi v28.8h, v23.8h, v28.8h // < beta > + cmhi v30.8h, v23.8h, v30.8h // < beta > + > + uxtl v4.4s, v0.4h > + uxtl2 v5.4s, v0.8h > + > + and v26.16b, v26.16b, v28.16b > + > + usubw v4.4s, v4.4s, v16.4h > + usubw2 v5.4s, v5.4s, v16.8h I'm not sure if this step here requires being done with 32 bit elements; at least this far (the difference of two pixels), it only requires a signed 11 bit number, so it shouldn't require lengthening? > + > + and v26.16b, v26.16b, v30.16b > + > + shl v4.4s, v4.4s, #2 > + shl v5.4s, v5.4s, #2 > + > + mov x8, v26.d[0] > + mov x9, v26.d[1] > + orr x8, x8, x9 > + > + sli v24.8H, v24.8H, #8 > + uxtl v24.8H, v24.8B > + uaddw v4.4s, v4.4s, v18.4h > + uaddw2 v5.4s, v5.4s, v18.8h // add p1 > + > + cbz x8, 9f > + > + usubw v4.4s, v4.4s, v2.4h > + usubw2 v5.4s, v5.4s, v2.8h // sub q1 > + rshrn v4.4h, v4.4s, #3 > + rshrn2 v4.8h, v5.4s, #3 > + > + mov w8, #1 > + dup v31.8h, w8 // this is actually important for higher depths, but not needed in 8 bit You can use 'movi' instead of mov+dup. And instead of having a comment saying it's important, I'd rather see the comment explain what it does or why - you could e.g. have some comment refer to 'tc' > + sub v24.8h, v24.8h, v31.8h > + shl v24.8h, v24.8h, #2 > + add v24.8h, v24.8h, v31.8h So this does v24 = ((v24 - 1) << 2) + 1. Isn't that equivalent to v24 = (v24 << 2) - 3? > + mov w8, #0 > + dup v31.8h, w8 > + smax v24.8h, v24.8h, v31.8h // this all feels like a huge hack (needed to exclude neg values) You might want to use 'usqadd' instead of the add/sub + smax here. You could also start calculating v24 a bit earlier, interleaved with where v4/v5 is finished > + > + smin v4.8h, v4.8h, v24.8h > + neg v25.8h, v24.8h > + smax v4.8h, v4.8h, v25.8h > + > + uxtl v22.4s, v0.4h > + uxtl2 v23.4s, v0.8h None of this should require 32 bit elements > + > + and v4.16B, v4.16B, v26.16B > + > + uxtl v28.4s, v16.4h > + uxtl2 v29.4s, v16.8h > + > + saddw v28.4s, v28.4s, v4.4h > + saddw2 v29.4s, v29.4s, v4.8h > + > + ssubw v22.4s, v22.4s, v4.4h > + ssubw2 v23.4s, v23.4s, v4.8h > + > + sqxtun v16.4h, v28.4s > + sqxtun2 v16.8h, v29.4s > + > + sqxtun v0.4h, v22.4s > + sqxtun2 v0.8h, v23.4s > + > + mov w2, #1023 // for clipping > + dup v4.8h, w2 mvni #0xFC, lsl #8 And if possible, load your constants a bit earlier to avoid stalls > + smin v0.8h, v0.8h, v4.8h > + smin v16.8h, v16.8h, v4.8h > +.endm > + > +function ff_h264_v_loop_filter_chroma_neon_10, export=1 > + h264_loop_filter_start_10 > + sxtw x1, w1 > + > + sub x0, x0, x1, lsl #1 > + ld1 {v18.8h}, [x0], x1 > + ld1 {v16.8h}, [x0], x1 > + ld1 {v0.8h}, [x0], x1 > + ld1 {v2.8h}, [x0] > + > + h264_loop_filter_chroma_10 > + > + sub x0, x0, x1, lsl #1 > + st1 {v16.8h}, [x0], x1 > + st1 {v0.8h}, [x0], x1 > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma_neon_10, export=1 > + h264_loop_filter_start_10 > + sxtw x1, w1 > + > + sub x0, x0, #4 > +h_loop_filter_chroma420_10: > + ld1 {v18.d}[0], [x0], x1 > + ld1 {v16.d}[0], [x0], x1 > + ld1 {v0.d}[0], [x0], x1 > + ld1 {v2.d}[0], [x0], x1 > + ld1 {v18.d}[1], [x0], x1 > + ld1 {v16.d}[1], [x0], x1 > + ld1 {v0.d}[1], [x0], x1 > + ld1 {v2.d}[1], [x0], x1 If the horizontal versions of these functions are slow, I'd recommend try loading using two alternating registers, like is done in all the vp9 loop filter functions. > + > + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 > + > + h264_loop_filter_chroma_10 > + > + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 > + > + sub x0, x0, x1, lsl #3 > + st1 {v18.d}[0], [x0], x1 > + st1 {v16.d}[0], [x0], x1 > + st1 {v0.d}[0], [x0], x1 > + st1 {v2.d}[0], [x0], x1 > + st1 {v18.d}[1], [x0], x1 > + st1 {v16.d}[1], [x0], x1 > + st1 {v0.d}[1], [x0], x1 > + st1 {v2.d}[1], [x0], x1 > +9: > + ret > +endfunc > + > +function ff_h264_h_loop_filter_chroma422_neon_10, export=1 > + sxtw x1, w1 > + h264_loop_filter_start_10 > + add x5, x0, x1 > + sub x0, x0, #4 > + add x1, x1, x1 > + mov x7, x30 > + bl h_loop_filter_chroma420_10 > + mov x30, x7 > + sub x0, x5, #4 > + mov v24.s[0], w6 > + b h_loop_filter_chroma420_10 > +endfunc > + > +.macro h264_loop_filter_chroma_intra_10 > + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0) > + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0) > + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0) This uses a completely different indentation than the rest, please match the existing style. I'll send a patch to reindent these bits in the existing code. > + cmhi v26.8h, v30.8h, v26.8h // < alpha > + cmhi v27.8h, v31.8h, v27.8h // < beta > + cmhi v28.8h, v31.8h, v28.8h // < beta > + and v26.16b, v26.16b, v27.16b > + and v26.16b, v26.16b, v28.16b > + mov x2, v26.d[0] > + mov x3, v26.d[1] > + orr x2, x2, x3 > + > + ushll v4.4s, v18.4h, #1 > + ushll2 v5.4s, v18.8h, #1 I'd recommend moving the 'orr' from above to here instead, to avoid stalls. You could also consider using 'adds' (there's no 'orrs', but for these cases it should work just fine) with b.eq instead of 'orr' + 'cbz'; see how that's done in the vp9 code. And again, 32 bit elements shouldn't be needed here. // Martin
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c index d5baccf235..9ee9c11e15 100644 --- a/libavcodec/aarch64/h264dsp_init_aarch64.c +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -83,6 +83,21 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); + av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) { @@ -125,5 +140,19 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, c->h264_idct8_add = ff_h264_idct8_add_neon; c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; c->h264_idct8_add4 = ff_h264_idct8_add4_neon; + } else if (have_neon(cpu_flags) && bit_depth == 10) { + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10; + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10; + + if (chroma_format_idc <= 1) { + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10; + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10; + } else { + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10; + c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10; + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10; + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; + } } } diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index fbb8ecc463..92e5afa524 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -827,3 +827,302 @@ endfunc weight_func 16 weight_func 8 weight_func 4 + +.macro h264_loop_filter_start_10 + cmp w2, #0 + ldr w6, [x4] + ccmp w3, #0, #0, ne + lsl w2, w2, #2 // shift needed for 10bit + mov v24.S[0], w6 + lsl w3, w3, #2 + and w8, w6, w6, lsl #16 + b.eq 1f + cmp w6, #0 + b.eq 1f + ands w8, w8, w8, lsl #8 + b.ge 2f +1: + ret +2: +.endm + +.macro h264_loop_filter_start_intra_10 + orr w4, w2, w3 + cbnz w4, 1f + ret +1: + sxtw x1, w1 + lsl w2, w2, #2 // shift needed for 10bit + lsl w3, w3, #2 // shift needed for 10bit + dup v30.8h, w2 // alpha + dup v31.8h, w3 // beta +.endm + +.macro h264_loop_filter_chroma_10 + dup v22.8h, w2 // alpha + dup v23.8h, w3 // beta + uxtl v24.8h, v24.8b // tc0 + + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0) + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0) + uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0) + + cmhi v26.8h, v22.8h, v26.8h // < alpha + cmhi v28.8h, v23.8h, v28.8h // < beta + cmhi v30.8h, v23.8h, v30.8h // < beta + + uxtl v4.4s, v0.4h + uxtl2 v5.4s, v0.8h + + and v26.16b, v26.16b, v28.16b + + usubw v4.4s, v4.4s, v16.4h + usubw2 v5.4s, v5.4s, v16.8h + + and v26.16b, v26.16b, v30.16b + + shl v4.4s, v4.4s, #2 + shl v5.4s, v5.4s, #2 + + mov x8, v26.d[0] + mov x9, v26.d[1] + orr x8, x8, x9 + + sli v24.8H, v24.8H, #8 + uxtl v24.8H, v24.8B + uaddw v4.4s, v4.4s, v18.4h + uaddw2 v5.4s, v5.4s, v18.8h // add p1 + + cbz x8, 9f + + usubw v4.4s, v4.4s, v2.4h + usubw2 v5.4s, v5.4s, v2.8h // sub q1 + rshrn v4.4h, v4.4s, #3 + rshrn2 v4.8h, v5.4s, #3 + + mov w8, #1 + dup v31.8h, w8 // this is actually important for higher depths, but not needed in 8 bit + sub v24.8h, v24.8h, v31.8h + shl v24.8h, v24.8h, #2 + add v24.8h, v24.8h, v31.8h + mov w8, #0 + dup v31.8h, w8 + smax v24.8h, v24.8h, v31.8h // this all feels like a huge hack (needed to exclude neg values) + + smin v4.8h, v4.8h, v24.8h + neg v25.8h, v24.8h + smax v4.8h, v4.8h, v25.8h + + uxtl v22.4s, v0.4h + uxtl2 v23.4s, v0.8h + + and v4.16B, v4.16B, v26.16B + + uxtl v28.4s, v16.4h + uxtl2 v29.4s, v16.8h + + saddw v28.4s, v28.4s, v4.4h + saddw2 v29.4s, v29.4s, v4.8h + + ssubw v22.4s, v22.4s, v4.4h + ssubw2 v23.4s, v23.4s, v4.8h + + sqxtun v16.4h, v28.4s + sqxtun2 v16.8h, v29.4s + + sqxtun v0.4h, v22.4s + sqxtun2 v0.8h, v23.4s + + mov w2, #1023 // for clipping + dup v4.8h, w2 + smin v0.8h, v0.8h, v4.8h + smin v16.8h, v16.8h, v4.8h +.endm + +function ff_h264_v_loop_filter_chroma_neon_10, export=1 + h264_loop_filter_start_10 + sxtw x1, w1 + + sub x0, x0, x1, lsl #1 + ld1 {v18.8h}, [x0], x1 + ld1 {v16.8h}, [x0], x1 + ld1 {v0.8h}, [x0], x1 + ld1 {v2.8h}, [x0] + + h264_loop_filter_chroma_10 + + sub x0, x0, x1, lsl #1 + st1 {v16.8h}, [x0], x1 + st1 {v0.8h}, [x0], x1 +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_neon_10, export=1 + h264_loop_filter_start_10 + sxtw x1, w1 + + sub x0, x0, #4 +h_loop_filter_chroma420_10: + ld1 {v18.d}[0], [x0], x1 + ld1 {v16.d}[0], [x0], x1 + ld1 {v0.d}[0], [x0], x1 + ld1 {v2.d}[0], [x0], x1 + ld1 {v18.d}[1], [x0], x1 + ld1 {v16.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 + + h264_loop_filter_chroma_10 + + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 + + sub x0, x0, x1, lsl #3 + st1 {v18.d}[0], [x0], x1 + st1 {v16.d}[0], [x0], x1 + st1 {v0.d}[0], [x0], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v18.d}[1], [x0], x1 + st1 {v16.d}[1], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma422_neon_10, export=1 + sxtw x1, w1 + h264_loop_filter_start_10 + add x5, x0, x1 + sub x0, x0, #4 + add x1, x1, x1 + mov x7, x30 + bl h_loop_filter_chroma420_10 + mov x30, x7 + sub x0, x5, #4 + mov v24.s[0], w6 + b h_loop_filter_chroma420_10 +endfunc + +.macro h264_loop_filter_chroma_intra_10 + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0) + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0) + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0) + cmhi v26.8h, v30.8h, v26.8h // < alpha + cmhi v27.8h, v31.8h, v27.8h // < beta + cmhi v28.8h, v31.8h, v28.8h // < beta + and v26.16b, v26.16b, v27.16b + and v26.16b, v26.16b, v28.16b + mov x2, v26.d[0] + mov x3, v26.d[1] + orr x2, x2, x3 + + ushll v4.4s, v18.4h, #1 + ushll2 v5.4s, v18.8h, #1 + ushll v6.4s, v19.4h, #1 + ushll2 v7.4s, v19.8h, #1 + + cbz x2, 9f + + uaddl v20.4s, v16.4h, v19.4h + uaddl2 v21.4s, v16.8h, v19.8h + uaddl v22.4s, v17.4h, v18.4h + uaddl2 v23.4s, v17.8h, v18.8h + add v20.4s, v20.4s, v4.4s + add v21.4s, v21.4s, v5.4s + add v22.4s, v22.4s, v6.4s + add v23.4s, v23.4s, v7.4s + uqrshrn v24.4h, v20.4s, #2 + uqrshrn2 v24.8h, v21.4s, #2 + uqrshrn v25.4h, v22.4s, #2 + uqrshrn2 v25.8h, v23.4s, #2 + bit v16.16b, v24.16b, v26.16b + bit v17.16b, v25.16b, v26.16b +.endm + +function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + sub x0, x0, x1, lsl #1 + ld1 {v18.8h}, [x0], x1 + ld1 {v16.8h}, [x0], x1 + ld1 {v17.8h}, [x0], x1 + ld1 {v19.8h}, [x0] + + h264_loop_filter_chroma_intra_10 + + sub x0, x0, x1, lsl #1 + st1 {v16.8h}, [x0], x1 + st1 {v17.8h}, [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + + sub x4, x0, #4 + sub x0, x0, #2 + ld1 {v18.8h}, [x4], x1 + ld1 {v16.8h}, [x4], x1 + ld1 {v17.8h}, [x4], x1 + ld1 {v19.8h}, [x4], x1 + + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra_10 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + + sub x4, x0, #4 + sub x0, x0, #2 +h_loop_filter_chroma420_intra_10: + ld1 {v18.8h}, [x4], x1 + ld1 {v16.8h}, [x4], x1 + ld1 {v17.8h}, [x4], x1 + ld1 {v19.8h}, [x4], x1 + ld1 {v18.d}[1], [x4], x1 + ld1 {v16.d}[1], [x4], x1 + ld1 {v17.d}[1], [x4], x1 + ld1 {v19.d}[1], [x4], x1 + + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra_10 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + sub x4, x0, #4 + add x5, x0, x1, lsl #3 + sub x0, x0, #2 + mov x7, x30 + bl h_loop_filter_chroma420_intra_10 + sub x0, x5, #2 + mov x30, x7 + b h_loop_filter_chroma420_intra_10 +endfunc
Benchmarks: A53 A72 h264_h_loop_filter_chroma422_10bpp_c: 293.0 116.7 h264_h_loop_filter_chroma422_10bpp_neon: 283.7 126.2 h264_h_loop_filter_chroma_10bpp_c: 165.2 58.5 h264_h_loop_filter_chroma_10bpp_neon: 74.7 87.2 h264_h_loop_filter_chroma_intra422_10bpp_c: 246.2 124.5 h264_h_loop_filter_chroma_intra422_10bpp_neon: 178.7 70.0 h264_h_loop_filter_chroma_intra_10bpp_c: 121.0 40.5 h264_h_loop_filter_chroma_intra_10bpp_neon: 73.7 59.2 h264_h_loop_filter_chroma_mbaff422_10bpp_c: 145.7 72.7 h264_h_loop_filter_chroma_mbaff422_10bpp_neon: 151.7 87.2 h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c: 117.5 48.0 h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon: 73.7 37.7 h264_h_loop_filter_chroma_mbaff_intra_10bpp_c: 57.0 27.7 h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon: 81.7 50.7 h264_h_loop_filter_luma_intra_8bpp_c: 242.7 134.0 h264_h_loop_filter_luma_intra_8bpp_neon: 100.7 53.5 h264_v_loop_filter_chroma_10bpp_c: 257.2 138.5 h264_v_loop_filter_chroma_10bpp_neon: 98.2 67.5 h264_v_loop_filter_chroma_intra_10bpp_c: 158.0 76.2 h264_v_loop_filter_chroma_intra_10bpp_neon: 62.7 36.5 Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com> --- this code is a bit slow, particularly the horizontal versions, so any suggestions are greatly appreciated! libavcodec/aarch64/h264dsp_init_aarch64.c | 29 +++ libavcodec/aarch64/h264dsp_neon.S | 299 ++++++++++++++++++++++ 2 files changed, 328 insertions(+)