Message ID | 20210816102918.464463-1-mnitenko@gmail.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v2] lavc/aarch64: add pred functions for 10-bit | expand |
Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
On Mon, 16 Aug 2021, Mikhail Nitenko wrote: > Benchmarks: A53 A72 > pred8x8_dc_10_c: 64.2 55.7 > pred8x8_dc_10_neon: 61.7 53.7 > pred8x8_dc_128_10_c: 26.0 20.7 > pred8x8_dc_128_10_neon: 30.7 24.5 > pred8x8_horizontal_10_c: 60.0 35.2 > pred8x8_horizontal_10_neon: 38.0 33.0 > pred8x8_left_dc_10_c: 42.5 35.5 > pred8x8_left_dc_10_neon: 50.7 41.5 > pred8x8_mad_cow_dc_0l0_10_c: 55.7 44.7 > pred8x8_mad_cow_dc_0l0_10_neon: 47.5 37.2 > pred8x8_mad_cow_dc_0lt_10_c: 89.2 75.5 > pred8x8_mad_cow_dc_0lt_10_neon: 52.2 47.0 > pred8x8_mad_cow_dc_l0t_10_c: 74.7 59.2 > pred8x8_mad_cow_dc_l0t_10_neon: 50.5 44.7 > pred8x8_mad_cow_dc_l00_10_c: 58.0 45.7 > pred8x8_mad_cow_dc_l00_10_neon: 42.5 37.5 > pred8x8_plane_10_c: 347.7 295.5 > pred8x8_plane_10_neon: 136.2 108.0 > pred8x8_top_dc_10_c: 44.5 38.5 > pred8x8_top_dc_10_neon: 39.7 34.5 > pred8x8_vertical_10_c: 27.5 21.7 > pred8x8_vertical_10_neon: 21.0 22.2 > pred16x16_plane_10_c: 1242.0 1075.7 > pred16x16_plane_10_neon: 324.0 199.5 > > Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com> > --- > > moved to 32-bit, however, in plane the 16bit are not enough, and it > overflows, so when it overflows the code starts using 32bit wide > sections > > libavcodec/aarch64/h264pred_init.c | 40 +++- > libavcodec/aarch64/h264pred_neon.S | 302 ++++++++++++++++++++++++++++- > 2 files changed, 335 insertions(+), 7 deletions(-) > > diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c > index 325a86bfcd..0ae8f70d23 100644 > --- a/libavcodec/aarch64/h264pred_init.c > +++ b/libavcodec/aarch64/h264pred_init.c > @@ -45,10 +45,23 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); > void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); > void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); > > -void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); > -void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); > -void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); > void 
ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred16x16_plane_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); > + > +void ff_pred8x8_vert_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_hor_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_plane_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_128_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_left_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_l0t_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_0lt_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_l00_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred8x8_0l0_dc_neon_10(uint8_t *src, ptrdiff_t stride); > > static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, > const int bit_depth, > @@ -84,10 +97,31 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, > h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; > } > if (bit_depth == 10) { > + if (chroma_format_idc <= 1) { > + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon_10; > + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon_10; > + if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) > + h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon_10; > + h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon_10; > + if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 && > + codec_id != AV_CODEC_ID_VP8) { > + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon_10; > + h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon_10; > + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon_10; > + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] 
= ff_pred8x8_l0t_dc_neon_10; > + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon_10; > + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon_10; > + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon_10; > + } > + } > + > h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; > h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; > h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; > h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; > + if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 && > + codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) > + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon_10; > } > } > > diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S > index e40bdc8d53..712741941f 100644 > --- a/libavcodec/aarch64/h264pred_neon.S > +++ b/libavcodec/aarch64/h264pred_neon.S > @@ -361,15 +361,13 @@ function ff_pred8x8_0l0_dc_neon, export=1 > endfunc > > .macro ldcol.16 rd, rs, rt, n=4, hi=0 > -.if \n >= 4 || \hi == 0 > +.if \n >= 4 && \hi == 0 > ld1 {\rd\().h}[0], [\rs], \rt > ld1 {\rd\().h}[1], [\rs], \rt > -.endif > -.if \n >= 4 || \hi == 1 > ld1 {\rd\().h}[2], [\rs], \rt > ld1 {\rd\().h}[3], [\rs], \rt > .endif > -.if \n == 8 > +.if \n == 8 || \hi == 1 > ld1 {\rd\().h}[4], [\rs], \rt > ld1 {\rd\().h}[5], [\rs], \rt > ld1 {\rd\().h}[6], [\rs], \rt > @@ -467,3 +465,299 @@ function ff_pred16x16_vert_neon_10, export=1 > b.ne 1b > ret > endfunc > + > +function ff_pred16x16_plane_neon_10, export=1 > + sub x3, x0, x1 > + movrel x4, p16weight > + add x2, x3, #16 > + sub x3, x3, #2 > + ld1 {v0.8h}, [x3] > + ld1 {v2.8h}, [x2], x1 > + ldcol.16 v1, x3, x1, 8 > + add x3, x3, x1 > + ldcol.16 v3, x3, x1, 8 > + > + rev64 v16.8h, v0.8h > + rev64 v17.8h, v1.8h > + ext v0.16b, v16.16b, v16.16b, #8 > + ext v1.16b, v17.16b, v17.16b, #8 > + > + add v7.8h, v2.8h, v3.8h > + sub v2.8h, v2.8h, v0.8h > + sub v3.8h, v3.8h, v1.8h > + ld1 {v0.8h}, [x4] > + mul v2.8h, 
v2.8h, v0.8h > + mul v3.8h, v3.8h, v0.8h > + addp v2.8h, v2.8h, v3.8h > + addp v2.8h, v2.8h, v2.8h > + addp v2.4h, v2.4h, v2.4h > + sshll v3.4s, v2.4h, #2 > + saddw v2.4s, v3.4s, v2.4h > + rshrn v4.4h, v2.4s, #6 > + trn2 v5.4h, v4.4h, v4.4h > + add v2.4h, v4.4h, v5.4h > + shl v3.4h, v2.4h, #3 > + ext v7.16b, v7.16b, v7.16b, #14 > + sub v3.4h, v3.4h, v2.4h // 7 * (b + c) > + add v7.4h, v7.4h, v0.4h > + shl v2.4h, v7.4h, #4 > + ssubl v2.4s, v2.4h, v3.4h > + shl v3.4h, v4.4h, #4 > + ext v0.16b, v0.16b, v0.16b, #14 > + ssubl v6.4s, v5.4h, v3.4h > + > + mov v0.h[0], wzr > + mul v0.8h, v0.8h, v4.h[0] > + dup v16.4s, v2.s[0] > + dup v17.4s, v2.s[0] > + dup v2.8h, v4.h[0] > + dup v3.4s, v6.s[0] > + shl v2.8h, v2.8h, #3 > + saddw v16.4s, v16.4s, v0.4h > + saddw2 v17.4s, v17.4s, v0.8h > + saddw v3.4s, v3.4s, v2.4h Nit: Please try to fix the wobbly vertical alignment here > + > + mov w3, #16 > + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping > +1: > + sqshrun v0.4h, v16.4s, #5 > + sqshrun2 v0.8h, v17.4s, #5 > + saddw v16.4s, v16.4s, v2.4h > + saddw v17.4s, v17.4s, v2.4h > + sqshrun v1.4h, v16.4s, #5 > + sqshrun2 v1.8h, v17.4s, #5 > + add v16.4s, v16.4s, v3.4s > + add v17.4s, v17.4s, v3.4s > + > + subs w3, w3, #1 > + > + smin v0.8h, v0.8h, v4.8h > + smin v1.8h, v1.8h, v4.8h > + st1 {v0.8h, v1.8h}, [x0], x1 I think it might be better to do the 'subs' between 'smin' and 'st1'. 
I haven't verified if it's possible to do things with more narrow registers, but I guess this seems reasonable > + b.ne 1b > + ret > +endfunc > + > +function ff_pred8x8_hor_neon_10, export=1 > + sub x2, x0, #2 > + mov w3, #8 > + > +1: ld1r {v0.8h}, [x2], x1 > + subs w3, w3, #1 > + st1 {v0.8h}, [x0], x1 > + b.ne 1b > + ret > +endfunc > + > +function ff_pred8x8_vert_neon_10, export=1 > + sub x2, x0, x1 > + lsl x1, x1, #1 > + > + ld1 {v0.8h}, [x2], x1 > + mov w3, #4 > +1: subs w3, w3, #1 > + st1 {v0.8h}, [x0], x1 > + st1 {v0.8h}, [x2], x1 > + b.ne 1b > + ret > +endfunc > + > +function ff_pred8x8_plane_neon_10, export=1 > + sub x3, x0, x1 > + movrel x4, p8weight > + movrel x5, p16weight > + add x2, x3, #8 > + sub x3, x3, #2 > + ld1 {v0.d}[0], [x3] > + ld1 {v2.d}[0], [x2], x1 > + ldcol.16 v0, x3, x1, hi=1 > + add x3, x3, x1 > + ldcol.16 v3, x3, x1, 4 > + add v7.8h, v2.8h, v3.8h > + rev64 v0.8h, v0.8h > + trn1 v2.2d, v2.2d, v3.2d > + sub v2.8h, v2.8h, v0.8h > + ld1 {v6.8h}, [x4] > + mul v2.8h, v2.8h, v6.8h > + ld1 {v0.8h}, [x5] > + saddlp v2.4s, v2.8h > + addp v2.4s, v2.4s, v2.4s > + shl v3.4s, v2.4s, #4 > + add v2.4s, v3.4s, v2.4s > + rshrn v5.4h, v2.4s, #5 > + addp v2.4h, v5.4h, v5.4h > + shl v3.4h, v2.4h, #1 > + add v3.4h, v3.4h, v2.4h > + rev64 v7.4h, v7.4h > + add v7.4h, v7.4h, v0.4h > + shl v2.4h, v7.4h, #4 > + ssubl v2.4s, v2.4h, v3.4h > + ext v0.16b, v0.16b, v0.16b, #14 > + mov v0.h[0], wzr > + mul v0.8h, v0.8h, v5.h[0] > + dup v1.4s, v2.s[0] > + dup v2.4s, v2.s[0] > + dup v3.8h, v5.h[1] > + saddw v1.4s, v1.4s, v0.4h > + saddw2 v2.4s, v2.4s, v0.8h > + mov w3, #8 > + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping > +1: > + sqshrun v0.4h, v1.4s, #5 > + sqshrun2 v0.8h, v2.4s, #5 > + > + subs w3, w3, #1 > + > + saddw v1.4s, v1.4s, v3.4h > + saddw v2.4s, v2.4s, v3.4h > + > + smin v0.8h, v0.8h, v4.8h > + st1 {v0.8h}, [x0], x1 I think it might be good to do the 'smin' a bit earlier here, maybe between the two 'saddw' or after the 'subs'. Looks good other than that. 
// Martin
diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c index 325a86bfcd..0ae8f70d23 100644 --- a/libavcodec/aarch64/h264pred_init.c +++ b/libavcodec/aarch64/h264pred_init.c @@ -45,10 +45,23 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_plane_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); + +void ff_pred8x8_vert_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_hor_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_plane_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_128_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_left_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_l0t_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_0lt_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_l00_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_0l0_dc_neon_10(uint8_t *src, ptrdiff_t stride); static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, const int bit_depth, @@ -84,10 +97,31 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; } if (bit_depth == 10) { + if (chroma_format_idc <= 1) { + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon_10; + 
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon_10; + if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) + h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon_10; + h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon_10; + if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 && + codec_id != AV_CODEC_ID_VP8) { + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon_10; + h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon_10; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon_10; + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon_10; + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon_10; + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon_10; + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon_10; + } + } + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; + if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 && + codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon_10; } } diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S index e40bdc8d53..712741941f 100644 --- a/libavcodec/aarch64/h264pred_neon.S +++ b/libavcodec/aarch64/h264pred_neon.S @@ -361,15 +361,13 @@ function ff_pred8x8_0l0_dc_neon, export=1 endfunc .macro ldcol.16 rd, rs, rt, n=4, hi=0 -.if \n >= 4 || \hi == 0 +.if \n >= 4 && \hi == 0 ld1 {\rd\().h}[0], [\rs], \rt ld1 {\rd\().h}[1], [\rs], \rt -.endif -.if \n >= 4 || \hi == 1 ld1 {\rd\().h}[2], [\rs], \rt ld1 {\rd\().h}[3], [\rs], \rt .endif -.if \n == 8 +.if \n == 8 || \hi == 1 ld1 {\rd\().h}[4], [\rs], \rt ld1 {\rd\().h}[5], [\rs], \rt ld1 {\rd\().h}[6], [\rs], \rt @@ -467,3 +465,299 @@ function ff_pred16x16_vert_neon_10, export=1 b.ne 1b ret endfunc + +function ff_pred16x16_plane_neon_10, export=1 + 
sub x3, x0, x1 + movrel x4, p16weight + add x2, x3, #16 + sub x3, x3, #2 + ld1 {v0.8h}, [x3] + ld1 {v2.8h}, [x2], x1 + ldcol.16 v1, x3, x1, 8 + add x3, x3, x1 + ldcol.16 v3, x3, x1, 8 + + rev64 v16.8h, v0.8h + rev64 v17.8h, v1.8h + ext v0.16b, v16.16b, v16.16b, #8 + ext v1.16b, v17.16b, v17.16b, #8 + + add v7.8h, v2.8h, v3.8h + sub v2.8h, v2.8h, v0.8h + sub v3.8h, v3.8h, v1.8h + ld1 {v0.8h}, [x4] + mul v2.8h, v2.8h, v0.8h + mul v3.8h, v3.8h, v0.8h + addp v2.8h, v2.8h, v3.8h + addp v2.8h, v2.8h, v2.8h + addp v2.4h, v2.4h, v2.4h + sshll v3.4s, v2.4h, #2 + saddw v2.4s, v3.4s, v2.4h + rshrn v4.4h, v2.4s, #6 + trn2 v5.4h, v4.4h, v4.4h + add v2.4h, v4.4h, v5.4h + shl v3.4h, v2.4h, #3 + ext v7.16b, v7.16b, v7.16b, #14 + sub v3.4h, v3.4h, v2.4h // 7 * (b + c) + add v7.4h, v7.4h, v0.4h + shl v2.4h, v7.4h, #4 + ssubl v2.4s, v2.4h, v3.4h + shl v3.4h, v4.4h, #4 + ext v0.16b, v0.16b, v0.16b, #14 + ssubl v6.4s, v5.4h, v3.4h + + mov v0.h[0], wzr + mul v0.8h, v0.8h, v4.h[0] + dup v16.4s, v2.s[0] + dup v17.4s, v2.s[0] + dup v2.8h, v4.h[0] + dup v3.4s, v6.s[0] + shl v2.8h, v2.8h, #3 + saddw v16.4s, v16.4s, v0.4h + saddw2 v17.4s, v17.4s, v0.8h + saddw v3.4s, v3.4s, v2.4h + + mov w3, #16 + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping +1: + sqshrun v0.4h, v16.4s, #5 + sqshrun2 v0.8h, v17.4s, #5 + saddw v16.4s, v16.4s, v2.4h + saddw v17.4s, v17.4s, v2.4h + sqshrun v1.4h, v16.4s, #5 + sqshrun2 v1.8h, v17.4s, #5 + add v16.4s, v16.4s, v3.4s + add v17.4s, v17.4s, v3.4s + + subs w3, w3, #1 + + smin v0.8h, v0.8h, v4.8h + smin v1.8h, v1.8h, v4.8h + st1 {v0.8h, v1.8h}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_pred8x8_hor_neon_10, export=1 + sub x2, x0, #2 + mov w3, #8 + +1: ld1r {v0.8h}, [x2], x1 + subs w3, w3, #1 + st1 {v0.8h}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_pred8x8_vert_neon_10, export=1 + sub x2, x0, x1 + lsl x1, x1, #1 + + ld1 {v0.8h}, [x2], x1 + mov w3, #4 +1: subs w3, w3, #1 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x2], x1 + b.ne 1b + ret +endfunc + +function 
ff_pred8x8_plane_neon_10, export=1 + sub x3, x0, x1 + movrel x4, p8weight + movrel x5, p16weight + add x2, x3, #8 + sub x3, x3, #2 + ld1 {v0.d}[0], [x3] + ld1 {v2.d}[0], [x2], x1 + ldcol.16 v0, x3, x1, hi=1 + add x3, x3, x1 + ldcol.16 v3, x3, x1, 4 + add v7.8h, v2.8h, v3.8h + rev64 v0.8h, v0.8h + trn1 v2.2d, v2.2d, v3.2d + sub v2.8h, v2.8h, v0.8h + ld1 {v6.8h}, [x4] + mul v2.8h, v2.8h, v6.8h + ld1 {v0.8h}, [x5] + saddlp v2.4s, v2.8h + addp v2.4s, v2.4s, v2.4s + shl v3.4s, v2.4s, #4 + add v2.4s, v3.4s, v2.4s + rshrn v5.4h, v2.4s, #5 + addp v2.4h, v5.4h, v5.4h + shl v3.4h, v2.4h, #1 + add v3.4h, v3.4h, v2.4h + rev64 v7.4h, v7.4h + add v7.4h, v7.4h, v0.4h + shl v2.4h, v7.4h, #4 + ssubl v2.4s, v2.4h, v3.4h + ext v0.16b, v0.16b, v0.16b, #14 + mov v0.h[0], wzr + mul v0.8h, v0.8h, v5.h[0] + dup v1.4s, v2.s[0] + dup v2.4s, v2.s[0] + dup v3.8h, v5.h[1] + saddw v1.4s, v1.4s, v0.4h + saddw2 v2.4s, v2.4s, v0.8h + mov w3, #8 + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping +1: + sqshrun v0.4h, v1.4s, #5 + sqshrun2 v0.8h, v2.4s, #5 + + subs w3, w3, #1 + + saddw v1.4s, v1.4s, v3.4h + saddw v2.4s, v2.4s, v3.4h + + smin v0.8h, v0.8h, v4.8h + st1 {v0.8h}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_pred8x8_128_dc_neon_10, export=1 + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1) + movi v1.8h, #2, lsl #8 + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_top_dc_neon_10, export=1 + sub x2, x0, x1 + ld1 {v0.8h}, [x2] + + addp v0.8h, v0.8h, v0.8h + addp v0.4h, v0.4h, v0.4h + zip1 v0.4h, v0.4h, v0.4h + urshr v2.4h, v0.4h, #2 + zip1 v0.8h, v2.8h, v2.8h + zip1 v1.8h, v2.8h, v2.8h + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_left_dc_neon_10, export=1 + sub x2, x0, #2 + ldcol.16 v0, x2, x1, 8 + + addp v0.8h, v0.8h, v0.8h + addp v0.4h, v0.4h, v0.4h + urshr v2.4h, v0.4h, #2 + dup v1.8h, v2.h[1] + dup v0.8h, v2.h[0] + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_dc_neon_10, export=1 + sub x2, x0, x1 + sub x3, x0, #2 + + ld1 {v0.8h}, [x2] + ldcol.16 v1, x3, 
x1, 8 + + addp v0.8h, v0.8h, v0.8h + addp v1.8h, v1.8h, v1.8h + trn1 v2.2s, v0.2s, v1.2s + trn2 v3.2s, v0.2s, v1.2s + addp v4.4h, v2.4h, v3.4h + addp v5.4h, v4.4h, v4.4h + urshr v6.4h, v5.4h, #3 + urshr v7.4h, v4.4h, #2 + dup v0.8h, v6.h[0] + dup v2.8h, v7.h[2] + dup v1.8h, v7.h[3] + dup v3.8h, v6.h[1] + zip1 v0.2d, v0.2d, v2.2d + zip1 v1.2d, v1.2d, v3.2d +.L_pred8x8_dc_10_end: + mov w3, #4 + add x2, x0, x1, lsl #2 + +6: st1 {v0.8h}, [x0], x1 + subs w3, w3, #1 + st1 {v1.8h}, [x2], x1 + b.ne 6b + ret +endfunc + +function ff_pred8x8_l0t_dc_neon_10, export=1 + sub x2, x0, x1 + sub x3, x0, #2 + + ld1 {v0.8h}, [x2] + ldcol.16 v1, x3, x1, 4 + + addp v0.8h, v0.8h, v0.8h + addp v1.4h, v1.4h, v1.4h + addp v0.4h, v0.4h, v0.4h + addp v1.4h, v1.4h, v1.4h + add v1.4h, v1.4h, v0.4h + + urshr v2.4h, v0.4h, #2 + urshr v3.4h, v1.4h, #3 // the pred4x4 part + + dup v4.4h, v3.h[0] + dup v5.4h, v2.h[0] + dup v6.4h, v2.h[1] + + zip1 v0.2d, v4.2d, v6.2d + zip1 v1.2d, v5.2d, v6.2d + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_l00_dc_neon_10, export=1 + sub x2, x0, #2 + + ldcol.16 v0, x2, x1, 4 + + addp v0.4h, v0.4h, v0.4h + addp v0.4h, v0.4h, v0.4h + urshr v0.4h, v0.4h, #2 + + movi v1.8h, #2, lsl #8 // 512 + dup v0.8h, v0.h[0] + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_0lt_dc_neon_10, export=1 + add x3, x0, x1, lsl #2 + sub x2, x0, x1 + sub x3, x3, #2 + + ld1 {v0.8h}, [x2] + ldcol.16 v1, x3, x1, hi=1 + + addp v0.8h, v0.8h, v0.8h + addp v1.8h, v1.8h, v1.8h + addp v0.4h, v0.4h, v0.4h + addp v1.4h, v1.4h, v1.4h + zip1 v0.2s, v0.2s, v1.2s + add v1.4h, v0.4h, v1.4h + + urshr v2.4h, v0.4h, #2 + urshr v3.4h, v1.4h, #3 + + dup v4.4h, v2.h[0] + dup v5.4h, v2.h[3] + dup v6.4h, v2.h[1] + dup v7.4h, v3.h[1] + + zip1 v0.2d, v4.2d, v6.2d + zip1 v1.2d, v5.2d, v7.2d + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_0l0_dc_neon_10, export=1 + add x2, x0, x1, lsl #2 + sub x2, x2, #2 + + ldcol.16 v1, x2, x1, 4 + + addp v2.8h, v1.8h, v1.8h + addp v2.4h, v2.4h, v2.4h + urshr 
v1.4h, v2.4h, #2 + + movi v0.8h, #2, lsl #8 // 512 + dup v1.8h, v1.h[0] + b .L_pred8x8_dc_10_end +endfunc
Benchmarks: A53 A72 pred8x8_dc_10_c: 64.2 55.7 pred8x8_dc_10_neon: 61.7 53.7 pred8x8_dc_128_10_c: 26.0 20.7 pred8x8_dc_128_10_neon: 30.7 24.5 pred8x8_horizontal_10_c: 60.0 35.2 pred8x8_horizontal_10_neon: 38.0 33.0 pred8x8_left_dc_10_c: 42.5 35.5 pred8x8_left_dc_10_neon: 50.7 41.5 pred8x8_mad_cow_dc_0l0_10_c: 55.7 44.7 pred8x8_mad_cow_dc_0l0_10_neon: 47.5 37.2 pred8x8_mad_cow_dc_0lt_10_c: 89.2 75.5 pred8x8_mad_cow_dc_0lt_10_neon: 52.2 47.0 pred8x8_mad_cow_dc_l0t_10_c: 74.7 59.2 pred8x8_mad_cow_dc_l0t_10_neon: 50.5 44.7 pred8x8_mad_cow_dc_l00_10_c: 58.0 45.7 pred8x8_mad_cow_dc_l00_10_neon: 42.5 37.5 pred8x8_plane_10_c: 347.7 295.5 pred8x8_plane_10_neon: 136.2 108.0 pred8x8_top_dc_10_c: 44.5 38.5 pred8x8_top_dc_10_neon: 39.7 34.5 pred8x8_vertical_10_c: 27.5 21.7 pred8x8_vertical_10_neon: 21.0 22.2 pred16x16_plane_10_c: 1242.0 1075.7 pred16x16_plane_10_neon: 324.0 199.5 Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com> --- moved to 32-bit, however, in plane the 16bit are not enough, and it overflows, so when it overflows the code starts using 32bit wide sections libavcodec/aarch64/h264pred_init.c | 40 +++- libavcodec/aarch64/h264pred_neon.S | 302 ++++++++++++++++++++++++++++- 2 files changed, 335 insertions(+), 7 deletions(-)