Message ID | 20210413120012.91790-2-mnitenko@gmail.com |
---|---|
State | Accepted |
Headers | show |
Series | [FFmpeg-devel,v3,1/2] lavc/aarch64: change h264pred_init structure | expand |
Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
On Tue, 13 Apr 2021, Mikhail Nitenko wrote: > Benchmarks: > pred16x16_dc_10_c: 124.0 > pred16x16_dc_10_neon: 97.2 > pred16x16_horizontal_10_c: 71.7 > pred16x16_horizontal_10_neon: 66.2 > pred16x16_top_dc_10_c: 90.7 > pred16x16_top_dc_10_neon: 71.5 > pred16x16_vertical_10_c: 64.7 > pred16x16_vertical_10_neon: 61.7 When posting benchmark numbers, it's nice if you'd mention what CPU it was benchmarked on, as the numbers sometimes differ quite a bit between various CPUs. > > Some functions work slower than C and are left commented out. > --- > libavcodec/aarch64/h264pred_init.c | 12 +++ > libavcodec/aarch64/h264pred_neon.S | 117 +++++++++++++++++++++++++++++ > 2 files changed, 129 insertions(+) > > diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c > index fc8989ae0d..9a1f13910d 100644 > --- a/libavcodec/aarch64/h264pred_init.c > +++ b/libavcodec/aarch64/h264pred_init.c > @@ -45,6 +45,11 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); > void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); > void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); > > +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); > +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); > + > static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, > const int bit_depth, > const int chroma_format_idc) > @@ -78,6 +83,12 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, > codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) > h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; > } > + if (bit_depth == 10) { > + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; > + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; > + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; > + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; > + } > } > > av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, > @@ -88,3 +99,4 @@ av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, > if (have_neon(cpu_flags)) > h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); > } > + Stray newline added > diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S > index 213b40b3e7..5ce50323f8 100644 > --- a/libavcodec/aarch64/h264pred_neon.S > +++ b/libavcodec/aarch64/h264pred_neon.S > @@ -359,3 +359,120 @@ function ff_pred8x8_0l0_dc_neon, export=1 > dup v1.8b, v1.b[0] > b .L_pred8x8_dc_end > endfunc > + > +.macro ldcol.16 rd, rs, rt, n=4, hi=0 > +.if \n >= 4 || \hi == 0 > + ld1 {\rd\().h}[0], [\rs], \rt > + ld1 {\rd\().h}[1], [\rs], \rt > +.endif > +.if \n >= 4 || \hi == 1 > + ld1 {\rd\().h}[2], [\rs], \rt > + ld1 {\rd\().h}[3], [\rs], \rt > +.endif > +.if \n == 8 > + ld1 {\rd\().h}[4], [\rs], \rt > + ld1 {\rd\().h}[5], [\rs], \rt > + ld1 {\rd\().h}[6], [\rs], \rt > + ld1 {\rd\().h}[7], [\rs], \rt > +.endif > +.endm > + > +// slower than C > +/* > +function ff_pred16x16_128_dc_neon_10, export=1 > + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1) > + > + b .L_pred16x16_dc_10_end > +endfunc > +*/ > + > +function ff_pred16x16_top_dc_neon_10, export=1 > + sub x2, x0, x1 > + > + ld1 {v0.8h, v1.8h}, [x2] > + > + addv h0, v0.8h > + addv h1, v1.8h > + > + add v0.4h, v0.4h, v1.4h > + > + rshrn v0.4h, v0.4s, #4 > + dup v0.8h, v0.h[0] > + b .L_pred16x16_dc_10_end > +endfunc > + > +// slower than C > +/* > +function ff_pred16x16_left_dc_neon_10, export=1 > + sub x2, x0, #2 // access to the "left" column > + ldcol.16 v0, x2, x1, 8 > + ldcol.16 v1, x2, x1, 8 // load "left" column > + > + addv h0, v0.8h > + addv h1, v1.8h > + > + add v0.4h, v0.4h, v1.4h > + > + rshrn v0.4h, v0.4s, #4 > + dup v0.8h, v0.h[0] > + b .L_pred16x16_dc_10_end > +endfunc > +*/ > + > +function ff_pred16x16_dc_neon_10, export=1 > + sub x2, x0, x1 // access to the "top" row > + sub x3, x0, #2 // access to the "left" column > + > + ld1 {v0.8h, v1.8h}, [x2] > + ldcol.16 v2, x3, x1, 8 > + ldcol.16 v3, x3, x1, 8 // load pixels in "top" row and "left" col > + > + addv h0, v0.8h > + addv h1, v1.8h > + addv h2, v2.8h // sum all pixels in the "top" row and "left" col > + addv h3, v3.8h // (sum stays in h0-h3 registers) > + > + add v0.4h, v0.4h, v1.4h > + add v0.4h, v0.4h, v2.4h > + add v0.4h, v0.4h, v3.4h // sum registers h0-h3 Instead of doing ((v0+v1)+v2)+v3, try doing (v0+v1)+(v2+v3). That reduces the latency chain by one instruction, when the v2+v3 addition can start independent of the v0+v1 addition. > + > + rshrn v0.4h, v0.4s, #5 > + dup v0.8h, v0.h[0] > +.L_pred16x16_dc_10_end: > + mov v1.16b, v0.16b > + mov w3, #8 > +6: st1 {v0.8h, v1.8h}, [x0], x1 > + subs w3, w3, #1 > + st1 {v0.8h, v1.8h}, [x0], x1 > + b.ne 6b > + ret > +endfunc > + > +function ff_pred16x16_hor_neon_10, export=1 > + sub x2, x0, #2 > + add x3, x0, #16 > + > + mov w4, #16 > +1: ld1r {v0.8h}, [x2], x1 > + subs w4, w4, #1 > + st1 {v0.8h}, [x0], x1 > + st1 {v0.8h}, [x3], x1 > + b.ne 1b > + ret > +endfunc > + > +function ff_pred16x16_vert_neon_10, export=1 > + sub x2, x0, x1 > + add x1, x1, x1 > + > + ld1 {v0.8h, v1.8h}, [x2], x1 > + > + mov w3, #8 > +1: subs w3, w3, #1 > + st1 {v0.8h, v1.8h}, [x0], x1 > + st1 {v0.8h, v1.8h}, [x2], x1 > + > + b.ne 1b > + ret > +endfunc > + Stray newline at the end of the file, git complains about it. Other than that, the code looks straightforward and ok to me. // Martin
Inlined a few comments for ff_pred16x16_top_dc_neon_10, other are similar. At 2021-04-14 20:35:44, "Martin Storsjö" <martin@martin.st> wrote: >On Tue, 13 Apr 2021, Mikhail Nitenko wrote: > >> Benchmarks: >> pred16x16_dc_10_c: 124.0 >> pred16x16_dc_10_neon: 97.2 >> pred16x16_horizontal_10_c: 71.7 >> pred16x16_horizontal_10_neon: 66.2 >> pred16x16_top_dc_10_c: 90.7 >> pred16x16_top_dc_10_neon: 71.5 >> pred16x16_vertical_10_c: 64.7 >> pred16x16_vertical_10_neon: 61.7 > >When posting benchmark numbers, it's nice if you'd mention what CPU it was >benchmarked on, as the numbers sometimes differ quite a bit between >various CPUs. > >> >> Some functions work slower than C and are left commented out. >> --- >> libavcodec/aarch64/h264pred_init.c | 12 +++ >> libavcodec/aarch64/h264pred_neon.S | 117 +++++++++++++++++++++++++++++ >> 2 files changed, 129 insertions(+) >> >> diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c >> index fc8989ae0d..9a1f13910d 100644 >> --- a/libavcodec/aarch64/h264pred_init.c >> +++ b/libavcodec/aarch64/h264pred_init.c >> @@ -45,6 +45,11 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); >> void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); >> void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); >> >> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); >> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); >> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); >> +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); >> + >> static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, >> const int bit_depth, >> const int chroma_format_idc) >> @@ -78,6 +83,12 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, >> codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) >> h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; >> } >> + if (bit_depth == 10) { >> + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; >> + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; >> + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; >> + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; >> + } >> } >> >> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, >> @@ -88,3 +99,4 @@ av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, >> if (have_neon(cpu_flags)) >> h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); >> } >> + > >Stray newline added > >> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S >> index 213b40b3e7..5ce50323f8 100644 >> --- a/libavcodec/aarch64/h264pred_neon.S >> +++ b/libavcodec/aarch64/h264pred_neon.S >> @@ -359,3 +359,120 @@ function ff_pred8x8_0l0_dc_neon, export=1 >> dup v1.8b, v1.b[0] >> b .L_pred8x8_dc_end >> endfunc >> + >> +.macro ldcol.16 rd, rs, rt, n=4, hi=0 >> +.if \n >= 4 || \hi == 0 >> + ld1 {\rd\().h}[0], [\rs], \rt >> + ld1 {\rd\().h}[1], [\rs], \rt >> +.endif >> +.if \n >= 4 || \hi == 1 >> + ld1 {\rd\().h}[2], [\rs], \rt >> + ld1 {\rd\().h}[3], [\rs], \rt >> +.endif >> +.if \n == 8 >> + ld1 {\rd\().h}[4], [\rs], \rt >> + ld1 {\rd\().h}[5], [\rs], \rt >> + ld1 {\rd\().h}[6], [\rs], \rt >> + ld1 {\rd\().h}[7], [\rs], \rt >> +.endif >> +.endm >> + >> +// slower than C >> +/* >> +function ff_pred16x16_128_dc_neon_10, export=1 >> + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1) >> + >> + b .L_pred16x16_dc_10_end >> +endfunc >> +*/ >> + >> +function ff_pred16x16_top_dc_neon_10, export=1 >> + sub x2, x0, x1 >> + >> + ld1 {v0.8h, v1.8h}, [x2] >> + >> + addv h0, v0.8h >> + addv h1, v1.8h >> + >> + add v0.4h, v0.4h, v1.4h ld1+addv+addv+add ==> latency 5+5+5+2=17, throughput 1+1+1+1/2=3.5 Dynamic range analyze: sum 16 of 10-bits is up to 14-bits New instructions may: ld1+add+addp+addp+addp ==> latency 5+2+2+2+2=13, throughput 1+1/2+1/2+1/2+1/2=3 or ld1+add+addv ==> latency 5+2+5=12, throughput 1+1/2+1=2.5 btw: we may replace LD1 by LDP to get more bandwidth. >> + >> + rshrn v0.4h, v0.4s, #4 >> + dup v0.8h, v0.h[0] >> + b .L_pred16x16_dc_10_end >> +endfunc >> + >> +// slower than C >> +/* >> +function ff_pred16x16_left_dc_neon_10, export=1 >> + sub x2, x0, #2 // access to the "left" column >> + ldcol.16 v0, x2, x1, 8 >> + ldcol.16 v1, x2, x1, 8 // load "left" column >> + >> + addv h0, v0.8h >> + addv h1, v1.8h >> + >> + add v0.4h, v0.4h, v1.4h >> + >> + rshrn v0.4h, v0.4s, #4 >> + dup v0.8h, v0.h[0] >> + b .L_pred16x16_dc_10_end >> +endfunc >> +*/ >> + >> +function ff_pred16x16_dc_neon_10, export=1 >> + sub x2, x0, x1 // access to the "top" row >> + sub x3, x0, #2 // access to the "left" column >> + >> + ld1 {v0.8h, v1.8h}, [x2] >> + ldcol.16 v2, x3, x1, 8 >> + ldcol.16 v3, x3, x1, 8 // load pixels in "top" row and "left" col >> + >> + addv h0, v0.8h >> + addv h1, v1.8h >> + addv h2, v2.8h // sum all pixels in the "top" row and "left" col >> + addv h3, v3.8h // (sum stays in h0-h3 registers) >> + >> + add v0.4h, v0.4h, v1.4h >> + add v0.4h, v0.4h, v2.4h >> + add v0.4h, v0.4h, v3.4h // sum registers h0-h3 > >Instead of doing ((v0+v1)+v2)+v3, try doing (v0+v1)+(v2+v3). That reduces >the latency chain by one instruction, when the v2+v3 addition can start >independent of the v0+v1 addition. > >> + >> + rshrn v0.4h, v0.4s, #5 >> + dup v0.8h, v0.h[0] >> +.L_pred16x16_dc_10_end: >> + mov v1.16b, v0.16b >> + mov w3, #8 >> +6: st1 {v0.8h, v1.8h}, [x0], x1 >> + subs w3, w3, #1 >> + st1 {v0.8h, v1.8h}, [x0], x1 >> + b.ne 6b >> + ret >> +endfunc >> + >> +function ff_pred16x16_hor_neon_10, export=1 >> + sub x2, x0, #2 >> + add x3, x0, #16 >> + >> + mov w4, #16 >> +1: ld1r {v0.8h}, [x2], x1 >> + subs w4, w4, #1 >> + st1 {v0.8h}, [x0], x1 >> + st1 {v0.8h}, [x3], x1 >> + b.ne 1b >> + ret >> +endfunc >> + >> +function ff_pred16x16_vert_neon_10, export=1 >> + sub x2, x0, x1 >> + add x1, x1, x1 >> + >> + ld1 {v0.8h, v1.8h}, [x2], x1 >> + >> + mov w3, #8 >> +1: subs w3, w3, #1 >> + st1 {v0.8h, v1.8h}, [x0], x1 >> + st1 {v0.8h, v1.8h}, [x2], x1 >> + >> + b.ne 1b >> + ret >> +endfunc >> + > >Stray newline at the end of the file, git complains about it. > >Other than that, the code looks straightforward and ok to me. > >// Martin > >_______________________________________________ >ffmpeg-devel mailing list >ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
thank you, i've tried your suggestions and here is what I got (the left column is A53 and the right is A72) current code: pred16x16_top_dc_10_c: 106.0 93.2 pred16x16_top_dc_10_neon: 87.7 77.5 ld1, add, addv variant: pred16x16_top_dc_10_c: 106.0 95.5 pred16x16_top_dc_10_neon: 87.7 77.2 ld1, add, addp, addp, addp variant: pred16x16_top_dc_10_c: 106.0 93.7 pred16x16_top_dc_10_neon: 93.7 81.2 ldp variant: pred16x16_top_dc_10_c: 106.0 93.7 pred16x16_top_dc_10_neon: 87.7 77.5 so I will change code to use one addv instruction ср, 14 апр. 2021 г. в 16:18, chen <chenm003@163.com>: > > Inlined a few comments for ff_pred16x16_top_dc_neon_10, other are similar. > > At 2021-04-14 20:35:44, "Martin Storsjö" <martin@martin.st> wrote: > >On Tue, 13 Apr 2021, Mikhail Nitenko wrote: > > > >> Benchmarks: > >> pred16x16_dc_10_c: 124.0 > >> pred16x16_dc_10_neon: 97.2 > >> pred16x16_horizontal_10_c: 71.7 > >> pred16x16_horizontal_10_neon: 66.2 > >> pred16x16_top_dc_10_c: 90.7 > >> pred16x16_top_dc_10_neon: 71.5 > >> pred16x16_vertical_10_c: 64.7 > >> pred16x16_vertical_10_neon: 61.7 > > > >When posting benchmark numbers, it's nice if you'd mention what CPU it was > >benchmarked on, as the numbers sometimes differ quite a bit between > >various CPUs. > > > >> > >> Some functions work slower than C and are left commented out. > >> --- > >> libavcodec/aarch64/h264pred_init.c | 12 +++ > >> libavcodec/aarch64/h264pred_neon.S | 117 +++++++++++++++++++++++++++++ > >> 2 files changed, 129 insertions(+) > >> > >> diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c > >> index fc8989ae0d..9a1f13910d 100644 > >> --- a/libavcodec/aarch64/h264pred_init.c > >> +++ b/libavcodec/aarch64/h264pred_init.c > >> @@ -45,6 +45,11 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); > >> void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); > >> void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); > >> > >> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); > >> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); > >> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); > >> +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); > >> + > >> static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, > >> const int bit_depth, > >> const int chroma_format_idc) > >> @@ -78,6 +83,12 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, > >> codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) > >> h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; > >> } > >> + if (bit_depth == 10) { > >> + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; > >> + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; > >> + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; > >> + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; > >> + } > >> } > >> > >> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, > >> @@ -88,3 +99,4 @@ av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, > >> if (have_neon(cpu_flags)) > >> h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); > >> } > >> + > > > >Stray newline added > > > >> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S > >> index 213b40b3e7..5ce50323f8 100644 > >> --- a/libavcodec/aarch64/h264pred_neon.S > >> +++ b/libavcodec/aarch64/h264pred_neon.S > >> @@ -359,3 +359,120 @@ function ff_pred8x8_0l0_dc_neon, export=1 > >> dup v1.8b, v1.b[0] > >> b .L_pred8x8_dc_end > >> endfunc > >> + > >> +.macro ldcol.16 rd, rs, rt, n=4, hi=0 > >> +.if \n >= 4 || \hi == 0 > >> + ld1 {\rd\().h}[0], [\rs], \rt > >> + ld1 {\rd\().h}[1], [\rs], \rt > >> +.endif > >> +.if \n >= 4 || \hi == 1 > >> + ld1 {\rd\().h}[2], [\rs], \rt > >> + ld1 {\rd\().h}[3], [\rs], \rt > >> +.endif > >> +.if \n == 8 > >> + ld1 {\rd\().h}[4], [\rs], \rt > >> + ld1 {\rd\().h}[5], [\rs], \rt > >> + ld1 {\rd\().h}[6], [\rs], \rt > >> + ld1 {\rd\().h}[7], [\rs], \rt > >> +.endif > >> +.endm > >> + > >> +// slower than C > >> +/* > >> +function ff_pred16x16_128_dc_neon_10, export=1 > >> + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1) > >> + > >> + b .L_pred16x16_dc_10_end > >> +endfunc > >> +*/ > >> + > >> +function ff_pred16x16_top_dc_neon_10, export=1 > >> + sub x2, x0, x1 > >> + > >> + ld1 {v0.8h, v1.8h}, [x2] > >> + > >> + addv h0, v0.8h > >> + addv h1, v1.8h > >> + > > >> + add v0.4h, v0.4h, v1.4h > ld1+addv+addv+add ==> latency 5+5+5+2=17, throughput 1+1+1+1/2=3.5 > > > Dynamic range analyze: sum 16 of 10-bits is up to 14-bits > > > New instructions may: > ld1+add+addp+addp+addp ==> latency 5+2+2+2+2=13, throughput 1+1/2+1/2+1/2+1/2=3 > or > ld1+add+addv ==> latency 5+2+5=12, throughput 1+1/2+1=2.5 > > > btw: we may replace LD1 by LDP to get more bandwidth. > > > >> + > >> + rshrn v0.4h, v0.4s, #4 > >> + dup v0.8h, v0.h[0] > >> + b .L_pred16x16_dc_10_end > >> +endfunc > >> + > >> +// slower than C > >> +/* > >> +function ff_pred16x16_left_dc_neon_10, export=1 > >> + sub x2, x0, #2 // access to the "left" column > >> + ldcol.16 v0, x2, x1, 8 > >> + ldcol.16 v1, x2, x1, 8 // load "left" column > >> + > >> + addv h0, v0.8h > >> + addv h1, v1.8h > >> + > >> + add v0.4h, v0.4h, v1.4h > >> + > >> + rshrn v0.4h, v0.4s, #4 > >> + dup v0.8h, v0.h[0] > >> + b .L_pred16x16_dc_10_end > >> +endfunc > >> +*/ > >> + > >> +function ff_pred16x16_dc_neon_10, export=1 > >> + sub x2, x0, x1 // access to the "top" row > >> + sub x3, x0, #2 // access to the "left" column > >> + > >> + ld1 {v0.8h, v1.8h}, [x2] > >> + ldcol.16 v2, x3, x1, 8 > >> + ldcol.16 v3, x3, x1, 8 // load pixels in "top" row and "left" col > >> + > >> + addv h0, v0.8h > >> + addv h1, v1.8h > >> + addv h2, v2.8h // sum all pixels in the "top" row and "left" col > >> + addv h3, v3.8h // (sum stays in h0-h3 registers) > >> + > >> + add v0.4h, v0.4h, v1.4h > >> + add v0.4h, v0.4h, v2.4h > >> + add v0.4h, v0.4h, v3.4h // sum registers h0-h3 > > > >Instead of doing ((v0+v1)+v2)+v3, try doing (v0+v1)+(v2+v3). That reduces > >the latency chain by one instruction, when the v2+v3 addition can start > >independent of the v0+v1 addition. > > > >> + > >> + rshrn v0.4h, v0.4s, #5 > >> + dup v0.8h, v0.h[0] > >> +.L_pred16x16_dc_10_end: > >> + mov v1.16b, v0.16b > >> + mov w3, #8 > >> +6: st1 {v0.8h, v1.8h}, [x0], x1 > >> + subs w3, w3, #1 > >> + st1 {v0.8h, v1.8h}, [x0], x1 > >> + b.ne 6b > >> + ret > >> +endfunc > >> + > >> +function ff_pred16x16_hor_neon_10, export=1 > >> + sub x2, x0, #2 > >> + add x3, x0, #16 > >> + > >> + mov w4, #16 > >> +1: ld1r {v0.8h}, [x2], x1 > >> + subs w4, w4, #1 > >> + st1 {v0.8h}, [x0], x1 > >> + st1 {v0.8h}, [x3], x1 > >> + b.ne 1b > >> + ret > >> +endfunc > >> + > >> +function ff_pred16x16_vert_neon_10, export=1 > >> + sub x2, x0, x1 > >> + add x1, x1, x1 > >> + > >> + ld1 {v0.8h, v1.8h}, [x2], x1 > >> + > >> + mov w3, #8 > >> +1: subs w3, w3, #1 > >> + st1 {v0.8h, v1.8h}, [x0], x1 > >> + st1 {v0.8h, v1.8h}, [x2], x1 > >> + > >> + b.ne 1b > >> + ret > >> +endfunc > >> + > > > >Stray newline at the end of the file, git complains about it. > > > >Other than that, the code looks straightforward and ok to me. > > > >// Martin > > > >_______________________________________________ > >ffmpeg-devel mailing list > >ffmpeg-devel@ffmpeg.org > >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > >To unsubscribe, visit link above, or email > >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c index fc8989ae0d..9a1f13910d 100644 --- a/libavcodec/aarch64/h264pred_init.c +++ b/libavcodec/aarch64/h264pred_init.c @@ -45,6 +45,11 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); + static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc) @@ -78,6 +83,12 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; } + if (bit_depth == 10) { + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; + } } av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, @@ -88,3 +99,4 @@ av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, if (have_neon(cpu_flags)) h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); } + diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S index 213b40b3e7..5ce50323f8 100644 --- a/libavcodec/aarch64/h264pred_neon.S +++ b/libavcodec/aarch64/h264pred_neon.S @@ -359,3 +359,120 @@ function ff_pred8x8_0l0_dc_neon, export=1 dup v1.8b, v1.b[0] b .L_pred8x8_dc_end endfunc + +.macro ldcol.16 rd, rs, rt, n=4, hi=0 +.if \n >= 4 || \hi == 0 + ld1 {\rd\().h}[0], [\rs], \rt + ld1 {\rd\().h}[1], [\rs], \rt +.endif +.if \n >= 4 || \hi == 1 + ld1 {\rd\().h}[2], [\rs], \rt + ld1 {\rd\().h}[3], [\rs], \rt +.endif +.if \n == 8 + ld1 {\rd\().h}[4], [\rs], \rt + ld1 {\rd\().h}[5], [\rs], \rt + ld1 {\rd\().h}[6], [\rs], \rt + ld1 {\rd\().h}[7], [\rs], \rt +.endif +.endm + +// slower than C +/* +function ff_pred16x16_128_dc_neon_10, export=1 + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1) + + b .L_pred16x16_dc_10_end +endfunc +*/ + +function ff_pred16x16_top_dc_neon_10, export=1 + sub x2, x0, x1 + + ld1 {v0.8h, v1.8h}, [x2] + + addv h0, v0.8h + addv h1, v1.8h + + add v0.4h, v0.4h, v1.4h + + rshrn v0.4h, v0.4s, #4 + dup v0.8h, v0.h[0] + b .L_pred16x16_dc_10_end +endfunc + +// slower than C +/* +function ff_pred16x16_left_dc_neon_10, export=1 + sub x2, x0, #2 // access to the "left" column + ldcol.16 v0, x2, x1, 8 + ldcol.16 v1, x2, x1, 8 // load "left" column + + addv h0, v0.8h + addv h1, v1.8h + + add v0.4h, v0.4h, v1.4h + + rshrn v0.4h, v0.4s, #4 + dup v0.8h, v0.h[0] + b .L_pred16x16_dc_10_end +endfunc +*/ + +function ff_pred16x16_dc_neon_10, export=1 + sub x2, x0, x1 // access to the "top" row + sub x3, x0, #2 // access to the "left" column + + ld1 {v0.8h, v1.8h}, [x2] + ldcol.16 v2, x3, x1, 8 + ldcol.16 v3, x3, x1, 8 // load pixels in "top" row and "left" col + + addv h0, v0.8h + addv h1, v1.8h + addv h2, v2.8h // sum all pixels in the "top" row and "left" col + addv h3, v3.8h // (sum stays in h0-h3 registers) + + add v0.4h, v0.4h, v1.4h + add v0.4h, v0.4h, v2.4h + add v0.4h, v0.4h, v3.4h // sum registers h0-h3 + + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] +.L_pred16x16_dc_10_end: + mov v1.16b, v0.16b + mov w3, #8 +6: st1 {v0.8h, v1.8h}, [x0], x1 + subs w3, w3, #1 + st1 {v0.8h, v1.8h}, [x0], x1 + b.ne 6b + ret +endfunc + +function ff_pred16x16_hor_neon_10, export=1 + sub x2, x0, #2 + add x3, x0, #16 + + mov w4, #16 +1: ld1r {v0.8h}, [x2], x1 + subs w4, w4, #1 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x3], x1 + b.ne 1b + ret +endfunc + +function ff_pred16x16_vert_neon_10, export=1 + sub x2, x0, x1 + add x1, x1, x1 + + ld1 {v0.8h, v1.8h}, [x2], x1 + + mov w3, #8 +1: subs w3, w3, #1 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x2], x1 + + b.ne 1b + ret +endfunc +