Message ID | 20210408190044.70816-1-mnitenko@gmail.com |
---|---|
State | Accepted |
Series | [FFmpeg-devel] lavc/aarch64: add pred16x16 10-bit functions |
Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
On Thu, Apr 8, 2021 at 21:10, Mikhail Nitenko <mnitenko@gmail.com> wrote:
>
> here are the benchmarks https://0x1.st/kX.txt

Instead please add the relevant lines to the commit message.

> ---
>  libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
>  libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
>  2 files changed, 168 insertions(+), 30 deletions(-)
>
> diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
> index b144376f90..d74205c2de 100644
> --- a/libavcodec/aarch64/h264pred_init.c
> +++ b/libavcodec/aarch64/h264pred_init.c
> @@ -45,42 +45,57 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
>  void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
>  void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
>
> +void ff_pred16x16_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
> +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
> +
>  static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
>                                          const int bit_depth,
>                                          const int chroma_format_idc)
>  {
> -    const int high_depth = bit_depth > 8;
> -
> -    if (high_depth)
> -        return;
> -
> -    if (chroma_format_idc <= 1) {
> -        h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
> -        h->pred8x8[HOR_PRED8x8  ] = ff_pred8x8_hor_neon;
> -        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
> -            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
> -        h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
> -        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
> -            codec_id != AV_CODEC_ID_VP8) {
> -            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
> -            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
> -            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
> -            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
> +    if (bit_depth == 8) {
> +        if (chroma_format_idc <= 1) {
> +            h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
> +            h->pred8x8[HOR_PRED8x8  ] = ff_pred8x8_hor_neon;
> +            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
> +                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
> +            h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
> +            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
> +                codec_id != AV_CODEC_ID_VP8) {
> +                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
> +                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
> +                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
> +                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;

Please do not re-indent these lines in the patch that adds the new
functions, feel free to send a separate patch for the re-indentation.

Carl Eugen
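The re-indentation objected to above comes from wrapping the whole existing 8-bit block in a new `if (bit_depth == 8)`. A minimal-diff alternative (a sketch only, not the submitted patch) would dispatch on bit depth first and leave the existing lines at their old indentation:

```c
static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                        const int bit_depth,
                                        const int chroma_format_idc)
{
    if (bit_depth == 10) {
        /* install the new ff_pred16x16_*_neon_10 functions here */
        return;
    }
    if (bit_depth > 8)
        return;
    /* existing 8-bit assignments follow, untouched and not re-indented */
}
```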
On Thu, 8 Apr 2021, Mikhail Nitenko wrote:

> here are the benchmarks https://0x1.st/kX.txt
> ---
>  libavcodec/aarch64/h264pred_init.c |  75 +++++++++++-------
>  libavcodec/aarch64/h264pred_neon.S | 123 +++++++++++++++++++++++++++++
>  2 files changed, 168 insertions(+), 30 deletions(-)
>
>  av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
> diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
> index 213b40b3e7..633b401d59 100644
> --- a/libavcodec/aarch64/h264pred_neon.S
> +++ b/libavcodec/aarch64/h264pred_neon.S
> @@ -359,3 +359,126 @@ function ff_pred8x8_0l0_dc_neon, export=1
>          dup v1.8b, v1.b[0]
>          b .L_pred8x8_dc_end
>  endfunc
> +
> +.macro ldcol.16 rd, rs, rt, n=4, hi=0
> +.if \n >= 4 || \hi == 0
> +        ld1 {\rd\().h}[0], [\rs], \rt
> +        ld1 {\rd\().h}[1], [\rs], \rt
> +.endif
> +.if \n >= 4 || \hi == 1
> +        ld1 {\rd\().h}[2], [\rs], \rt
> +        ld1 {\rd\().h}[3], [\rs], \rt
> +.endif
> +.if \n == 8
> +        ld1 {\rd\().h}[4], [\rs], \rt
> +        ld1 {\rd\().h}[5], [\rs], \rt
> +        ld1 {\rd\().h}[6], [\rs], \rt
> +        ld1 {\rd\().h}[7], [\rs], \rt
> +.endif
> +.endm

I believe this could be a bit faster by using two alternating registers
that are incremented - but as the existing code doesn't do that, it's
not necessary.

> +
> +function ff_pred16x16_128_dc_neon_10, export=1
> +        movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
> +
> +        b .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_top_dc_neon_10, export=1
> +        sub x2, x0, x1
> +
> +        ld1 {v0.8h}, [x2], #16
> +        ld1 {v1.8h}, [x2]

This can be one single instruction, ld1 {v0.8h, v1.8h}, [x2]

> +        uaddlv s0, v0.8h
> +        uaddlv s1, v1.8h

When adding up 8 elements that are 10 bit, they still fit in 16 bit (it
only requires 13 bit), so you don't need uaddlv here, addv would be
better. And when adding the two results, it still fits in 16 bit (then
it'd use 14 bits).

> +
> +        add v0.2s, v0.2s, v1.2s
> +
> +        rshrn v0.4h, v0.4s, #4
> +        dup v0.8h, v0.h[0]
> +        b .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_left_dc_neon_10, export=1
> +        sub x2, x0, #2 // access to the "left" column
> +        ldcol.16 v0, x2, x1, 8
> +        ldcol.16 v1, x2, x1, 8 // load "left" column
> +
> +        uaddlv s0, v0.8h
> +        uaddlv s1, v1.8h

Same thing here, addv+addv should be enough

> +
> +        add v0.2s, v0.2s, v1.2s
> +
> +        rshrn v0.4h, v0.4s, #4
> +        dup v0.8h, v0.h[0]
> +        b .L_pred16x16_dc_10_end
> +endfunc
> +
> +function ff_pred16x16_dc_neon_10, export=1
> +        sub x2, x0, x1 // access to the "top" row
> +        sub x3, x0, #2 // access to the "left" column
> +
> +        ld1 {v0.8h}, [x2], #16
> +        ld1 {v1.8h}, [x2]

One single ld1 {v0.8h, v1.8h}

> +        ldcol.16 v2, x3, x1, 8
> +        ldcol.16 v3, x3, x1, 8 // load pixels in "top" col and "left" row
> +
> +        uaddlv s0, v0.8h
> +        uaddlv s1, v1.8h
> +        uaddlv s2, v2.8h // sum all pixels in the "top" row and "left" col
> +        uaddlv s3, v3.8h // (sum stays in v0-v3 registers)

addv

> +
> +        add v0.2s, v0.2s, v1.2s
> +        add v0.2s, v0.2s, v2.2s
> +        add v0.2s, v0.2s, v3.2s // sum registers v0-v3
> +
> +        rshrn v0.4h, v0.4s, #5 // right shift vector
> +        dup v0.8h, v0.h[0] // fill vector with 0th value (dcsplat)

These comments are kinda pointless here

> +.L_pred16x16_dc_10_end:
> +        sub x1, x1, #16
> +        mov w3, #8
> +6:      st1 {v0.8h}, [x0], #16
> +        st1 {v0.8h}, [x0], x1

This can be one single "st1 {v0.8h, v1.8h}, [x0], x1" if you make sure
that v1 contains the same

> +        st1 {v0.8h}, [x0], #16
> +        st1 {v0.8h}, [x0], x1
> +
> +        subs w3, w3, #1
> +        b.ne 6b
> +        ret
> +endfunc
> +
> +function ff_pred16x16_hor_neon_10, export=1
> +        sub x2, x0, #2
> +        sub x3, x1, #16
> +
> +        mov w4, #16
> +1:      ld1r {v0.8h}, [x2], x1
> +        st1 {v0.8h}, [x0], #16
> +        st1 {v0.8h}, [x0], x3

This might be ok here, but also do check if copying the value to v1 and
doing one single "st1 {v0.8h, v1.8h}" is faster.

> +
> +        subs w4, w4, #1
> +        b.ne 1b
> +        ret
> +endfunc
> +
> +function ff_pred16x16_vert_neon_10, export=1
> +        sub x2, x0, x1
> +        add x1, x1, x1
> +        sub x1, x1, #16
> +
> +        ld1 {v0.8h}, [x2], #16
> +        ld1 {v1.8h}, [x2], x1

One single ld1

> +
> +        mov w3, #8
> +1:      st1 {v0.8h}, [x0], #16
> +        st1 {v1.8h}, [x0], x1

One single st1

// Martin
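The addv suggestion above is plain width arithmetic: a row of eight 10-bit pixels sums to at most 8 * 1023 = 8184 (13 bits), and adding two such sums gives at most 16368 (14 bits), so everything stays inside a 16-bit lane and the widening uaddlv is unnecessary. A small standalone C check of those bounds (illustrative only, not part of the patch):

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Worst case: every neighbour pixel holds the 10-bit maximum. */
    uint16_t row = 0, total;
    for (int i = 0; i < 8; i++)
        row = (uint16_t)(row + 1023);   /* 8 * 1023 = 8184, needs 13 bits */
    assert(row == 8184);
    total = (uint16_t)(row + row);      /* 16 * 1023 = 16368, needs 14 bits */
    assert(total == 16368);             /* < 65536: 16-bit lanes never
                                           overflow, so addv suffices */
    return 0;
}
```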
diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index b144376f90..d74205c2de 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -45,42 +45,57 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
 void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
 void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
 
+void ff_pred16x16_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
+
 static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                         const int bit_depth,
                                         const int chroma_format_idc)
 {
-    const int high_depth = bit_depth > 8;
-
-    if (high_depth)
-        return;
-
-    if (chroma_format_idc <= 1) {
-        h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
-        h->pred8x8[HOR_PRED8x8  ] = ff_pred8x8_hor_neon;
-        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
-            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
-        h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
-        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
-            codec_id != AV_CODEC_ID_VP8) {
-            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
-            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
-            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
-            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+    if (bit_depth == 8) {
+        if (chroma_format_idc <= 1) {
+            h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
+            h->pred8x8[HOR_PRED8x8  ] = ff_pred8x8_hor_neon;
+            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
+            h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
+            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+                codec_id != AV_CODEC_ID_VP8) {
+                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
+                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
+                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+            }
         }
-    }
-    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
-    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
-    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
-    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
-    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
-    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
-    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
-        codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
-        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+        h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
+        h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
+        h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
+        h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
+        h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+            h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+    }
+    if (bit_depth == 10) {
+        h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon_10;
+        h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon_10;
+        h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon_10;
+        h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10;
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon_10;
+        h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon_10;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) {}
+            //h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+    }
 }
 
 av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index 213b40b3e7..633b401d59 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -359,3 +359,126 @@ function ff_pred8x8_0l0_dc_neon, export=1
         dup v1.8b, v1.b[0]
         b .L_pred8x8_dc_end
 endfunc
+
+.macro ldcol.16 rd, rs, rt, n=4, hi=0
+.if \n >= 4 || \hi == 0
+        ld1 {\rd\().h}[0], [\rs], \rt
+        ld1 {\rd\().h}[1], [\rs], \rt
+.endif
+.if \n >= 4 || \hi == 1
+        ld1 {\rd\().h}[2], [\rs], \rt
+        ld1 {\rd\().h}[3], [\rs], \rt
+.endif
+.if \n == 8
+        ld1 {\rd\().h}[4], [\rs], \rt
+        ld1 {\rd\().h}[5], [\rs], \rt
+        ld1 {\rd\().h}[6], [\rs], \rt
+        ld1 {\rd\().h}[7], [\rs], \rt
+.endif
+.endm
+
+function ff_pred16x16_128_dc_neon_10, export=1
+        movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
+
+        b .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_top_dc_neon_10, export=1
+        sub x2, x0, x1
+
+        ld1 {v0.8h}, [x2], #16
+        ld1 {v1.8h}, [x2]
+
+        uaddlv s0, v0.8h
+        uaddlv s1, v1.8h
+
+        add v0.2s, v0.2s, v1.2s
+
+        rshrn v0.4h, v0.4s, #4
+        dup v0.8h, v0.h[0]
+        b .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_left_dc_neon_10, export=1
+        sub x2, x0, #2 // access to the "left" column
+        ldcol.16 v0, x2, x1, 8
+        ldcol.16 v1, x2, x1, 8 // load "left" column
+
+        uaddlv s0, v0.8h
+        uaddlv s1, v1.8h
+
+        add v0.2s, v0.2s, v1.2s
+
+        rshrn v0.4h, v0.4s, #4
+        dup v0.8h, v0.h[0]
+        b .L_pred16x16_dc_10_end
+endfunc
+
+function ff_pred16x16_dc_neon_10, export=1
+        sub x2, x0, x1 // access to the "top" row
+        sub x3, x0, #2 // access to the "left" column
+
+        ld1 {v0.8h}, [x2], #16
+        ld1 {v1.8h}, [x2]
+        ldcol.16 v2, x3, x1, 8
+        ldcol.16 v3, x3, x1, 8 // load pixels in "top" col and "left" row
+
+        uaddlv s0, v0.8h
+        uaddlv s1, v1.8h
+        uaddlv s2, v2.8h // sum all pixels in the "top" row and "left" col
+        uaddlv s3, v3.8h // (sum stays in v0-v3 registers)
+
+        add v0.2s, v0.2s, v1.2s
+        add v0.2s, v0.2s, v2.2s
+        add v0.2s, v0.2s, v3.2s // sum registers v0-v3
+
+        rshrn v0.4h, v0.4s, #5 // right shift vector
+        dup v0.8h, v0.h[0] // fill vector with 0th value (dcsplat)
+.L_pred16x16_dc_10_end:
+        sub x1, x1, #16
+        mov w3, #8
+6:      st1 {v0.8h}, [x0], #16
+        st1 {v0.8h}, [x0], x1
+
+        st1 {v0.8h}, [x0], #16
+        st1 {v0.8h}, [x0], x1
+
+        subs w3, w3, #1
+        b.ne 6b
+        ret
+endfunc
+
+function ff_pred16x16_hor_neon_10, export=1
+        sub x2, x0, #2
+        sub x3, x1, #16
+
+        mov w4, #16
+1:      ld1r {v0.8h}, [x2], x1
+        st1 {v0.8h}, [x0], #16
+        st1 {v0.8h}, [x0], x3
+
+        subs w4, w4, #1
+        b.ne 1b
+        ret
+endfunc
+
+function ff_pred16x16_vert_neon_10, export=1
+        sub x2, x0, x1
+        add x1, x1, x1
+        sub x1, x1, #16
+
+        ld1 {v0.8h}, [x2], #16
+        ld1 {v1.8h}, [x2], x1
+
+        mov w3, #8
+1:      st1 {v0.8h}, [x0], #16
+        st1 {v1.8h}, [x0], x1
+
+        st1 {v0.8h}, [x2], #16
+        st1 {v1.8h}, [x2], x1
+
+        subs w3, w3, #1
+        b.ne 1b
+        ret
+endfunc
+
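For orientation, the full-DC path in the patch sums the 16 "top" and 16 "left" neighbour pixels, applies a rounding shift by 5 (matching rshrn #5; the top/left-only variants sum one edge and shift by 4), and splats the result over the 16x16 block of 16-bit pixels. A plain C sketch of that computation (a hypothetical helper for illustration, not FFmpeg's template code; stride is in bytes, as in the assembly):

```c
#include <stddef.h>
#include <stdint.h>

/* Simplified 10-bit pred16x16 DC, mirroring what the NEON version computes. */
static void pred16x16_dc_10(uint8_t *src, ptrdiff_t stride)
{
    uint16_t *pix = (uint16_t *)src;
    ptrdiff_t s   = stride / 2;  /* stride in 16-bit pixels */
    unsigned  dc  = 16;          /* rounding term for the >> 5 below */

    for (int i = 0; i < 16; i++) {
        dc += pix[i - s];        /* "top" row neighbour */
        dc += pix[i * s - 1];    /* "left" column neighbour */
    }
    dc >>= 5;                    /* (sum + 16) >> 5, as rshrn #5 does */

    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pix[y * s + x] = (uint16_t)dc;
}
```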