Message ID | 20211007143057.17870-1-jdek@itanimul.li |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/4] lavc/aarch64: add hevc sao edge 16x16 | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
On Thu, 7 Oct 2021, J. Dekker wrote: > --bench on AWS Graviton: > > hevc_sao_edge_16x16_8_c: 1857.0 > hevc_sao_edge_16x16_8_neon: 211.0 > hevc_sao_edge_32x32_8_c: 7802.2 > hevc_sao_edge_32x32_8_neon: 808.2 > hevc_sao_edge_48x48_8_c: 16764.2 > hevc_sao_edge_48x48_8_neon: 1796.5 > hevc_sao_edge_64x64_8_c: 32647.5 > hevc_sao_edge_64x64_8_neon: 3118.5 > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/aarch64/hevcdsp_init_aarch64.c | 8 ++- > libavcodec/aarch64/hevcdsp_sao_neon.S | 66 +++++++++++++++++++++++ > 2 files changed, 72 insertions(+), 2 deletions(-) > > diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c > index c785e46f79..747ff0412d 100644 > --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c > +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c > @@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src, > ptrdiff_t stride_dst, ptrdiff_t stride_src, > int16_t *sao_offset_val, int sao_left_class, > int width, int height); > - > - > +void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, > + int16_t *sao_offset_val, int eo, int width, int height); > > av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) > { > @@ -76,6 +76,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) > c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; > c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; > c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon; > + c->sao_edge_filter[1] = > + c->sao_edge_filter[2] = > + c->sao_edge_filter[3] = > + c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon; > } > if (bit_depth == 10) { > c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; > diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S > index f9fed8345b..a7f054c075 100644 > --- a/libavcodec/aarch64/hevcdsp_sao_neon.S > +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S > @@ -85,3 +85,69 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 > bne 1b > ret > endfunc > + > +// ASSUMES STRIDE_SRC = 192 > +.Lsao_edge_pos: > +.word 1 // horizontal > +.word 192 // vertical > +.word 192 + 1 // 45 degree > +.word 192 - 1 // 135 degree > + > +// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst, > +// int16 *sao_offset_val, int eo, int width, int height) > +function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 > + lsl w4, w4, #2 > + adr x7, .Lsao_edge_pos > + ld1 {v3.8h}, [x3] // load sao_offset_val > + sxtw x5, w5 > + ldr w4, [x7, x4] // stride_src You can fold the lsl into the ldr here, as "ldr w4, [x7, w4, uxtw #2]". > + mov v3.h[7], v3.h[0] // reorder to [1,2,0,3,4] > + mov v3.h[0], v3.h[1] > + mov v3.h[1], v3.h[2] > + mov v3.h[2], v3.h[7] This bit is going to be surprisingly slow, I think. If possible, it'd be nice if the sao_offset_val array could be in the right order from the get go. Other alternatives to this would be a tbl permutation, or a combination of ext + some masking, but I don't think it'd be significantly faster either. I guess this is fine as it's still only the setup for the function, with the body probably being much heavier anyway. > + // split 16bit values into two tables > + uzp2 v1.16b, v3.16b, v3.16b // sao_offset_val -> upper > + uzp1 v0.16b, v3.16b, v3.16b // sao_offset_val -> lower > + movi v2.16b, #2 > + mov x15, #192 > + // strides between end of line and next src/dst > + sub x15, x15, x5 // stride_src - width One could fold the sxtw of w5 into this line too, as "sub x15, x15, w5, sxtw", but as you need it sign extended in a couple places below too, keeping it as a separate instruction might be good. > + sub x16, x2, x5 // stride_dst - width > + mov x11, x1 // copy base src > +1: // new line > + mov x14, x5 // copy width > + sub x12, x11, x4 // src_a (prev) = src - sao_edge_pos > + add x13, x11, x4 // src_b (next) = src + sao_edge_pos > +2: // process 16 bytes > + ld1 {v3.16b}, [x11], #16 // load src > + ld1 {v4.16b}, [x12], #16 // load src_a (prev) > + ld1 {v5.16b}, [x13], #16 // load src_b (next) > + cmhi v16.16b, v4.16b, v3.16b // (prev > cur) > + cmhi v17.16b, v3.16b, v4.16b // (cur > prev) > + cmhi v18.16b, v5.16b, v3.16b // (next > cur) > + cmhi v19.16b, v3.16b, v5.16b // (cur > next) > + sub v20.16b, v16.16b, v17.16b // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev) > + sub v21.16b, v18.16b, v19.16b // diff1 = CMP(cur, next) = (cur > next) - (cur < next) > + add v20.16b, v20.16b, v21.16b // diff = diff0 + diff1 > + add v20.16b, v20.16b, v2.16b // offset_val = diff + 2 > + tbl v16.16b, {v0.16b}, v20.16b > + tbl v17.16b, {v1.16b}, v20.16b > + zip1 v18.16b, v16.16b, v17.16b // sao_offset_val lower -> > + zip2 v19.16b, v16.16b, v17.16b // sao_offset_val upper -> > + uxtl v20.8h, v3.8b // src[0:7] > + uxtl2 v21.8h, v3.16b // src[7:15] I think it might be better to schedule the uxtl after the 'tbl' which can have a fair bit of latency. > + sqadd v20.8h, v18.8h, v20.8h // + sao_offset_val > + sqadd v21.8h, v19.8h, v21.8h If you can do without clipping, you could avoid the uxtl by doing uaddw, uaddw2 here instead. But it might turn out that the uxtl is essentially for free while waiting for the results from tbl anyway. > + sqxtun v3.8b, v20.8h > + sqxtun2 v3.16b, v21.8h > + st1 {v3.16b}, [x0], #16 > + subs x14, x14, #16 // filtered 16 bytes > + b.ne 2b // do we have width to filter? > + // no width to filter, setup next line > + add x11, x11, x15 // stride src to next line > + add x0, x0, x16 // stride dst to next line > + subs w6, w6, #1 // filtered line > + b.ne 1b // do we have lines to process? > + // no lines to filter > + ret > +endfunc > -- > 2.30.1 (Apple Git-130) Looks mostly ok, the permutation bit feels a bit surprising though. // Martin
On Thu, 7 Oct 2021, J. Dekker wrote: > --bench on AWS Graviton: > > hevc_sao_edge_16x16_8_c: 1857.0 > hevc_sao_edge_16x16_8_neon: 211.0 > hevc_sao_edge_32x32_8_c: 7802.2 > hevc_sao_edge_32x32_8_neon: 808.2 > hevc_sao_edge_48x48_8_c: 16764.2 > hevc_sao_edge_48x48_8_neon: 1796.5 > hevc_sao_edge_64x64_8_c: 32647.5 > hevc_sao_edge_64x64_8_neon: 3118.5 > > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/aarch64/hevcdsp_init_aarch64.c | 8 ++- > libavcodec/aarch64/hevcdsp_sao_neon.S | 66 +++++++++++++++++++++++ > 2 files changed, 72 insertions(+), 2 deletions(-) > > diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c > index c785e46f79..747ff0412d 100644 > --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c > +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c > @@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src, > ptrdiff_t stride_dst, ptrdiff_t stride_src, > int16_t *sao_offset_val, int sao_left_class, > int width, int height); > - > - > +void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, > + int16_t *sao_offset_val, int eo, int width, int height); > > av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) > { > @@ -76,6 +76,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) > c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; > c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; > c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon; > + c->sao_edge_filter[1] = > + c->sao_edge_filter[2] = > + c->sao_edge_filter[3] = > + c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon; > } > if (bit_depth == 10) { > c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; > diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S > index f9fed8345b..a7f054c075 100644 > --- a/libavcodec/aarch64/hevcdsp_sao_neon.S > +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S > @@ -85,3 +85,69 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 > bne 1b > ret > endfunc > + > +// ASSUMES STRIDE_SRC = 192 > +.Lsao_edge_pos: > +.word 1 // horizontal > +.word 192 // vertical > +.word 192 + 1 // 45 degree > +.word 192 - 1 // 135 degree > + > +// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst, > +// int16 *sao_offset_val, int eo, int width, int height) > +function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 > + lsl w4, w4, #2 Actually, the left indentation here is one char too little, compared with the existing function here in the same file, and compared with other asm sources. So instead of reindenting the old one, please indent the new one according to all other existing asm instead. // Martin
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index c785e46f79..747ff0412d 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int16_t *sao_offset_val, int sao_left_class, int width, int height); - - +void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int eo, int width, int height); av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) { @@ -76,6 +76,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon; + c->sao_edge_filter[1] = + c->sao_edge_filter[2] = + c->sao_edge_filter[3] = + c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon; } if (bit_depth == 10) { c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index f9fed8345b..a7f054c075 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -85,3 +85,69 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 bne 1b ret endfunc + +// ASSUMES STRIDE_SRC = 192 +.Lsao_edge_pos: +.word 1 // horizontal +.word 192 // vertical +.word 192 + 1 // 45 degree +.word 192 - 1 // 135 degree + +// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst, +// int16 *sao_offset_val, int eo, int width, int height) +function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 + lsl w4, w4, #2 + adr x7, .Lsao_edge_pos + ld1 {v3.8h}, [x3] // load sao_offset_val + sxtw x5, w5 + ldr w4, [x7, x4] // stride_src + mov v3.h[7], v3.h[0] // reorder to [1,2,0,3,4] + mov v3.h[0], v3.h[1] + mov v3.h[1], v3.h[2] + mov v3.h[2], v3.h[7] + // split 16bit values into two tables + uzp2 v1.16b, v3.16b, v3.16b // sao_offset_val -> upper + uzp1 v0.16b, v3.16b, v3.16b // sao_offset_val -> lower + movi v2.16b, #2 + mov x15, #192 + // strides between end of line and next src/dst + sub x15, x15, x5 // stride_src - width + sub x16, x2, x5 // stride_dst - width + mov x11, x1 // copy base src +1: // new line + mov x14, x5 // copy width + sub x12, x11, x4 // src_a (prev) = src - sao_edge_pos + add x13, x11, x4 // src_b (next) = src + sao_edge_pos +2: // process 16 bytes + ld1 {v3.16b}, [x11], #16 // load src + ld1 {v4.16b}, [x12], #16 // load src_a (prev) + ld1 {v5.16b}, [x13], #16 // load src_b (next) + cmhi v16.16b, v4.16b, v3.16b // (prev > cur) + cmhi v17.16b, v3.16b, v4.16b // (cur > prev) + cmhi v18.16b, v5.16b, v3.16b // (next > cur) + cmhi v19.16b, v3.16b, v5.16b // (cur > next) + sub v20.16b, v16.16b, v17.16b // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev) + sub v21.16b, v18.16b, v19.16b // diff1 = CMP(cur, next) = (cur > next) - (cur < next) + add v20.16b, v20.16b, v21.16b // diff = diff0 + diff1 + add v20.16b, v20.16b, v2.16b // offset_val = diff + 2 + tbl v16.16b, {v0.16b}, v20.16b + tbl v17.16b, {v1.16b}, v20.16b + zip1 v18.16b, v16.16b, v17.16b // sao_offset_val lower -> + zip2 v19.16b, v16.16b, v17.16b // sao_offset_val upper -> + uxtl v20.8h, v3.8b // src[0:7] + uxtl2 v21.8h, v3.16b // src[7:15] + sqadd v20.8h, v18.8h, v20.8h // + sao_offset_val + sqadd v21.8h, v19.8h, v21.8h + sqxtun v3.8b, v20.8h + sqxtun2 v3.16b, v21.8h + st1 {v3.16b}, [x0], #16 + subs x14, x14, #16 // filtered 16 bytes + b.ne 2b // do we have width to filter? + // no width to filter, setup next line + add x11, x11, x15 // stride src to next line + add x0, x0, x16 // stride dst to next line + subs w6, w6, #1 // filtered line + b.ne 1b // do we have lines to process? + // no lines to filter + ret +endfunc
--bench on AWS Graviton: hevc_sao_edge_16x16_8_c: 1857.0 hevc_sao_edge_16x16_8_neon: 211.0 hevc_sao_edge_32x32_8_c: 7802.2 hevc_sao_edge_32x32_8_neon: 808.2 hevc_sao_edge_48x48_8_c: 16764.2 hevc_sao_edge_48x48_8_neon: 1796.5 hevc_sao_edge_64x64_8_c: 32647.5 hevc_sao_edge_64x64_8_neon: 3118.5 Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 8 ++- libavcodec/aarch64/hevcdsp_sao_neon.S | 66 +++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-)