Message ID | 20211007143057.17870-4-jdek@itanimul.li |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/4] lavc/aarch64: add hevc sao edge 16x16 | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
On Thu, 7 Oct 2021, J. Dekker wrote: > Signed-off-by: J. Dekker <jdek@itanimul.li> > --- > libavcodec/aarch64/hevcdsp_sao_neon.S | 103 +++++++++++--------------- > 1 file changed, 44 insertions(+), 59 deletions(-) > > diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S > index 263747149f..c2519da7f5 100644 > --- a/libavcodec/aarch64/hevcdsp_sao_neon.S > +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S > @@ -3,7 +3,7 @@ > * > * AArch64 NEON optimised SAO functions for HEVC decoding > * > - * Copyright (c) 2020 Josh Dekker <josh@itanimul.li> > + * Copyright (c) 2020-2021 J. Dekker <jdek@itanimul.li> > * > * This file is part of FFmpeg. > * > @@ -29,64 +29,49 @@ > // int16_t *sao_offset_val, int sao_left_class, > // int width, int height) > function ff_hevc_sao_band_filter_8x8_8_neon, export=1 > - sub sp, sp, #64 > - stp xzr, xzr, [sp] This one had the right indentation to start with, don't reindent it according to the new incorrectly indented code you're adding. Also if you're going to reformat this, could you align the left edge of the operand columns instead of aligning the commas, i.e. making it match the rest of the asm we have? I.e. like this: sp, sp, #64 xzr, xzr, [sp] // Martin
On 19 Oct 2021, at 10:40, Martin Storsjö wrote: > On Thu, 7 Oct 2021, J. Dekker wrote: > >> Signed-off-by: J. Dekker <jdek@itanimul.li> >> --- >> libavcodec/aarch64/hevcdsp_sao_neon.S | 103 +++++++++++--------------- >> 1 file changed, 44 insertions(+), 59 deletions(-) >> >> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S >> index 263747149f..c2519da7f5 100644 >> --- a/libavcodec/aarch64/hevcdsp_sao_neon.S >> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S >> @@ -3,7 +3,7 @@ >> * >> * AArch64 NEON optimised SAO functions for HEVC decoding >> * >> - * Copyright (c) 2020 Josh Dekker <josh@itanimul.li> >> + * Copyright (c) 2020-2021 J. Dekker <jdek@itanimul.li> >> * >> * This file is part of FFmpeg. >> * >> @@ -29,64 +29,49 @@ >> // int16_t *sao_offset_val, int sao_left_class, >> // int width, int height) >> function ff_hevc_sao_band_filter_8x8_8_neon, export=1 >> - sub sp, sp, #64 >> - stp xzr, xzr, [sp] > > This one had the right indentation to start with, don't reindent it according to the new incorrectly indented code you're adding. Yep. This was a mistake, some of (my) previously pushed code is actually incorrect here as well. Instructions should be indented to 9 Columns with the first argument being at 25 Columns. You mentioned in the past about shifting the first argument left by 1 character when it began with a curly brace but I don't see this used in any other files (except the one added by me). Should this still be done? Also do you want a patch to reformat previous code to fit this 9,25 cols alignment? > Also if you're going to reformat this, could you align the left edge of the operand columns instead of aligning the commas, i.e. making it match the rest of the asm we have? I.e. like this: > > sp, sp, #64 > xzr, xzr, [sp] Sure, I thought comma aligning here was strange.
On Tue, 26 Oct 2021, J. Dekker wrote: > On 19 Oct 2021, at 10:40, Martin Storsjö wrote: > >> This one had the right indentation to start with, don't reindent it >> according to the new incorrectly indented code you're adding. > > Yep. This was a mistake, some of (my) previously pushed code is actually > incorrect here as well. Instructions should be indented to 9 Columns > with the first argument being at 25 Columns. > > You mentioned in the past about shifting the first argument left by 1 > character when it began with a curly brace but I don't see this used in > any other files (except the one added by me). Should this still be done? I don't remember exactly what I've said, but as the rest of our source doesn't do that, I'd prefer if you'd avoid it too. > Also do you want a patch to reformat previous code to fit this 9,25 cols > alignment? Maybe yes, so there's less risk of confusion when writing new code. // Martin
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index 263747149f..c2519da7f5 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -3,7 +3,7 @@ * * AArch64 NEON optimised SAO functions for HEVC decoding * - * Copyright (c) 2020 Josh Dekker <josh@itanimul.li> + * Copyright (c) 2020-2021 J. Dekker <jdek@itanimul.li> * * This file is part of FFmpeg. * @@ -29,64 +29,49 @@ // int16_t *sao_offset_val, int sao_left_class, // int width, int height) function ff_hevc_sao_band_filter_8x8_8_neon, export=1 - sub sp, sp, #64 - stp xzr, xzr, [sp] - stp xzr, xzr, [sp, #16] - stp xzr, xzr, [sp, #32] - stp xzr, xzr, [sp, #48] - mov w8, #4 - sxtw x6, w6 -0: - ldrsh x9, [x4, x8, lsl #1] // x9 = sao_offset_val[k+1] - subs w8, w8, #1 - add w10, w8, w5 // x10 = k + sao_left_class - and w10, w10, #0x1F - strh w9, [sp, x10, lsl #1] - bne 0b - ld1 {v16.16b-v19.16b}, [sp], #64 - movi v20.8h, #1 - sub x2, x2, x6 // stride_dst - width - sub x3, x3, x6 // stride_src - width -1: // beginning of line - mov x8, x6 -2: - // Simple layout for accessing 16bit values - // with 8bit LUT. - // - // 00 01 02 03 04 05 06 07 - // +-----------------------------------> - // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|.... - // +-----------------------------------> - // i-0 i-1 i-2 i-3 - // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); - ld1 {v2.8b}, [x1], #8 - // load src[x] - uxtl v0.8h, v2.8b - // >> shift - ushr v2.8h, v0.8h, #3 // BIT_DEPTH - 3 - // x2 (access lower short) - shl v1.8h, v2.8h, #1 // low (x2, accessing short) - // +1 access upper short - add v3.8h, v1.8h, v20.8h - // shift insert index to upper byte - sli v1.8h, v3.8h, #8 - // table - tbx v2.16b, {v16.16b-v19.16b}, v1.16b - // src[x] + table - add v1.8h, v0.8h, v2.8h - // clip + narrow - sqxtun v4.8b, v1.8h - // store - st1 {v4.8b}, [x0], #8 - // done 8 pixels - subs w8, w8, #8 - bne 2b - // finished line - subs w7, w7, #1 - add x0, x0, x2 // dst += stride_dst - add x1, x1, x3 // src += stride_src - bne 1b - ret + sub sp, sp, #64 + stp xzr, xzr, [sp] + stp xzr, xzr, [sp, #16] + stp xzr, xzr, [sp, #32] + stp xzr, xzr, [sp, #48] + mov w8, #4 + sxtw x6, w6 +0: ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1] + subs w8, w8, #1 + add w10, w8, w5 // k + sao_left_class + and w10, w10, #0x1F + strh w9, [sp, x10, lsl #1] + bne 0b + ld1 {v16.16b-v19.16b}, [sp], #64 + movi v20.8h, #1 + sub x2, x2, x6 // stride_dst - width + sub x3, x3, x6 // stride_src - width +1: mov x8, x6 // beginning of line +2: // Simple layout for accessing 16bit values + // with 8bit LUT. + // + // 00 01 02 03 04 05 06 07 + // +-----------------------------------> + // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|.... + // +-----------------------------------> + // i-0 i-1 i-2 i-3 + ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); + uxtl v0.8h, v2.8b // load src[x] + ushr v2.8h, v0.8h, #3 // >> BIT_DEPTH - 3 + shl v1.8h, v2.8h, #1 // low (x2, accessing short) + add v3.8h, v1.8h, v20.8h // +1 access upper short + sli v1.8h, v3.8h, #8 // shift insert index to upper byte + tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table + add v1.8h, v0.8h, v2.8h // src[x] + table + sqxtun v4.8b, v1.8h // clip + narrow + st1 {v4.8b}, [x0], #8 // store + subs w8, w8, #8 // done 8 pixels + bne 2b + subs w7, w7, #1 // finished line, prep. new + add x0, x0, x2 // dst += stride_dst + add x1, x1, x3 // src += stride_src + bne 1b + ret endfunc // ASSUMES STRIDE_SRC = 192
Signed-off-by: J. Dekker <jdek@itanimul.li> --- libavcodec/aarch64/hevcdsp_sao_neon.S | 103 +++++++++++--------------- 1 file changed, 44 insertions(+), 59 deletions(-)