Message ID | 20230629175729.224383-9-jc@kynesim.co.uk |
---|---|
State | New |
Headers | show |
Series | avfilter/vf_bwdif: Add aarch64 neon functions | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Thu, 29 Jun 2023, John Cox wrote: > Signed-off-by: John Cox <jc@kynesim.co.uk> > --- > libavfilter/aarch64/vf_bwdif_init_aarch64.c | 20 ++++ > libavfilter/aarch64/vf_bwdif_neon.S | 104 ++++++++++++++++++++ > 2 files changed, 124 insertions(+) > > diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c > index 3ffaa07ab3..e75cf2f204 100644 > --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c > +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c > @@ -24,10 +24,29 @@ > #include "libavfilter/bwdif.h" > #include "libavutil/aarch64/cpu.h" > > +void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, > + int w, int prefs, int mrefs, int prefs2, int mrefs2, > + int parity, int clip_max, int spat); > + > void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, > int prefs3, int mrefs3, int parity, int clip_max); > > > +static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, > + int w, int prefs, int mrefs, int prefs2, int mrefs2, > + int parity, int clip_max, int spat) > +{ > + const int w0 = clip_max != 255 ? 0 : w & ~15; > + > + ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, > + parity, clip_max, spat); > + > + if (w0 < w) > + ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, > + w - w0, prefs, mrefs, prefs2, mrefs2, > + parity, clip_max, spat); > +} > + > static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, > int prefs3, int mrefs3, int parity, int clip_max) > { > @@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) > return; > > s->filter_intra = filter_intra_helper; > + s->filter_edge = filter_edge_helper; > } > > diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S > index 6c5d1598f4..a33b235882 100644 > --- a/libavfilter/aarch64/vf_bwdif_neon.S > +++ b/libavfilter/aarch64/vf_bwdif_neon.S > @@ -128,6 +128,110 @@ coeffs: > .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] > .hword 5077, 981 // sp[0] = v0.h[6] > > +// ============================================================================ > +// > +// void ff_bwdif_filter_edge_neon( > +// void *dst1, // x0 > +// void *prev1, // x1 > +// void *cur1, // x2 > +// void *next1, // x3 > +// int w, // w4 > +// int prefs, // w5 > +// int mrefs, // w6 > +// int prefs2, // w7 > +// int mrefs2, // [sp, #0] > +// int parity, // [sp, #8] > +// int clip_max, // [sp, #16] unused > +// int spat); // [sp, #24] This doesn't hold for macOS targets (and the checkasm tests fail on that platform). On macOS, arguments that aren't passed in registers but on the stack, are tightly packed. So since parity is 32 bit and mrefs2 also was 32 bit, parity is available at [sp, #4]. Therefore, it's usually simplest for portability reasons, to pass any arguments after the first 8, as intptr_t or ptrdiff_t, as that makes them consistent across platforms. // Martin
On Sun, 2 Jul 2023 00:40:09 +0300 (EEST), you wrote: >On Thu, 29 Jun 2023, John Cox wrote: > >> Signed-off-by: John Cox <jc@kynesim.co.uk> >> --- >> libavfilter/aarch64/vf_bwdif_init_aarch64.c | 20 ++++ >> libavfilter/aarch64/vf_bwdif_neon.S | 104 ++++++++++++++++++++ >> 2 files changed, 124 insertions(+) >> >> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c >> index 3ffaa07ab3..e75cf2f204 100644 >> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c >> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c >> @@ -24,10 +24,29 @@ >> #include "libavfilter/bwdif.h" >> #include "libavutil/aarch64/cpu.h" >> >> +void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, >> + int w, int prefs, int mrefs, int prefs2, int mrefs2, >> + int parity, int clip_max, int spat); >> + >> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, >> int prefs3, int mrefs3, int parity, int clip_max); >> >> >> +static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, >> + int w, int prefs, int mrefs, int prefs2, int mrefs2, >> + int parity, int clip_max, int spat) >> +{ >> + const int w0 = clip_max != 255 ? 0 : w & ~15; >> + >> + ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, >> + parity, clip_max, spat); >> + >> + if (w0 < w) >> + ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, >> + w - w0, prefs, mrefs, prefs2, mrefs2, >> + parity, clip_max, spat); >> +} >> + >> static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, >> int prefs3, int mrefs3, int parity, int clip_max) >> { >> @@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) >> return; >> >> s->filter_intra = filter_intra_helper; >> + s->filter_edge = filter_edge_helper; >> } >> >> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S >> index 6c5d1598f4..a33b235882 100644 >> --- a/libavfilter/aarch64/vf_bwdif_neon.S >> +++ b/libavfilter/aarch64/vf_bwdif_neon.S >> @@ -128,6 +128,110 @@ coeffs: >> .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] >> .hword 5077, 981 // sp[0] = v0.h[6] >> >> +// ============================================================================ >> +// >> +// void ff_bwdif_filter_edge_neon( >> +// void *dst1, // x0 >> +// void *prev1, // x1 >> +// void *cur1, // x2 >> +// void *next1, // x3 >> +// int w, // w4 >> +// int prefs, // w5 >> +// int mrefs, // w6 >> +// int prefs2, // w7 >> +// int mrefs2, // [sp, #0] >> +// int parity, // [sp, #8] >> +// int clip_max, // [sp, #16] unused >> +// int spat); // [sp, #24] > >This doesn't hold for macOS targets (and the checkasm tests fail on that >platform). > >On macOS, arguments that aren't passed in registers but on the stack, are >tightly packed. So since parity is 32 bit and mrefs2 also was 32 bit, >parity is available at [sp, #4]. > >Therefore, it's usually simplest for portability reasons, to pass any >arguments after the first 8, as intptr_t or ptrdiff_t, as that makes them >consistent across platforms. Not my interface - this is already existing code. What do you suggest I do? I'm happy either to change the interface or fix my stack offsets if there is any clue that lets me detect this ABI. As personal preference I'd choose the latter. I don't have easy access to a mac. Is there any easy way of getting this tested before resubmission? Thanks JC >// Martin
On Sun, 2 Jul 2023, John Cox wrote: > On Sun, 2 Jul 2023 00:40:09 +0300 (EEST), you wrote: > >> On Thu, 29 Jun 2023, John Cox wrote: >> >>> Signed-off-by: John Cox <jc@kynesim.co.uk> >>> --- >>> libavfilter/aarch64/vf_bwdif_init_aarch64.c | 20 ++++ >>> libavfilter/aarch64/vf_bwdif_neon.S | 104 ++++++++++++++++++++ >>> 2 files changed, 124 insertions(+) >>> >>> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c >>> index 3ffaa07ab3..e75cf2f204 100644 >>> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c >>> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c >>> @@ -24,10 +24,29 @@ >>> #include "libavfilter/bwdif.h" >>> #include "libavutil/aarch64/cpu.h" >>> >>> +void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, >>> + int w, int prefs, int mrefs, int prefs2, int mrefs2, >>> + int parity, int clip_max, int spat); >>> + >>> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, >>> int prefs3, int mrefs3, int parity, int clip_max); >>> >>> >>> +static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, >>> + int w, int prefs, int mrefs, int prefs2, int mrefs2, >>> + int parity, int clip_max, int spat) >>> +{ >>> + const int w0 = clip_max != 255 ? 0 : w & ~15; >>> + >>> + ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, >>> + parity, clip_max, spat); >>> + >>> + if (w0 < w) >>> + ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, >>> + w - w0, prefs, mrefs, prefs2, mrefs2, >>> + parity, clip_max, spat); >>> +} >>> + >>> static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, >>> int prefs3, int mrefs3, int parity, int clip_max) >>> { >>> @@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) >>> return; >>> >>> s->filter_intra = filter_intra_helper; >>> + s->filter_edge = filter_edge_helper; >>> } >>> >>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S >>> index 6c5d1598f4..a33b235882 100644 >>> --- a/libavfilter/aarch64/vf_bwdif_neon.S >>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S >>> @@ -128,6 +128,110 @@ coeffs: >>> .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] >>> .hword 5077, 981 // sp[0] = v0.h[6] >>> >>> +// ============================================================================ >>> +// >>> +// void ff_bwdif_filter_edge_neon( >>> +// void *dst1, // x0 >>> +// void *prev1, // x1 >>> +// void *cur1, // x2 >>> +// void *next1, // x3 >>> +// int w, // w4 >>> +// int prefs, // w5 >>> +// int mrefs, // w6 >>> +// int prefs2, // w7 >>> +// int mrefs2, // [sp, #0] >>> +// int parity, // [sp, #8] >>> +// int clip_max, // [sp, #16] unused >>> +// int spat); // [sp, #24] >> >> This doesn't hold for macOS targets (and the checkasm tests fail on that >> platform). >> >> On macOS, arguments that aren't passed in registers but on the stack, are >> tightly packed. So since parity is 32 bit and mrefs2 also was 32 bit, >> parity is available at [sp, #4]. >> >> Therefore, it's usually simplest for portability reasons, to pass any >> arguments after the first 8, as intptr_t or ptrdiff_t, as that makes them >> consistent across platforms. > > Not my interface - this is already existing code. What do you suggest I > do? > > I'm happy either to change the interface or fix my stack offsets if > there is any clue that lets me detect this ABI. As personal preference > I'd choose the latter. You can litter the code with "#ifdef __APPLE__", but in general I'd prefer to not go down that route as it makes the code messier to maintain, and explicitly requires you to test the code on both system combinations whenever touching it, compared to just having to test it once if there's a shared codepath. We often do change function interfaces to make writing the assembly simpler; we often change parameters like "int stride" into "ptrdiff_t stride", to avoid needing explicit sign extension instructions on some architectures. Changing interfaces for this reason is less common though (we often don't have many functions that take that many parameters), but it shouldn't require a lot of changes for other architectures with existing assembly. // Martin
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c index 3ffaa07ab3..e75cf2f204 100644 --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c @@ -24,10 +24,29 @@ #include "libavfilter/bwdif.h" #include "libavutil/aarch64/cpu.h" +void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat); + void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max); +static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int parity, int clip_max, int spat) +{ + const int w0 = clip_max != 255 ? 0 : w & ~15; + + ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, w0, prefs, mrefs, prefs2, mrefs2, + parity, clip_max, spat); + + if (w0 < w) + ff_bwdif_filter_edge_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, + w - w0, prefs, mrefs, prefs2, mrefs2, + parity, clip_max, spat); +} + static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max) { @@ -52,5 +71,6 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) return; s->filter_intra = filter_intra_helper; + s->filter_edge = filter_edge_helper; } diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S index 6c5d1598f4..a33b235882 100644 --- a/libavfilter/aarch64/vf_bwdif_neon.S +++ b/libavfilter/aarch64/vf_bwdif_neon.S @@ -128,6 +128,110 @@ coeffs: .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] .hword 5077, 981 // sp[0] = v0.h[6] +// ============================================================================ +// +// void ff_bwdif_filter_edge_neon( +// void *dst1, // x0 +// void *prev1, // x1 +// void *cur1, // x2 +// void *next1, // x3 +// int w, // w4 +// int prefs, // w5 +// int mrefs, // w6 +// int prefs2, // w7 +// int mrefs2, // [sp, #0] +// int parity, // [sp, #8] +// int clip_max, // [sp, #16] unused +// int spat); // [sp, #24] + +function ff_bwdif_filter_edge_neon, export=1 + // Sanity check w + cmp w4, #0 + ble 99f + +// #define prev2 cur +// const uint8_t * restrict next2 = parity ? prev : next; + + ldr w8, [sp, #0] // mrefs2 + + ldr w17, [sp, #8] // parity + ldr w16, [sp, #24] // spat + cmp w17, #0 + csel x17, x1, x3, ne + +// for (x = 0; x < w; x++) { + +10: +// int m1 = cur[mrefs]; +// int d = (prev2[0] + next2[0]) >> 1; +// int p1 = cur[prefs]; +// int temporal_diff0 = FFABS(prev2[0] - next2[0]); +// int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; +// int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; +// int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); + ldr q31, [x2] + ldr q21, [x17] + uhadd v16.16b, v31.16b, v21.16b // d0 = v16 + uabd v17.16b, v31.16b, v21.16b // td0 = v17 + ldr q24, [x2, w6, SXTW] // m1 = v24 + ldr q22, [x2, w5, SXTW] // p1 = v22 + + ldr q0, [x1, w6, SXTW] // prev[mrefs] + ldr q2, [x1, w5, SXTW] // prev[prefs] + ldr q1, [x3, w6, SXTW] // next[mrefs] + ldr q3, [x3, w5, SXTW] // next[prefs] + + ushr v29.16b, v17.16b, #1 + + uabd v31.16b, v0.16b, v24.16b + uabd v30.16b, v2.16b, v22.16b + uhadd v0.16b, v31.16b, v30.16b // td1 = q0 + + uabd v31.16b, v1.16b, v24.16b + uabd v30.16b, v3.16b, v22.16b + uhadd v1.16b, v31.16b, v30.16b // td2 = q1 + + umax v0.16b, v0.16b, v29.16b + umax v0.16b, v0.16b, v1.16b // diff = v0 + +// if (spat) { +// SPAT_CHECK() +// } +// i0 = (m1 + p1) >> 1; + cbz w16, 1f + + ldr q31, [x2, w8, SXTW] + ldr q18, [x17, w8, SXTW] + ldr q30, [x2, w7, SXTW] + ldr q19, [x17, w7, SXTW] + uhadd v18.16b, v18.16b, v31.16b + uhadd v19.16b, v19.16b, v30.16b + + SPAT_CHECK v0, v18, v24, v16, v22, v19, v31, v30, v29, v28 + +1: + uhadd v2.16b, v22.16b, v24.16b + + // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30 + DIFF_CLIP v2, v2, v16, v0, v31, v30 + +// dst[0] = av_clip(interpol, 0, clip_max); + str q2, [x0], #16 + +// dst++; +// cur++; +// } + subs w4, w4, #16 + add x1, x1, #16 + add x2, x2, #16 + add x3, x3, #16 + add x17, x17, #16 + bgt 10b + +99: + ret +endfunc + // ============================================================================ // // void ff_bwdif_filter_intra_neon(
Signed-off-by: John Cox <jc@kynesim.co.uk> --- libavfilter/aarch64/vf_bwdif_init_aarch64.c | 20 ++++ libavfilter/aarch64/vf_bwdif_neon.S | 104 ++++++++++++++++++++ 2 files changed, 124 insertions(+)