Message ID | 20230629175729.224383-12-jc@kynesim.co.uk |
---|---|
State | New |
Headers | show |
Series | avfilter/vf_bwdif: Add aarch64 neon functions | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Thu, 29 Jun 2023, John Cox wrote: > Signed-off-by: John Cox <jc@kynesim.co.uk> > --- > libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++ > libavfilter/aarch64/vf_bwdif_neon.S | 215 ++++++++++++++++++++ > 2 files changed, 236 insertions(+) > > diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c > index e75cf2f204..21e67884ab 100644 > --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c > +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c > @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, > void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, > int prefs3, int mrefs3, int parity, int clip_max); > > +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, > + int w, int prefs, int mrefs, int prefs2, int mrefs2, > + int prefs3, int mrefs3, int prefs4, int mrefs4, > + int parity, int clip_max); > + > + > +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, > + int w, int prefs, int mrefs, int prefs2, int mrefs2, > + int prefs3, int mrefs3, int prefs4, int mrefs4, > + int parity, int clip_max) > +{ > + const int w0 = clip_max != 255 ? 
0 : w & ~15; > + > + ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, > + w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); > + > + if (w0 < w) > + ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, > + w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); > +} > > static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, > int w, int prefs, int mrefs, int prefs2, int mrefs2, > @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) > return; > > s->filter_intra = filter_intra_helper; > + s->filter_line = filter_line_helper; > s->filter_edge = filter_edge_helper; > } > > diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S > index a33b235882..675e97d966 100644 > --- a/libavfilter/aarch64/vf_bwdif_neon.S > +++ b/libavfilter/aarch64/vf_bwdif_neon.S > @@ -128,6 +128,221 @@ coeffs: > .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] > .hword 5077, 981 // sp[0] = v0.h[6] > > +// =========================================================================== > +// > +// void filter_line( > +// void *dst1, // x0 > +// void *prev1, // x1 > +// void *cur1, // x2 > +// void *next1, // x3 > +// int w, // w4 > +// int prefs, // w5 > +// int mrefs, // w6 > +// int prefs2, // w7 > +// int mrefs2, // [sp, #0] > +// int prefs3, // [sp, #8] > +// int mrefs3, // [sp, #16] > +// int prefs4, // [sp, #24] > +// int mrefs4, // [sp, #32] > +// int parity, // [sp, #40] > +// int clip_max) // [sp, #48] > + > +function ff_bwdif_filter_line_neon, export=1 > + // Sanity check w > + cmp w4, #0 > + ble 99f > + > + // Rearrange regs to be the same as line3 for ease of debug! 
> + mov w10, w4 // w10 = loop count > + mov w9, w6 // w9 = mref > + mov w12, w7 // w12 = pref2 > + mov w11, w5 // w11 = pref > + ldr w8, [sp, #0] // w8 = mref2 > + ldr w7, [sp, #16] // w7 = mref3 > + ldr w6, [sp, #32] // w6 = mref4 > + ldr w13, [sp, #8] // w13 = pref3 > + ldr w14, [sp, #24] // w14 = pref4 Btw, remember that you can load two arguments from the stack at once with ldp, e.g. "ldp x8, x13, [sp, #0]". If they're made intptr_t/ptrdiff_t, you won't have an issue with garbage in the upper 32 bits either. > + > + mov x4, x3 > + mov x3, x2 > + mov x2, x1 > + > +// #define prev2 cur > +// const uint8_t * restrict next2 = parity ? prev : next; > + ldr w17, [sp, #40] // parity > + cmp w17, #0 > + csel x17, x2, x4, ne > + > + // We want all the V registers - save all the ones we must > + stp d14, d15, [sp, #-64]! > + stp d8, d9, [sp, #48] > + stp d10, d11, [sp, #32] > + stp d12, d13, [sp, #16] The order looks a bit weird here even if they end up sequential on the stack. If you'd fill it from the bottom up, e.g. stp d8, d9, [sp, #-64]! stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] they're sequential both in code and on the stack. // Martin
On Sun, 2 Jul 2023 00:44:10 +0300 (EEST), you wrote: >On Thu, 29 Jun 2023, John Cox wrote: > >> Signed-off-by: John Cox <jc@kynesim.co.uk> >> --- >> libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++ >> libavfilter/aarch64/vf_bwdif_neon.S | 215 ++++++++++++++++++++ >> 2 files changed, 236 insertions(+) >> >> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c >> index e75cf2f204..21e67884ab 100644 >> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c >> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c >> @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, >> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, >> int prefs3, int mrefs3, int parity, int clip_max); >> >> +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, >> + int w, int prefs, int mrefs, int prefs2, int mrefs2, >> + int prefs3, int mrefs3, int prefs4, int mrefs4, >> + int parity, int clip_max); >> + >> + >> +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, >> + int w, int prefs, int mrefs, int prefs2, int mrefs2, >> + int prefs3, int mrefs3, int prefs4, int mrefs4, >> + int parity, int clip_max) >> +{ >> + const int w0 = clip_max != 255 ? 
0 : w & ~15; >> + >> + ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, >> + w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); >> + >> + if (w0 < w) >> + ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, >> + w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); >> +} >> >> static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, >> int w, int prefs, int mrefs, int prefs2, int mrefs2, >> @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) >> return; >> >> s->filter_intra = filter_intra_helper; >> + s->filter_line = filter_line_helper; >> s->filter_edge = filter_edge_helper; >> } >> >> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S >> index a33b235882..675e97d966 100644 >> --- a/libavfilter/aarch64/vf_bwdif_neon.S >> +++ b/libavfilter/aarch64/vf_bwdif_neon.S >> @@ -128,6 +128,221 @@ coeffs: >> .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] >> .hword 5077, 981 // sp[0] = v0.h[6] >> >> +// =========================================================================== >> +// >> +// void filter_line( >> +// void *dst1, // x0 >> +// void *prev1, // x1 >> +// void *cur1, // x2 >> +// void *next1, // x3 >> +// int w, // w4 >> +// int prefs, // w5 >> +// int mrefs, // w6 >> +// int prefs2, // w7 >> +// int mrefs2, // [sp, #0] >> +// int prefs3, // [sp, #8] >> +// int mrefs3, // [sp, #16] >> +// int prefs4, // [sp, #24] >> +// int mrefs4, // [sp, #32] >> +// int parity, // [sp, #40] >> +// int clip_max) // [sp, #48] >> + >> +function ff_bwdif_filter_line_neon, export=1 >> + // Sanity check w >> + cmp w4, #0 >> + ble 99f >> + >> + // Rearrange regs to be the same as line3 for ease of debug! 
>> + mov w10, w4 // w10 = loop count >> + mov w9, w6 // w9 = mref >> + mov w12, w7 // w12 = pref2 >> + mov w11, w5 // w11 = pref >> + ldr w8, [sp, #0] // w8 = mref2 >> + ldr w7, [sp, #16] // w7 = mref3 >> + ldr w6, [sp, #32] // w6 = mref4 >> + ldr w13, [sp, #8] // w13 = pref3 >> + ldr w14, [sp, #24] // w14 = pref4 > >Btw, remember that you can load two arguments from the stack at once with >ldp, e.g. "ldp x8, x13, [sp, #0]". If they're made intptr_t/ptrdiff_t, you >won't have an issue with garbage in the upper 32 bits either. Fair point - I was indeed worrying about garbage in the upper half (and this is not performance or size critical code). >> + >> + mov x4, x3 >> + mov x3, x2 >> + mov x2, x1 >> + >> +// #define prev2 cur >> +// const uint8_t * restrict next2 = parity ? prev : next; >> + ldr w17, [sp, #40] // parity >> + cmp w17, #0 >> + csel x17, x2, x4, ne >> + >> + // We want all the V registers - save all the ones we must >> + stp d14, d15, [sp, #-64]! >> + stp d8, d9, [sp, #48] >> + stp d10, d11, [sp, #32] >> + stp d12, d13, [sp, #16] > >The order looks a bit weird here even if they end up sequential on the >stack. If you'd fill it from the bottom up, e.g. > >stp d8, d9, [sp, #-64]! >stp d10, d11, [sp, #16] >stp d12, d13, [sp, #32] >stp d14, d15, [sp, #48] > >they're sequential both in code and on the stack. Sure I can tweak that. JC >// Martin
On Sun, 2 Jul 2023, John Cox wrote: > On Sun, 2 Jul 2023 00:44:10 +0300 (EEST), you wrote: > >> On Thu, 29 Jun 2023, John Cox wrote: >> >>> Signed-off-by: John Cox <jc@kynesim.co.uk> >>> --- >>> libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++ >>> libavfilter/aarch64/vf_bwdif_neon.S | 215 ++++++++++++++++++++ >>> 2 files changed, 236 insertions(+) >>> >>> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c >>> index e75cf2f204..21e67884ab 100644 >>> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c >>> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c >>> @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, >>> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, >>> int prefs3, int mrefs3, int parity, int clip_max); >>> >>> +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, >>> + int w, int prefs, int mrefs, int prefs2, int mrefs2, >>> + int prefs3, int mrefs3, int prefs4, int mrefs4, >>> + int parity, int clip_max); >>> + >>> + >>> +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, >>> + int w, int prefs, int mrefs, int prefs2, int mrefs2, >>> + int prefs3, int mrefs3, int prefs4, int mrefs4, >>> + int parity, int clip_max) >>> +{ >>> + const int w0 = clip_max != 255 ? 
0 : w & ~15; >>> + >>> + ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, >>> + w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); >>> + >>> + if (w0 < w) >>> + ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, >>> + w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); >>> +} >>> >>> static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, >>> int w, int prefs, int mrefs, int prefs2, int mrefs2, >>> @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) >>> return; >>> >>> s->filter_intra = filter_intra_helper; >>> + s->filter_line = filter_line_helper; >>> s->filter_edge = filter_edge_helper; >>> } >>> >>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S >>> index a33b235882..675e97d966 100644 >>> --- a/libavfilter/aarch64/vf_bwdif_neon.S >>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S >>> @@ -128,6 +128,221 @@ coeffs: >>> .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] >>> .hword 5077, 981 // sp[0] = v0.h[6] >>> >>> +// =========================================================================== >>> +// >>> +// void filter_line( >>> +// void *dst1, // x0 >>> +// void *prev1, // x1 >>> +// void *cur1, // x2 >>> +// void *next1, // x3 >>> +// int w, // w4 >>> +// int prefs, // w5 >>> +// int mrefs, // w6 >>> +// int prefs2, // w7 >>> +// int mrefs2, // [sp, #0] >>> +// int prefs3, // [sp, #8] >>> +// int mrefs3, // [sp, #16] >>> +// int prefs4, // [sp, #24] >>> +// int mrefs4, // [sp, #32] >>> +// int parity, // [sp, #40] >>> +// int clip_max) // [sp, #48] >>> + >>> +function ff_bwdif_filter_line_neon, export=1 >>> + // Sanity check w >>> + cmp w4, #0 >>> + ble 99f >>> + >>> + // Rearrange regs to be the same as line3 for ease of debug! 
>>> + mov w10, w4 // w10 = loop count >>> + mov w9, w6 // w9 = mref >>> + mov w12, w7 // w12 = pref2 >>> + mov w11, w5 // w11 = pref >>> + ldr w8, [sp, #0] // w8 = mref2 >>> + ldr w7, [sp, #16] // w7 = mref3 >>> + ldr w6, [sp, #32] // w6 = mref4 >>> + ldr w13, [sp, #8] // w13 = pref3 >>> + ldr w14, [sp, #24] // w14 = pref4 >> >> Btw, remember that you can load two arguments from the stack at once with >> ldp, e.g. "ldp x8, x13, [sp, #0]". If they're made intptr_t/ptrdiff_t, you >> won't have an issue with garbage in the upper 32 bits either. > > Fair point - I was indeed worrying about garbage in the upper half (and > this is not performance or size critical code). Well, as long as you actually refer to the register as w8 rather than x8, it shouldn't matter. Checkasm deliberately tries to make sure that you actually do get garbage in such areas, so if the code passes checkasm, it should be fine. // Martin
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c index e75cf2f204..21e67884ab 100644 --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1, void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, int prefs3, int mrefs3, int parity, int clip_max); +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max); + + +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1, + int w, int prefs, int mrefs, int prefs2, int mrefs2, + int prefs3, int mrefs3, int prefs4, int mrefs4, + int parity, int clip_max) +{ + const int w0 = clip_max != 255 ? 0 : w & ~15; + + ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1, + w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); + + if (w0 < w) + ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0, + w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max); +} static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, int w, int prefs, int mrefs, int prefs2, int mrefs2, @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth) return; s->filter_intra = filter_intra_helper; + s->filter_line = filter_line_helper; s->filter_edge = filter_edge_helper; } diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S index a33b235882..675e97d966 100644 --- a/libavfilter/aarch64/vf_bwdif_neon.S +++ b/libavfilter/aarch64/vf_bwdif_neon.S @@ -128,6 +128,221 @@ coeffs: .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5] .hword 5077, 981 // sp[0] = v0.h[6] +// 
=========================================================================== +// +// void filter_line( +// void *dst1, // x0 +// void *prev1, // x1 +// void *cur1, // x2 +// void *next1, // x3 +// int w, // w4 +// int prefs, // w5 +// int mrefs, // w6 +// int prefs2, // w7 +// int mrefs2, // [sp, #0] +// int prefs3, // [sp, #8] +// int mrefs3, // [sp, #16] +// int prefs4, // [sp, #24] +// int mrefs4, // [sp, #32] +// int parity, // [sp, #40] +// int clip_max) // [sp, #48] + +function ff_bwdif_filter_line_neon, export=1 + // Sanity check w + cmp w4, #0 + ble 99f + + // Rearrange regs to be the same as line3 for ease of debug! + mov w10, w4 // w10 = loop count + mov w9, w6 // w9 = mref + mov w12, w7 // w12 = pref2 + mov w11, w5 // w11 = pref + ldr w8, [sp, #0] // w8 = mref2 + ldr w7, [sp, #16] // w7 = mref3 + ldr w6, [sp, #32] // w6 = mref4 + ldr w13, [sp, #8] // w13 = pref3 + ldr w14, [sp, #24] // w14 = pref4 + + mov x4, x3 + mov x3, x2 + mov x2, x1 + +// #define prev2 cur +// const uint8_t * restrict next2 = parity ? prev : next; + ldr w17, [sp, #40] // parity + cmp w17, #0 + csel x17, x2, x4, ne + + // We want all the V registers - save all the ones we must + stp d14, d15, [sp, #-64]! 
+ stp d8, d9, [sp, #48] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #16] + + ldr q0, coeffs + +// for (x = 0; x < w; x++) { +// int diff0, diff2; +// int d0, d2; +// int temporal_diff0, temporal_diff2; +// +// int i1, i2; +// int j1, j2; +// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4; + +10: +// c0 = prev2[0] + next2[0]; // c0 = v20, v21 +// d0 = c0 >> 1; // d0 = v10 +// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11 + ldr q31, [x3] + ldr q21, [x17] + uhadd v10.16b, v31.16b, v21.16b + uabd v11.16b, v31.16b, v21.16b + uaddl v20.8h, v21.8b, v31.8b + uaddl2 v21.8h, v21.16b, v31.16b + + ldr q31, [x3, w6, SXTW] + ldr q23, [x17, w6, SXTW] + +// i1 = coef_hf[0] * c0; // i1 = v2-v5 + UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2] + + ldr q30, [x3, w14, SXTW] + ldr q25, [x17, w14, SXTW] + +// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23 + uaddl v22.8h, v23.8b, v31.8b + uaddl2 v23.8h, v23.16b, v31.16b + +// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12 + uhadd v12.16b, v25.16b, v30.16b + uaddl v24.8h, v25.8b, v30.8b + uaddl2 v25.8h, v25.16b, v30.16b + +// m3 = cur[mrefs3]; // m3 = v20 + ldr q20, [x3, w7, SXTW] + +// p3 = cur[prefs3]; // p3 = v21 + ldr q21, [x3, w13, SXTW] + +// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25) + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4] + + ldr q29, [x3, w8, SXTW] + ldr q23, [x17, w8, SXTW] + +// i1 -= coef_lf[1] * 4 * (m3 + p3); // - + uaddl v30.8h, v20.8b, v21.8b + uaddl2 v31.8h, v20.16b, v21.16b + + UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1] + +// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13 + uhadd v13.16b, v23.16b, v29.16b + uaddl v22.8h, v23.8b, v29.8b + uaddl2 v23.8h, v23.16b, v29.16b + + ldr q31, [x3, w12, SXTW] + ldr q27, [x17, w12, SXTW] + +// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25) + add v24.8h, v24.8h, v22.8h + add v25.8h, v25.8h, v23.8h + UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4] + +// m1 
= cur[mrefs]; // m1 = v24 + ldr q24, [x3, w9, SXTW] + +// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27 +// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14 +// d2 = p2 >> 1; // d2 = v15 + uabd v14.16b, v31.16b, v27.16b + uhadd v15.16b, v31.16b, v27.16b + uaddl v26.8h, v27.8b, v31.8b + uaddl2 v27.8h, v27.16b, v31.16b + +// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*) + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3] + +// p1 = cur[prefs]; // p1 = v22 + ldr q22, [x3, w11, SXTW] + +// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17 + uaddl v18.8h, v22.8b, v24.8b + uaddl2 v19.8h, v22.16b, v24.16b + UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6] + + uaddl v18.8h, v20.8b, v21.8b + uaddl2 v19.8h, v20.16b, v21.16b + UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7] + + SQSHRUNN v17, v28, v29, v30, v31, 13 + +// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24 + uaddl v26.8h, v24.8b, v22.8b + uaddl2 v27.8h, v24.16b, v22.16b + UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0] + + ldr q31, [x2, w9, SXTW] + ldr q29, [x4, w9, SXTW] + + ldr q30, [x2, w11, SXTW] + ldr q28, [x4, w11, SXTW] + +// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5* + SQSHRUNN v2, v2, v3, v4, v5, 15 + +// { +// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1; +// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1; + uabd v30.16b, v22.16b, v30.16b + uabd v31.16b, v24.16b, v31.16b + uabd v28.16b, v22.16b, v28.16b + uabd v29.16b, v24.16b, v29.16b + uhadd v31.16b, v31.16b, v30.16b + uhadd v29.16b, v29.16b, v28.16b + +// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18 + ushr v18.16b, v11.16b, #1 + umax v18.16b, v18.16b, v31.16b + umax v18.16b, v18.16b, v29.16b + + // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15 + SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28 + + // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, 
td2 = v11, diff2 = v18 + INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29 + +// dst[0] = av_clip_uint8(interpol); + str q2, [x0], #16 +// } +// +// dst++; +// cur++; +// prev++; +// prev2++; +// next++; +// } + + subs w10, w10, #16 + add x2, x2, #16 + add x3, x3, #16 + add x4, x4, #16 + add x17, x17, #16 + bgt 10b + + ldp d12, d13, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d14, d15, [sp], #64 +99: + ret +endfunc + // ============================================================================ // // void ff_bwdif_filter_edge_neon(
Signed-off-by: John Cox <jc@kynesim.co.uk> --- libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++ libavfilter/aarch64/vf_bwdif_neon.S | 215 ++++++++++++++++++++ 2 files changed, 236 insertions(+)