Message ID | 20220816122016.64929-5-hum@semihalf.com
---|---
State | New
Series | Provide neon implementation for me_cmp functions
Context | Check | Description
---|---|---
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Tue, 16 Aug 2022, Hubert Mazur wrote:

> Provide optimized implementation of sse8 function for arm64.
>
> Performance comparison tests are shown below.
> - sse_1_c: 130.7
> - sse_1_neon: 29.7
>
> Benchmarks and tests run with checkasm tool on AWS Graviton 3.
>
> Signed-off-by: Hubert Mazur <hum@semihalf.com>
> ---
>  libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
>  libavcodec/aarch64/me_cmp_neon.S         | 66 ++++++++++++++++++++++++
>  2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
> index 1c36d3d7cb..2f51f0497e 100644
> --- a/libavcodec/aarch64/me_cmp_init_aarch64.c
> +++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
> @@ -34,9 +34,12 @@ int ff_pix_abs16_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *
>
>  int sse16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
>                 ptrdiff_t stride, int h);
> +int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
> +              ptrdiff_t stride, int h);
>  int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
>                ptrdiff_t stride, int h);

Same as the others about function declaration indentation

> diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
> index 0ec9c0465b..3f4266d4d5 100644
> --- a/libavcodec/aarch64/me_cmp_neon.S
> +++ b/libavcodec/aarch64/me_cmp_neon.S
> @@ -347,6 +347,72 @@ function sse16_neon, export=1
>          ret
>  endfunc
>
> +function sse8_neon, export=1
> +        // x0 - unused
> +        // x1 - pix1
> +        // x2 - pix2
> +        // x3 - stride
> +        // w4 - h
> +
> +        movi            d18, #0

Same as the others about d18

> +        movi            v21.4s, #0
> +        movi            v20.4s, #0
> +        cmp             w4, #4
> +        b.le            2f
> +
> +// make 4 iterations at once
> +1:
> +
> +        // res = abs(pix1[0] - pix2[0])
> +        // res * res
> +
> +        ld1             {v0.8b}, [x1], x3       // Load pix1 for first iteration
> +        ld1             {v1.8b}, [x2], x3       // Load pix2 for second iteration
> +        ld1             {v2.8b}, [x1], x3       // Load pix1 for second iteration
> +        ld1             {v3.8b}, [x2], x3       // Load pix2 for second iteration
> +        uabdl           v30.8h, v0.8b, v1.8b    // Absolute difference, first iteration
> +        ld1             {v4.8b}, [x1], x3       // Load pix1 for third iteration
> +        ld1             {v5.8b}, [x2], x3       // Load pix2 for third iteration
> +        uabdl           v29.8h, v2.8b, v3.8b    // Absolute difference, second iteration
> +        umlal           v21.4s, v30.4h, v30.4h  // Multiply lower half, first iteration
> +        ld1             {v6.8b}, [x1], x3       // Load pix1 for fourth iteration
> +        ld1             {v7.8b}, [x2], x3       // Load pix2 for fourth iteration
> +        uabdl           v28.8h, v4.8b, v5.8b    // Absolute difference, third iteration
> +        umlal           v21.4s, v29.4h, v29.4h  // Multiply lower half, second iteration
> +        umlal2          v20.4s, v30.8h, v30.8h  // Multiply upper half, second iteration

The comment was wrong here, this is about the first iteration, not the
second one.

> +        uabdl           v27.8h, v6.8b, v7.8b    // Absolute difference, fourth iteration
> +        umlal           v21.4s, v28.4h, v28.4h  // Multiply lower half, third iteration
> +        umlal2          v20.4s, v29.8h, v29.8h  // Multiply upper half, second iteration
> +        sub             w4, w4, #4              // h -= 4
> +        umlal2          v20.4s, v28.8h, v28.8h  // Multiply upper half, third iteration
> +        umlal           v21.4s, v27.4h, v27.4h  // Multiply lower half, fourth iteration
> +        cmp             w4, #4
> +        umlal2          v20.4s, v27.8h, v27.8h  // Multiply upper half, fourth iteration
> +        b.ge            1b
> +
> +        cbz             w4, 3f
> +
> +// iterate by one
> +2:
> +        ld1             {v0.8b}, [x1], x3       // Load pix1
> +        ld1             {v1.8b}, [x2], x3       // Load pix2
> +        subs            w4, w4, #1
> +        uabdl           v30.8h, v0.8b, v1.8b
> +        umlal           v21.4s, v30.4h, v30.4h
> +        umlal2          v20.4s, v30.8h, v30.8h
> +
> +        b.ne            2b
> +
> +3:
> +        add             v21.4s, v21.4s, v20.4s  // Add accumulator vectors together
> +        uaddlv          d17, v21.4s             // Add up vector
> +        add             d18, d18, d17
> +

Unnecessary d18.

// Martin
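For context on what the reviewed function computes: sse8 is the sum of squared differences between two 8-pixel-wide blocks over h rows. Below is a minimal C sketch of those semantics, modeled on the generic sse8_c in libavcodec/me_cmp.c but simplified here; the name sse8_ref is illustrative, and the real function additionally takes an unused MpegEncContext pointer as its first argument.

```c
#include <stdint.h>
#include <stddef.h>

/* Illustrative sketch (not the verbatim FFmpeg source): sum of squared
 * differences between two 8-pixel-wide blocks of height h; pix1 and pix2
 * each advance by stride bytes per row. */
int sse8_ref(const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h)
{
    int sum = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++) {
            int d = pix1[j] - pix2[j]; // res = pix1[j] - pix2[j]
            sum += d * d;              // accumulate res * res
        }
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}
```

Since each difference is squared, |res|^2 equals res^2, which is why the NEON code can use uabdl: the widened absolute differences are multiplied by themselves and accumulated by umlal/umlal2 into the two 32-bit accumulators v21 and v20, which are combined at the end.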
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index 1c36d3d7cb..2f51f0497e 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -34,9 +34,12 @@ int ff_pix_abs16_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *
 
 int sse16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
                ptrdiff_t stride, int h);
+int sse8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
+              ptrdiff_t stride, int h);
 int sse4_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
               ptrdiff_t stride, int h);
+
 
 av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -49,6 +52,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
 
         c->sad[0] = ff_pix_abs16_neon;
         c->sse[0] = sse16_neon;
+        c->sse[1] = sse8_neon;
         c->sse[2] = sse4_neon;
     }
 }
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 0ec9c0465b..3f4266d4d5 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -347,6 +347,72 @@ function sse16_neon, export=1
         ret
 endfunc
 
+function sse8_neon, export=1
+        // x0 - unused
+        // x1 - pix1
+        // x2 - pix2
+        // x3 - stride
+        // w4 - h
+
+        movi            d18, #0
+        movi            v21.4s, #0
+        movi            v20.4s, #0
+        cmp             w4, #4
+        b.le            2f
+
+// make 4 iterations at once
+1:
+
+        // res = abs(pix1[0] - pix2[0])
+        // res * res
+
+        ld1             {v0.8b}, [x1], x3       // Load pix1 for first iteration
+        ld1             {v1.8b}, [x2], x3       // Load pix2 for second iteration
+        ld1             {v2.8b}, [x1], x3       // Load pix1 for second iteration
+        ld1             {v3.8b}, [x2], x3       // Load pix2 for second iteration
+        uabdl           v30.8h, v0.8b, v1.8b    // Absolute difference, first iteration
+        ld1             {v4.8b}, [x1], x3       // Load pix1 for third iteration
+        ld1             {v5.8b}, [x2], x3       // Load pix2 for third iteration
+        uabdl           v29.8h, v2.8b, v3.8b    // Absolute difference, second iteration
+        umlal           v21.4s, v30.4h, v30.4h  // Multiply lower half, first iteration
+        ld1             {v6.8b}, [x1], x3       // Load pix1 for fourth iteration
+        ld1             {v7.8b}, [x2], x3       // Load pix2 for fourth iteration
+        uabdl           v28.8h, v4.8b, v5.8b    // Absolute difference, third iteration
+        umlal           v21.4s, v29.4h, v29.4h  // Multiply lower half, second iteration
+        umlal2          v20.4s, v30.8h, v30.8h  // Multiply upper half, second iteration
+        uabdl           v27.8h, v6.8b, v7.8b    // Absolute difference, fourth iteration
+        umlal           v21.4s, v28.4h, v28.4h  // Multiply lower half, third iteration
+        umlal2          v20.4s, v29.8h, v29.8h  // Multiply upper half, second iteration
+        sub             w4, w4, #4              // h -= 4
+        umlal2          v20.4s, v28.8h, v28.8h  // Multiply upper half, third iteration
+        umlal           v21.4s, v27.4h, v27.4h  // Multiply lower half, fourth iteration
+        cmp             w4, #4
+        umlal2          v20.4s, v27.8h, v27.8h  // Multiply upper half, fourth iteration
+        b.ge            1b
+
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3       // Load pix1
+        ld1             {v1.8b}, [x2], x3       // Load pix2
+        subs            w4, w4, #1
+        uabdl           v30.8h, v0.8b, v1.8b
+        umlal           v21.4s, v30.4h, v30.4h
+        umlal2          v20.4s, v30.8h, v30.8h
+
+        b.ne            2b
+
+3:
+        add             v21.4s, v21.4s, v20.4s  // Add accumulator vectors together
+        uaddlv          d17, v21.4s             // Add up vector
+        add             d18, d18, d17
+
+        fmov            w0, s18
+        ret
+
+endfunc
+
 function sse4_neon, export=1
         // x0 - unused
         // x1 - pix1
Provide optimized implementation of sse8 function for arm64.

Performance comparison tests are shown below.
- sse_1_c: 130.7
- sse_1_neon: 29.7

Benchmarks and tests run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
---
 libavcodec/aarch64/me_cmp_init_aarch64.c |  4 ++
 libavcodec/aarch64/me_cmp_neon.S         | 66 ++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
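The figures above are checkasm benchmark timings (lower is better): sse_1 is the sse[1] slot this patch fills, i.e. the 8-pixel-wide variant, with the _c and _neon suffixes distinguishing the scalar reference from the NEON implementation. Assuming the me_cmp checkasm test used by this series is named motion, a run along the lines of `make checkasm && ./tests/checkasm/checkasm --test=motion --bench` should reproduce numbers of this shape; the exact test name and flags depend on the checked-out tree.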