--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -52,6 +52,37 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *src0, const int16_t *src1, int width,
int height);
+void ff_vvc_w_avg_8_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
+ const int16_t *src0, const int16_t *src1,
+ const int width, const int height,
+ uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_10_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
+ const int16_t *src0, const int16_t *src1,
+ const int width, const int height,
+ uintptr_t w0_w1, uintptr_t offset_shift);
+void ff_vvc_w_avg_12_neon(uint8_t *_dst, const ptrdiff_t _dst_stride,
+ const int16_t *src0, const int16_t *src1,
+ const int width, const int height,
+ uintptr_t w0_w1, uintptr_t offset_shift);
+/* When passing arguments to functions, Apple platforms diverge from the
+ * standard ARM64 ABI: stack arguments are packed to their natural alignment
+ * instead of being widened to 8-byte slots, so we can't implement the
+ * function directly in asm. Instead, pack the trailing scalar arguments into
+ * two 64-bit register arguments and unpack them on the asm side.
+ */
+#define W_AVG_FUN(bit_depth) \
+static void vvc_w_avg_ ## bit_depth(uint8_t *dst, const ptrdiff_t dst_stride, \
+ const int16_t *src0, const int16_t *src1, const int width, const int height, \
+ const int denom, const int w0, const int w1, const int o0, const int o1) \
+{ \
+ const int shift = denom + FFMAX(3, 15 - bit_depth); \
+ const int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
+ uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
+ uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
+ ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
+}
+
+W_AVG_FUN(8)
+W_AVG_FUN(10)
+W_AVG_FUN(12)
+
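As an aside, a minimal stand-alone sketch of this packing scheme (the helper
names here are illustrative, not part of the patch):

    #include <stdint.h>

    /* Two 32-bit values share one 64-bit argument register, so the layout no
     * longer depends on how a platform packs narrow stack arguments. */
    static inline uintptr_t pack_u32_pair(uint32_t hi, uint32_t lo)
    {
        return ((uintptr_t)hi << 32) | lo;
    }

    /* The asm recovers the halves with "lsr x11, x6, #32" / "mov w12, w6",
     * which corresponds to: */
    static inline uint32_t unpack_hi(uintptr_t v) { return (uint32_t)(v >> 32); }
    static inline uint32_t unpack_lo(uintptr_t v) { return (uint32_t)v; }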
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
{
int cpu_flags = av_get_cpu_flags();
@@ -123,6 +154,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
c->inter.avg = ff_vvc_avg_8_neon;
+ c->inter.w_avg = vvc_w_avg_8;
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
@@ -163,11 +195,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
}
} else if (bd == 10) {
c->inter.avg = ff_vvc_avg_10_neon;
+ c->inter.w_avg = vvc_w_avg_10;
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
} else if (bd == 12) {
c->inter.avg = ff_vvc_avg_12_neon;
+ c->inter.w_avg = vvc_w_avg_12;
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
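For reference while reading the asm below, a hedged scalar model of the
per-pixel operation these kernels implement, using the shift/offset derived
in W_AVG_FUN above (an illustrative sketch, not FFmpeg's actual C fallback):

    #include <stdint.h>

    static inline int clip_pixel(int v, int bit_depth)
    {
        const int max = (1 << bit_depth) - 1;
        return v < 0 ? 0 : (v > max ? max : v);
    }

    /* s0/s1 are 16-bit intermediate predictions from the two reference
     * lists; w0/w1, offset and shift are exactly the values W_AVG_FUN
     * derives from denom/o0/o1 before calling the NEON kernel. */
    static int w_avg_pixel(int16_t s0, int16_t s1, int w0, int w1,
                           int offset, int shift, int bit_depth)
    {
        const int64_t acc = (int64_t)s0 * w0 + (int64_t)s1 * w1 + offset;
        return clip_pixel((int)(acc >> shift), bit_depth);
    }

The NEON code vectorizes this: smlal/smlal2 accumulate the two products onto
a pre-loaded offset, sqshl applies the (negated) shift with saturation, and
sqxtn/sqxtun together with the v17/v18 bounds perform the final clip.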
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -22,9 +22,9 @@
#define VVC_MAX_PB_SIZE 128
-.macro vvc_avg, bit_depth
+.macro vvc_avg type, bit_depth
-.macro vvc_avg_\bit_depth\()_2_4, tap
+.macro vvc_\type\()_\bit_depth\()_2_4 tap
.if \tap == 2
ldr s0, [src0]
ldr s2, [src1]
@@ -32,9 +32,19 @@
ldr d0, [src0]
ldr d2, [src1]
.endif
+
+.ifc \type, avg
saddl v4.4s, v0.4h, v2.4h
add v4.4s, v4.4s, v16.4s
sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
+.else
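+ // w_avg: v16 = offset, v19 = w0, v20 = w1, v22 = -shift (set in the prologue)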
+ mov v4.16b, v16.16b
+ smlal v4.4s, v0.4h, v19.4h
+ smlal v4.4s, v2.4h, v20.4h
+ sqshl v4.4s, v4.4s, v22.4s
+ sqxtn v4.4h, v4.4s
+.endif
+
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
.if \tap == 2
@@ -57,7 +67,7 @@
add dst, dst, dst_stride
.endm
-function ff_vvc_avg_\bit_depth\()_neon, export=1
+function ff_vvc_\type\()_\bit_depth\()_neon, export=1
dst .req x0
dst_stride .req x1
src0 .req x2
@@ -67,42 +77,64 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
mov x10, #(VVC_MAX_PB_SIZE * 2)
cmp width, #8
-.if \bit_depth == 8
- movi v16.4s, #64
-.else
-.if \bit_depth == 10
- mov w6, #1023
- movi v16.4s, #16
+.ifc \type, avg
+ movi v16.4s, #(1 << (14 - \bit_depth))
.else
- mov w6, #4095
- movi v16.4s, #4
-.endif
+ lsr x11, x6, #32 // weight0
+ mov w12, w6 // weight1
+ lsr x13, x7, #32 // offset
+ mov w14, w7 // shift
+
+ dup v19.8h, w11
+ neg w14, w14 // negate: sqshl by a negative count shifts right
+ dup v20.8h, w12
+ dup v16.4s, w13
+ dup v22.4s, w14
+.endif // avg
+
+.if \bit_depth >= 10
+ // clip pixel
+ mov w6, #((1 << \bit_depth) - 1)
movi v18.8h, #0
dup v17.8h, w6
.endif
+
b.eq 8f
b.hi 16f
cmp width, #4
b.eq 4f
2: // width == 2
subs height, height, #1
- vvc_avg_\bit_depth\()_2_4 2
+ vvc_\type\()_\bit_depth\()_2_4 2
b.ne 2b
b 32f
4: // width == 4
subs height, height, #1
- vvc_avg_\bit_depth\()_2_4 4
+ vvc_\type\()_\bit_depth\()_2_4 4
b.ne 4b
b 32f
8: // width == 8
ld1 {v0.8h}, [src0], x10
ld1 {v2.8h}, [src1], x10
+.ifc \type, avg
saddl v4.4s, v0.4h, v2.4h
saddl2 v5.4s, v0.8h, v2.8h
add v4.4s, v4.4s, v16.4s
add v5.4s, v5.4s, v16.4s
sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
sqshrn2 v4.8h, v5.4s, #(15 - \bit_depth)
+.else
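+ // w_avg: start from the offset so smlal accumulates offset + s0*w0 + s1*w1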
+ mov v4.16b, v16.16b
+ mov v5.16b, v16.16b
+ smlal v4.4s, v0.4h, v19.4h
+ smlal v4.4s, v2.4h, v20.4h
+ smlal2 v5.4s, v0.8h, v19.8h
+ smlal2 v5.4s, v2.8h, v20.8h
+ sqshl v4.4s, v4.4s, v22.4s
+ sqshl v5.4s, v5.4s, v22.4s
+ sqxtn v4.4h, v4.4s
+ sqxtn2 v4.8h, v5.4s
+.endif
subs height, height, #1
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
@@ -122,6 +154,7 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
17:
ldp q0, q1, [x7], #32
ldp q2, q3, [x8], #32
+.ifc \type, avg
saddl v4.4s, v0.4h, v2.4h
saddl2 v5.4s, v0.8h, v2.8h
saddl v6.4s, v1.4h, v3.4h
@@ -134,6 +167,28 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
sqshrn2 v4.8h, v5.4s, #(15 - \bit_depth)
sqshrn v6.4h, v6.4s, #(15 - \bit_depth)
sqshrn2 v6.8h, v7.4s, #(15 - \bit_depth)
+.else // w_avg
+ mov v4.16b, v16.16b
+ mov v5.16b, v16.16b
+ mov v6.16b, v16.16b
+ mov v7.16b, v16.16b
+ smlal v4.4s, v0.4h, v19.4h
+ smlal v4.4s, v2.4h, v20.4h
+ smlal2 v5.4s, v0.8h, v19.8h
+ smlal2 v5.4s, v2.8h, v20.8h
+ smlal v6.4s, v1.4h, v19.4h
+ smlal v6.4s, v3.4h, v20.4h
+ smlal2 v7.4s, v1.8h, v19.8h
+ smlal2 v7.4s, v3.8h, v20.8h
+ sqshl v4.4s, v4.4s, v22.4s
+ sqshl v5.4s, v5.4s, v22.4s
+ sqshl v6.4s, v6.4s, v22.4s
+ sqshl v7.4s, v7.4s, v22.4s
+ sqxtn v4.4h, v4.4s
+ sqxtn v6.4h, v6.4s
+ sqxtn2 v4.8h, v5.4s
+ sqxtn2 v6.8h, v7.4s
+.endif // w_avg
subs w6, w6, #16
.if \bit_depth == 8
sqxtun v4.8b, v4.8h
@@ -158,6 +213,9 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
endfunc
.endm
-vvc_avg 8
-vvc_avg 10
-vvc_avg 12
+vvc_avg avg, 8
+vvc_avg avg, 10
+vvc_avg avg, 12
+vvc_avg w_avg, 8
+vvc_avg w_avg, 10
+vvc_avg w_avg, 12
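Attribution and checkasm timings follow; times are per call, with the speedup
over the C reference in parentheses. The numbers were presumably gathered
with FFmpeg's checkasm harness, along these lines (the vvc_mc test name is
inferred from the reported function names):

    make checkasm
    ./tests/checkasm/checkasm --test=vvc_mc --bench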
From: Zhao Zhili <zhilizhao@tencent.com>

w_avg_8_2x2_c:            0.0 ( 0.00x)
w_avg_8_2x2_neon:         0.0 ( 0.00x)
w_avg_8_4x4_c:            0.2 ( 1.00x)
w_avg_8_4x4_neon:         0.0 ( 0.00x)
w_avg_8_8x8_c:            1.2 ( 1.00x)
w_avg_8_8x8_neon:         0.2 ( 5.00x)
w_avg_8_16x16_c:          4.2 ( 1.00x)
w_avg_8_16x16_neon:       0.8 ( 5.67x)
w_avg_8_32x32_c:         16.2 ( 1.00x)
w_avg_8_32x32_neon:       2.5 ( 6.50x)
w_avg_8_64x64_c:         64.5 ( 1.00x)
w_avg_8_64x64_neon:       9.0 ( 7.17x)
w_avg_8_128x128_c:      269.5 ( 1.00x)
w_avg_8_128x128_neon:    35.5 ( 7.59x)
w_avg_10_2x2_c:           0.2 ( 1.00x)
w_avg_10_2x2_neon:        0.2 ( 1.00x)
w_avg_10_4x4_c:           0.2 ( 1.00x)
w_avg_10_4x4_neon:        0.2 ( 1.00x)
w_avg_10_8x8_c:           1.0 ( 1.00x)
w_avg_10_8x8_neon:        0.2 ( 4.00x)
w_avg_10_16x16_c:         4.2 ( 1.00x)
w_avg_10_16x16_neon:      0.8 ( 5.67x)
w_avg_10_32x32_c:        16.2 ( 1.00x)
w_avg_10_32x32_neon:      2.5 ( 6.50x)
w_avg_10_64x64_c:        66.2 ( 1.00x)
w_avg_10_64x64_neon:     10.0 ( 6.62x)
w_avg_10_128x128_c:     277.8 ( 1.00x)
w_avg_10_128x128_neon:   39.8 ( 6.99x)
w_avg_12_2x2_c:           0.0 ( 0.00x)
w_avg_12_2x2_neon:        0.2 ( 0.00x)
w_avg_12_4x4_c:           0.2 ( 1.00x)
w_avg_12_4x4_neon:        0.0 ( 0.00x)
w_avg_12_8x8_c:           1.2 ( 1.00x)
w_avg_12_8x8_neon:        0.5 ( 2.50x)
w_avg_12_16x16_c:         4.8 ( 1.00x)
w_avg_12_16x16_neon:      0.8 ( 6.33x)
w_avg_12_32x32_c:        17.0 ( 1.00x)
w_avg_12_32x32_neon:      2.8 ( 6.18x)
w_avg_12_64x64_c:        64.0 ( 1.00x)
w_avg_12_64x64_neon:     10.0 ( 6.40x)
w_avg_12_128x128_c:     269.2 ( 1.00x)
w_avg_12_128x128_neon:   42.0 ( 6.41x)
---
 libavcodec/aarch64/vvc/dsp_init.c | 34 ++++++++++++
 libavcodec/aarch64/vvc/inter.S    | 92 +++++++++++++++++++++++++------
 2 files changed, 109 insertions(+), 17 deletions(-)