@@ -297,4 +297,12 @@ NEON8_FNPROTO_PARTIAL_6(qpel_hv, (int16_t *dst,
const uint8_t *src, ptrdiff_t srcstride, int height,
const int8_t *hf, const int8_t *vf, int width), _i8mm);
+NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride, int height,
+ const int8_t *hf, const int8_t *vf, int width),);
+
+NEON8_FNPROTO_PARTIAL_6(epel_hv, (int16_t *dst,
+ const uint8_t *src, ptrdiff_t srcstride, int height,
+ const int8_t *hf, const int8_t *vf, int width), _i8mm);
+
#endif
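
For context, NEON8_FNPROTO_PARTIAL_6 presumably emits one prototype per block width, matching the six hv kernels (4 through 128) registered in dsp_init.c further down; the empty last argument in the first invocation selects the plain NEON name, _i8mm the I8MM one. A sketch of the assumed expansion (the real macro body lives in dsp.h and may differ in detail):

    /* Assumed expansion of NEON8_FNPROTO_PARTIAL_6 -- illustrative only:
     * one prototype per block width, with an optional ISA suffix. */
    #define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \
        void ff_vvc_put_##fn##4_8_neon##ext   args; \
        void ff_vvc_put_##fn##8_8_neon##ext   args; \
        void ff_vvc_put_##fn##16_8_neon##ext  args; \
        void ff_vvc_put_##fn##32_8_neon##ext  args; \
        void ff_vvc_put_##fn##64_8_neon##ext  args; \
        void ff_vvc_put_##fn##128_8_neon##ext args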
@@ -72,6 +72,11 @@ endconst
sxtl v0.8h, v0.8b
.endm
+.macro vvc_load_epel_filterh freg
+ ld1 {v0.8b}, [\freg]
+ sxtl v0.8h, v0.8b
+.endm
+
.macro calc_epelh dst, src0, src1, src2, src3
smull \dst\().4s, \src0\().4h, v0.h[0]
smlal \dst\().4s, \src1\().4h, v0.h[1]
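
For orientation: calc_epelh accumulates the 4-tap chroma (epel) filter as widening multiply-accumulates against the coefficients in v0.h[0..3] loaded by the filter macros above; the hunk context shows only the first two of its four terms. A rough scalar equivalent (names illustrative):

    #include <stdint.h>

    /* One calc_epelh-style output lane: a widening 4-tap dot product.
     * filt[0..3] corresponds to v0.h[0..3]. */
    static inline int32_t epel_mac4(const int16_t *r0, const int16_t *r1,
                                    const int16_t *r2, const int16_t *r3,
                                    const int16_t filt[4], int x)
    {
        return (int32_t)r0[x] * filt[0] + (int32_t)r1[x] * filt[1] +
               (int32_t)r2[x] * filt[2] + (int32_t)r3[x] * filt[3];
    }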
@@ -2299,10 +2304,16 @@ endfunc
DISABLE_I8MM
#endif
+function vvc_put_epel_hv4_8_end_neon
+ vvc_load_epel_filterh x5
+ mov x10, #(VVC_MAX_PB_SIZE * 2)
+ b 0f
+endfunc
function hevc_put_hevc_epel_hv4_8_end_neon
load_epel_filterh x5, x4
mov x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
ldr d16, [sp]
ldr d17, [sp, x10]
add sp, sp, x10, lsl #1
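
The new VVC end functions avoid duplicating the HEVC loop bodies: each loads the caller-supplied vertical filter, sets x10 to the wider VVC temporary-row pitch, and branches to the local label 0 inside the matching HEVC function, so only the prologues differ. Roughly, in C (hypothetical names, shared loop elided):

    #include <stddef.h>
    #include <stdint.h>

    #define HEVC_MAX_PB_SIZE 64
    #define VVC_MAX_PB_SIZE  128

    /* Stand-in for the code after "0:"; the real loop filters vertically. */
    static void epel_hv_tail(int16_t *dst, const int16_t *tmp,
                             ptrdiff_t tmp_pitch_bytes, int height)
    {
        (void)dst; (void)tmp; (void)tmp_pitch_bytes; (void)height;
    }

    static void vvc_hv4_end(int16_t *dst, const int16_t *tmp, int height)
    {
        epel_hv_tail(dst, tmp, VVC_MAX_PB_SIZE * 2, height);  /* x10 = 256 */
    }

    static void hevc_hv4_end(int16_t *dst, const int16_t *tmp, int height)
    {
        epel_hv_tail(dst, tmp, HEVC_MAX_PB_SIZE * 2, height); /* x10 = 128 */
    }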
@@ -2339,9 +2350,16 @@ function hevc_put_hevc_epel_hv6_8_end_neon
2: ret
endfunc
+function vvc_put_epel_hv8_8_end_neon
+ vvc_load_epel_filterh x5
+ mov x10, #(VVC_MAX_PB_SIZE * 2)
+ b 0f
+endfunc
+
function hevc_put_hevc_epel_hv8_8_end_neon
load_epel_filterh x5, x4
mov x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
ldr q16, [sp]
ldr q17, [sp, x10]
add sp, sp, x10, lsl #1
@@ -2379,9 +2397,16 @@ function hevc_put_hevc_epel_hv12_8_end_neon
2: ret
endfunc
+function vvc_put_epel_hv16_8_end_neon
+ vvc_load_epel_filterh x5
+ mov x10, #(VVC_MAX_PB_SIZE * 2)
+ b 0f
+endfunc
+
function hevc_put_hevc_epel_hv16_8_end_neon
load_epel_filterh x5, x4
mov x10, #(HEVC_MAX_PB_SIZE * 2)
+0:
ld1 {v16.8h, v17.8h}, [sp], x10
ld1 {v18.8h, v19.8h}, [sp], x10
ld1 {v20.8h, v21.8h}, [sp], x10
@@ -2437,6 +2462,21 @@ function ff_hevc_put_hevc_epel_hv4_8_\suffix, export=1
b hevc_put_hevc_epel_hv4_8_end_neon
endfunc
+function ff_vvc_put_epel_hv4_8_\suffix, export=1
+ add w10, w3, #3
+ lsl x10, x10, #8
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_vvc_put_epel_h4_8_\suffix)
+ ldp x0, x3, [sp, #16]
+ ldp x5, x30, [sp], #32
+ b vvc_put_epel_hv4_8_end_neon
+endfunc
+
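
The hv wrappers follow the standard two-pass scheme: reserve (height + 3) rows of VVC_MAX_PB_SIZE int16_t on the stack (lsl #8 is the 256-byte row pitch), rewind src by one row, run the horizontal pass over height + 3 rows, and let the end function filter the temporary buffer vertically. A hedged scalar sketch of that flow for the 8-bit path (names illustrative, not FFmpeg API; the horizontal pass needs no shift at 8 bit, the vertical pass shifts by 6 as in the HEVC reference code):

    #include <stddef.h>
    #include <stdint.h>

    #define VVC_MAX_PB_SIZE 128

    static void epel_hv_sketch(int16_t *dst, const uint8_t *src,
                               ptrdiff_t srcstride, int height,
                               const int8_t *hf, const int8_t *vf, int width)
    {
        int16_t tmp[(VVC_MAX_PB_SIZE + 3) * VVC_MAX_PB_SIZE];
        int16_t *t = tmp;

        src -= srcstride;                       /* sub x1, x1, x2 */
        for (int y = 0; y < height + 3; y++) {  /* add w3, w3, #3 */
            for (int x = 0; x < width; x++)     /* 4-tap h pass, taps -1..+2 */
                t[x] = src[x - 1] * hf[0] + src[x] * hf[1] +
                       src[x + 1] * hf[2] + src[x + 2] * hf[3];
            src += srcstride;
            t   += VVC_MAX_PB_SIZE;
        }

        t = tmp + VVC_MAX_PB_SIZE;              /* first row of the block */
        for (int y = 0; y < height; y++) {      /* 4-tap v pass out of tmp */
            for (int x = 0; x < width; x++)
                dst[x] = (t[x - VVC_MAX_PB_SIZE] * vf[0] + t[x] * vf[1] +
                          t[x + VVC_MAX_PB_SIZE] * vf[2] +
                          t[x + 2 * VVC_MAX_PB_SIZE] * vf[3]) >> 6;
            dst += VVC_MAX_PB_SIZE;
            t   += VVC_MAX_PB_SIZE;
        }
    }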
function ff_hevc_put_hevc_epel_hv6_8_\suffix, export=1
add w10, w3, #3
lsl x10, x10, #7
@@ -2467,6 +2507,21 @@ function ff_hevc_put_hevc_epel_hv8_8_\suffix, export=1
b hevc_put_hevc_epel_hv8_8_end_neon
endfunc
+function ff_vvc_put_epel_hv8_8_\suffix, export=1
+ add w10, w3, #3
+ lsl x10, x10, #8
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_vvc_put_epel_h8_8_\suffix)
+ ldp x0, x3, [sp, #16]
+ ldp x5, x30, [sp], #32
+ b vvc_put_epel_hv8_8_end_neon
+endfunc
+
function ff_hevc_put_hevc_epel_hv12_8_\suffix, export=1
add w10, w3, #3
lsl x10, x10, #7
@@ -2497,6 +2552,21 @@ function ff_hevc_put_hevc_epel_hv16_8_\suffix, export=1
b hevc_put_hevc_epel_hv16_8_end_neon
endfunc
+function ff_vvc_put_epel_hv16_8_\suffix, export=1
+ add w10, w3, #3
+ lsl x10, x10, #8
+ sub sp, sp, x10 // tmp_array
+ stp x5, x30, [sp, #-32]!
+ stp x0, x3, [sp, #16]
+ add x0, sp, #32
+ sub x1, x1, x2
+ add w3, w3, #3
+ bl X(ff_vvc_put_epel_h16_8_\suffix)
+ ldp x0, x3, [sp, #16]
+ ldp x5, x30, [sp], #32
+ b vvc_put_epel_hv16_8_end_neon
+endfunc
+
function ff_hevc_put_hevc_epel_hv24_8_\suffix, export=1
add w10, w3, #3
lsl x10, x10, #7
@@ -2530,6 +2600,24 @@ function ff_hevc_put_hevc_epel_hv32_8_\suffix, export=1
ret
endfunc
+function ff_vvc_put_epel_hv32_8_\suffix, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #16
+ bl X(ff_vvc_put_epel_hv16_8_\suffix)
+ ldp x0, x1, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x4, x5, [sp], #48
+ add x0, x0, #32
+ add x1, x1, #16
+ mov x6, #16
+ bl X(ff_vvc_put_epel_hv16_8_\suffix)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
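
The 32-wide (and below, the 64- and 128-wide) versions simply run the half-width kernel on each half: save the arguments, do the left half, restore, advance dst by half-width int16_t samples into the fixed-pitch buffer (add x0, x0, #32 is 16 samples) and src by half-width bytes, then do the right half with x6 carrying the new width. As a sketch (hypothetical names):

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*epel_hv_fn)(int16_t *dst, const uint8_t *src,
                               ptrdiff_t srcstride, int height,
                               const int8_t *hf, const int8_t *vf, int width);

    /* Width doubling: the dst buffer has a fixed VVC_MAX_PB_SIZE pitch, so
     * the right half starts half_w samples (2 * half_w bytes) in. */
    static void epel_hv_double(epel_hv_fn half, int16_t *dst,
                               const uint8_t *src, ptrdiff_t srcstride,
                               int height, const int8_t *hf,
                               const int8_t *vf, int half_w)
    {
        half(dst, src, srcstride, height, hf, vf, half_w);
        half(dst + half_w, src + half_w, srcstride, height, hf, vf, half_w);
    }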
function ff_hevc_put_hevc_epel_hv48_8_\suffix, export=1
stp x4, x5, [sp, #-64]!
stp x2, x3, [sp, #16]
@@ -2579,6 +2667,43 @@ function ff_hevc_put_hevc_epel_hv64_8_\suffix, export=1
ldr x30, [sp], #16
ret
endfunc
+
+function ff_vvc_put_epel_hv64_8_\suffix, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #32
+ bl X(ff_vvc_put_epel_hv32_8_\suffix)
+ ldp x0, x1, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x4, x5, [sp], #48
+ add x0, x0, #64
+ add x1, x1, #32
+ mov x6, #32
+ bl X(ff_vvc_put_epel_hv32_8_\suffix)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
+function ff_vvc_put_epel_hv128_8_\suffix, export=1
+ stp x4, x5, [sp, #-64]!
+ stp x2, x3, [sp, #16]
+ stp x0, x1, [sp, #32]
+ str x30, [sp, #48]
+ mov x6, #64
+ bl X(ff_vvc_put_epel_hv64_8_\suffix)
+ ldp x0, x1, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x4, x5, [sp], #48
+ add x0, x0, #128
+ add x1, x1, #64
+ mov x6, #64
+ bl X(ff_vvc_put_epel_hv64_8_\suffix)
+ ldr x30, [sp], #16
+ ret
+endfunc
+
.endm
epel_hv neon
@@ -84,6 +84,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[1][5][0][1] =
c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
+ c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
+ c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
+ c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
+ c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
+ c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
+ c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
+
c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
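
Read from these assignments, the dispatch table appears to be laid out as put[is_chroma][log2(width) - 1][has_vertical_filter][has_horizontal_filter], so the [1][i][1][1] slots filled above (and overridden with _i8mm variants below when the CPU supports I8MM) are the chroma hv kernels for widths 4 through 128. An illustration of the inferred indexing (the helper is not FFmpeg API):

    /* Inferred layout -- not confirmed beyond these assignments. */
    static inline int put_width_idx(int width) /* 4 -> 1, 8 -> 2, ... 128 -> 6 */
    {
        int log2w = 0;
        while (width > 1) { width >>= 1; log2w++; }
        return log2w - 1;
    }
    /* e.g. a 16-wide chroma block with both filters active resolves to
     * c->inter.put[1][put_width_idx(16)][1][1], i.e.
     * ff_vvc_put_epel_hv16_8_neon (or its _i8mm override). */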
@@ -134,6 +141,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
+
+ c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
+ c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
+ c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
+ c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
+ c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
+ c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
}
} else if (bd == 10) {
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
From: Zhao Zhili <zhilizhao@tencent.com>

On Apple M1:

put_chroma_hv_8_4x4_c:          1.7 ( 1.00x)
put_chroma_hv_8_4x4_neon:       0.2 ( 7.67x)
put_chroma_hv_8_8x8_c:          5.5 ( 1.00x)
put_chroma_hv_8_8x8_neon:       0.5 (11.53x)
put_chroma_hv_8_16x16_c:       18.5 ( 1.00x)
put_chroma_hv_8_16x16_neon:     1.5 (12.53x)
put_chroma_hv_8_32x32_c:       72.5 ( 1.00x)
put_chroma_hv_8_32x32_neon:     4.7 (15.34x)
put_chroma_hv_8_64x64_c:      274.0 ( 1.00x)
put_chroma_hv_8_64x64_neon:    18.5 (14.83x)
put_chroma_hv_8_128x128_c:   1058.7 ( 1.00x)
put_chroma_hv_8_128x128_neon:  75.2 (14.07x)

On Android Pixel 8 Pro:

put_chroma_hv_8_4x4_c:          1.2 ( 1.00x)
put_chroma_hv_8_4x4_neon:       0.0 ( 0.00x)
put_chroma_hv_8_4x4_i8mm:       0.2 ( 5.00x)
put_chroma_hv_8_8x8_c:          4.0 ( 1.00x)
put_chroma_hv_8_8x8_neon:       0.5 ( 8.00x)
put_chroma_hv_8_8x8_i8mm:       0.5 ( 8.00x)
put_chroma_hv_8_16x16_c:       15.2 ( 1.00x)
put_chroma_hv_8_16x16_neon:     2.5 ( 6.10x)
put_chroma_hv_8_16x16_i8mm:     2.2 ( 6.78x)
put_chroma_hv_8_32x32_c:       61.0 ( 1.00x)
put_chroma_hv_8_32x32_neon:     9.8 ( 6.26x)
put_chroma_hv_8_32x32_i8mm:     8.5 ( 7.18x)
put_chroma_hv_8_64x64_c:      229.5 ( 1.00x)
put_chroma_hv_8_64x64_neon:    38.5 ( 5.96x)
put_chroma_hv_8_64x64_i8mm:    34.0 ( 6.75x)
put_chroma_hv_8_128x128_c:    919.8 ( 1.00x)
put_chroma_hv_8_128x128_neon: 154.5 ( 5.95x)
put_chroma_hv_8_128x128_i8mm: 140.0 ( 6.57x)
---
 libavcodec/aarch64/h26x/dsp.h       |   8 ++
 libavcodec/aarch64/h26x/epel_neon.S | 125 ++++++++++++++++++++++++++++
 libavcodec/aarch64/vvc/dsp_init.c   |  14 ++++
 3 files changed, 147 insertions(+)