@@ -20,12 +20,42 @@
#include "libavutil/aarch64/asm.S"
-.macro lumConvertRange fromto
-function ff_lumRange\fromto\()Jpeg_neon, export=1
+.macro lumConvertRange fromto, bit_depth
+function ff_lumRange\fromto\()Jpeg\bit_depth\()_neon, export=1
// x0 int16_t *dst
// w1 int width
// w2 int coeff
// x3 int64_t offset
+.if \bit_depth == 16
+.ifc \fromto, To
+ movi v25.4s, #1 // temporary 1 per lane; v25 is reloaded with coeff below
+ movi v24.4s, #1<<3, lsl #16 // 8 << 16 = 1 << 19 in every 32-bit lane
+ sub v24.4s, v24.4s, v25.4s // v24 = (1 << 19) - 1, the upper clamp applied by smin below
+.endif
+ dup v25.4s, w2 // v25 = coeff in every 32-bit lane
+ dup v26.2d, x3 // v26 = offset in both 64-bit lanes
+1:
+ ld1 {v0.4s, v1.4s}, [x0] // load 8 samples as 32-bit lanes; x0 only advances at the store
+ mov v16.16b, v26.16b // seed the four 64-bit accumulator pairs with offset
+ mov v17.16b, v26.16b
+ mov v18.16b, v26.16b
+ mov v19.16b, v26.16b
+ smlal v16.2d, v0.2s, v25.2s // acc += sample * coeff (signed 32x32 -> 64), low half of v0
+ smlal2 v17.2d, v0.4s, v25.4s // same for the high half of v0
+ smlal v18.2d, v1.2s, v25.2s
+ smlal2 v19.2d, v1.4s, v25.4s
+ shrn v0.2s, v16.2d, 18 // narrow (acc >> 18) back into 32-bit lanes
+ shrn2 v0.4s, v17.2d, 18
+ shrn v1.2s, v18.2d, 18
+ shrn2 v1.4s, v19.2d, 18
+ subs w1, w1, #8 // width -= 8; flags are consumed by b.gt (smin/st1 do not touch them)
+.ifc \fromto, To
+ smin v0.4s, v0.4s, v24.4s // To-JPEG only: clamp results to (1 << 19) - 1
+ smin v1.4s, v1.4s, v24.4s
+.endif
+ st1 {v0.4s, v1.4s}, [x0], #32 // write the 8 results in place, dst += 32 bytes
+ b.gt 1b // whole groups of 8 only, so width is effectively rounded up to a multiple of 8
+.else
dup v25.4s, w2
dup v26.4s, w3
1:
@@ -46,17 +76,64 @@ function ff_lumRange\fromto\()Jpeg_neon, export=1
subs w1, w1, #8
st1 {v0.8h}, [x0], #16
b.gt 1b
+.endif
ret
endfunc
.endm
-.macro chrConvertRange fromto
-function ff_chrRange\fromto\()Jpeg_neon, export=1
+.macro chrConvertRange fromto, bit_depth
+function ff_chrRange\fromto\()Jpeg\bit_depth\()_neon, export=1
// x0 int16_t *dstU
// x1 int16_t *dstV
// w2 int width
// w3 int coeff
// x4 int64_t offset
+.if \bit_depth == 16
+.ifc \fromto, To
+ movi v25.4s, #1 // temporary 1 per lane; v25 is reloaded with coeff below
+ movi v24.4s, #1<<3, lsl #16 // 8 << 16 = 1 << 19 in every 32-bit lane
+ sub v24.4s, v24.4s, v25.4s // v24 = (1 << 19) - 1, the upper clamp applied by smin below
+.endif
+ dup v25.4s, w3 // v25 = coeff in every 32-bit lane
+ dup v26.2d, x4 // v26 = offset in both 64-bit lanes
+1:
+ ld1 {v0.4s, v1.4s}, [x0] // 8 U samples as 32-bit lanes; pointers only advance at the stores
+ ld1 {v2.4s, v3.4s}, [x1] // 8 V samples as 32-bit lanes
+ mov v16.16b, v26.16b // seed the U accumulators (v16-v19) with offset
+ mov v17.16b, v26.16b
+ mov v18.16b, v26.16b
+ mov v19.16b, v26.16b
+ mov v20.16b, v26.16b // seed the V accumulators (v20-v23) with offset
+ mov v21.16b, v26.16b
+ mov v22.16b, v26.16b
+ mov v23.16b, v26.16b
+ smlal v16.2d, v0.2s, v25.2s // U: acc += sample * coeff (signed 32x32 -> 64)
+ smlal2 v17.2d, v0.4s, v25.4s
+ smlal v18.2d, v1.2s, v25.2s
+ smlal2 v19.2d, v1.4s, v25.4s
+ smlal v20.2d, v2.2s, v25.2s // V: acc += sample * coeff (signed 32x32 -> 64)
+ smlal2 v21.2d, v2.4s, v25.4s
+ smlal v22.2d, v3.2s, v25.2s
+ smlal2 v23.2d, v3.4s, v25.4s
+ shrn v0.2s, v16.2d, 18 // U: narrow (acc >> 18) back into 32-bit lanes
+ shrn2 v0.4s, v17.2d, 18
+ shrn v1.2s, v18.2d, 18
+ shrn2 v1.4s, v19.2d, 18
+ shrn v2.2s, v20.2d, 18 // V: narrow (acc >> 18) back into 32-bit lanes
+ shrn2 v2.4s, v21.2d, 18
+ shrn v3.2s, v22.2d, 18
+ shrn2 v3.4s, v23.2d, 18
+ subs w2, w2, #8 // width -= 8 (per plane); flags are consumed by b.gt below
+.ifc \fromto, To
+ smin v0.4s, v0.4s, v24.4s // To-JPEG only: clamp U and V results to (1 << 19) - 1
+ smin v1.4s, v1.4s, v24.4s
+ smin v2.4s, v2.4s, v24.4s
+ smin v3.4s, v3.4s, v24.4s
+.endif
+ st1 {v0.4s, v1.4s}, [x0], #32 // write 8 U results in place, dstU += 32 bytes
+ st1 {v2.4s, v3.4s}, [x1], #32 // write 8 V results in place, dstV += 32 bytes
+ b.gt 1b // whole groups of 8 only, so width is effectively rounded up to a multiple of 8
+.else
dup v25.4s, w3
dup v26.4s, w4
1:
@@ -89,11 +166,16 @@ function ff_chrRange\fromto\()Jpeg_neon, export=1
st1 {v0.8h}, [x0], #16
st1 {v1.8h}, [x1], #16
b.gt 1b
+.endif
ret
endfunc
.endm
-lumConvertRange To
-chrConvertRange To
-lumConvertRange From
-chrConvertRange From
+lumConvertRange To, 8 // emits ff_lumRangeToJpeg8_neon
+lumConvertRange To, 16 // emits ff_lumRangeToJpeg16_neon
+chrConvertRange To, 8 // emits ff_chrRangeToJpeg8_neon
+chrConvertRange To, 16 // emits ff_chrRangeToJpeg16_neon
+lumConvertRange From, 8 // emits ff_lumRangeFromJpeg8_neon
+lumConvertRange From, 16 // emits ff_lumRangeFromJpeg16_neon
+chrConvertRange From, 8 // emits ff_chrRangeFromJpeg8_neon
+chrConvertRange From, 16 // emits ff_chrRangeFromJpeg16_neon
@@ -218,14 +218,22 @@ NEON_INPUT(bgra32);
NEON_INPUT(rgb24);
NEON_INPUT(rgba32);
-void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
+void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width, /* ...Jpeg8 variants: installed below when c->dstBpc <= 14 */
+ int coeff, int64_t offset);
+void ff_chrRangeFromJpeg8_neon(int16_t *dstU, int16_t *dstV, int width,
+ int coeff, int64_t offset);
+void ff_lumRangeToJpeg8_neon(int16_t *dst, int width,
+ int coeff, int64_t offset);
+void ff_chrRangeToJpeg8_neon(int16_t *dstU, int16_t *dstV, int width,
+ int coeff, int64_t offset);
+void ff_lumRangeFromJpeg16_neon(int16_t *dst, int width, /* ...Jpeg16 variants: installed below when c->dstBpc > 14; the NEON code loads/stores dst as 32-bit samples */
+ int coeff, int64_t offset);
+void ff_chrRangeFromJpeg16_neon(int16_t *dstU, int16_t *dstV, int width,
+ int coeff, int64_t offset);
+void ff_lumRangeToJpeg16_neon(int16_t *dst, int width, /* To-JPEG 16-bit: the NEON code clamps results to (1 << 19) - 1 */
int coeff, int64_t offset);
-void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
+void ff_chrRangeToJpeg16_neon(int16_t *dstU, int16_t *dstV, int width, /* To-JPEG 16-bit: same clamp applied to both planes */
int coeff, int64_t offset);
-void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
- int coeff, int64_t offset);
-void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
- int coeff, int64_t offset);
av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
{
@@ -234,11 +242,19 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
if (have_neon(cpu_flags)) {
if (c->dstBpc <= 14) {
if (c->srcRange) {
- c->lumConvertRange = ff_lumRangeFromJpeg_neon;
- c->chrConvertRange = ff_chrRangeFromJpeg_neon;
+ c->lumConvertRange = ff_lumRangeFromJpeg8_neon;
+ c->chrConvertRange = ff_chrRangeFromJpeg8_neon;
} else {
- c->lumConvertRange = ff_lumRangeToJpeg_neon;
- c->chrConvertRange = ff_chrRangeToJpeg_neon;
+ c->lumConvertRange = ff_lumRangeToJpeg8_neon;
+ c->chrConvertRange = ff_chrRangeToJpeg8_neon;
+ }
+ } else {
+ if (c->srcRange) {
+ c->lumConvertRange = ff_lumRangeFromJpeg16_neon;
+ c->chrConvertRange = ff_chrRangeFromJpeg16_neon;
+ } else {
+ c->lumConvertRange = ff_lumRangeToJpeg16_neon;
+ c->chrConvertRange = ff_chrRangeToJpeg16_neon;
}
}
}