@@ -20,20 +20,21 @@
#include "libavutil/aarch64/asm.S"
-.macro lumConvertRange name, max, mult, offset, shift
-function ff_\name, export=1
-.if \max != 0
- mov w3, #\max
- dup v24.8h, w3
+.macro lumConvertRange fromto
+function ff_lumRange\fromto\()Jpeg_neon, export=1
+// x0 int16_t *dst      in/out: samples are loaded from and stored back to [x0] (multiply by coeff, add offset, >> 14)
+// w1 int width         remaining-sample counter: reduced by 8 per pass, loop repeats while > 0 (whole groups of 8 only)
+// w2 int amax          upper clamp, used by the To variant only (low 16 bits broadcast to v24.8h)
+// w3 int coeff         multiplier, broadcast to v25.4s
+// x4 int64_t offset    additive term; only the low 32 bits (w4) reach the 32-bit lanes of v26
+.ifc \fromto, To
+ dup v24.8h, w2 // v24 = amax in every 16-bit lane
.endif
- mov w3, #\mult
dup v25.4s, w3
- movz w3, #(\offset & 0xffff)
- movk w3, #((\offset >> 16) & 0xffff), lsl #16
- dup v26.4s, w3
+ dup v26.4s, w4 // v26 = offset (truncated to 32 bits) in every lane
1:
ld1 {v0.8h}, [x0]
-.if \max != 0
+.ifc \fromto, To
smin v0.8h, v0.8h, v24.8h
.endif
mov v16.16b, v26.16b
@@ -42,8 +43,8 @@ function ff_\name, export=1
sxtl2 v22.4s, v0.8h
mla v16.4s, v20.4s, v25.4s
mla v18.4s, v22.4s, v25.4s
- shrn v0.4h, v16.4s, #\shift
- shrn2 v0.8h, v18.4s, #\shift
+ shrn v0.4h, v16.4s, #14 // v0 low half = v16 >> 14, narrowed to 16 bits
+ shrn2 v0.8h, v18.4s, #14 // v0 high half = v18 >> 14, narrowed to 16 bits
subs w1, w1, #8
st1 {v0.8h}, [x0], #16
b.gt 1b
@@ -51,21 +52,23 @@ function ff_\name, export=1
endfunc
.endm
-.macro chrConvertRange name, max, mult, offset, shift
-function ff_\name, export=1
-.if \max != 0
- mov w3, #\max
+.macro chrConvertRange fromto
+function ff_chrRange\fromto\()Jpeg_neon, export=1
+// x0 int16_t *dstU     in/out: U samples, loaded from and stored back to [x0], 8 per loop pass
+// x1 int16_t *dstV     in/out: V samples, loaded from and stored back to [x1], 8 per loop pass
+// w2 int width         remaining-sample counter: reduced by 8 per pass, loop repeats while > 0 (whole groups of 8 only)
+// w3 int amax          upper clamp, used by the To variant only (low 16 bits broadcast to v24.8h)
+// w4 int coeff         multiplier, broadcast to v25.4s
+// x5 int64_t offset    additive term; only the low 32 bits (w5) reach the 32-bit lanes of v26
+.ifc \fromto, To
dup v24.8h, w3
.endif
- mov w3, #\mult
- dup v25.4s, w3
- movz w3, #(\offset & 0xffff)
- movk w3, #((\offset >> 16) & 0xffff), lsl #16
- dup v26.4s, w3
+ dup v25.4s, w4 // v25 = coeff in every 32-bit lane
+ dup v26.4s, w5 // v26 = offset (truncated to 32 bits) in every lane
1:
ld1 {v0.8h}, [x0]
ld1 {v1.8h}, [x1]
-.if \max != 0
+.ifc \fromto, To
smin v0.8h, v0.8h, v24.8h
smin v1.8h, v1.8h, v24.8h
.endif
@@ -81,10 +84,10 @@ function ff_\name, export=1
mla v17.4s, v21.4s, v25.4s
mla v18.4s, v22.4s, v25.4s
mla v19.4s, v23.4s, v25.4s
- shrn v0.4h, v16.4s, #\shift
- shrn v1.4h, v17.4s, #\shift
- shrn2 v0.8h, v18.4s, #\shift
- shrn2 v1.8h, v19.4s, #\shift
+ shrn v0.4h, v16.4s, #14 // v0 low half = v16 >> 14, narrowed to 16 bits (stored to dstU)
+ shrn v1.4h, v17.4s, #14 // v1 low half = v17 >> 14, narrowed to 16 bits (stored to dstV)
+ shrn2 v0.8h, v18.4s, #14 // v0 high half = v18 >> 14
+ shrn2 v1.8h, v19.4s, #14 // v1 high half = v19 >> 14
subs w2, w2, #8
st1 {v0.8h}, [x0], #16
st1 {v1.8h}, [x1], #16
@@ -93,7 +96,7 @@ function ff_\name, export=1
endfunc
.endm
-lumConvertRange lumRangeToJpeg_neon, 30189, 19077, -39057361, 14
-chrConvertRange chrRangeToJpeg_neon, 30775, 4663, -9289992, 12
-lumConvertRange lumRangeFromJpeg_neon, 0, 14071, 33561947, 14
-chrConvertRange chrRangeFromJpeg_neon, 0, 1799, 4081085, 11
+lumConvertRange To // emits ff_lumRangeToJpeg_neon (clamps input to amax)
+chrConvertRange To // emits ff_chrRangeToJpeg_neon (clamps input to amax)
+lumConvertRange From // emits ff_lumRangeFromJpeg_neon (no clamp assembled)
+chrConvertRange From // emits ff_chrRangeFromJpeg_neon (no clamp assembled)
@@ -218,14 +218,17 @@ NEON_INPUT(bgra32);
NEON_INPUT(rgb24);
NEON_INPUT(rgba32);
-void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
-void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
-void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
-void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
+ int amax, int coeff, int64_t offset); /* asm: "lumConvertRange From" */
+void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
+ int amax, int coeff, int64_t offset); /* asm: "chrConvertRange From" */
+void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
+ int amax, int coeff, int64_t offset); /* asm: "lumConvertRange To" */
+void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
+ int amax, int coeff, int64_t offset); /* asm: "chrConvertRange To" */
av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
{
-#if 0
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
@@ -239,7 +242,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c)
}
}
}
-#endif
}
av_cold void ff_sws_init_swscale_aarch64(SwsContext *c)