diff mbox series

[FFmpeg-devel,02/15] avfilter/vf_bwdif: Add common macros and consts for aarch64 neon

Message ID 20230629175729.224383-3-jc@kynesim.co.uk
State New
Headers show
Series avfilter/vf_bwdif: Add aarch64 neon functions | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

John Cox June 29, 2023, 5:57 p.m. UTC
Add macros for dual scalar half->single multiply and accumulate
Add macro for shift, saturate and shorten single to byte
Add filter constants

Signed-off-by: John Cox <jc@kynesim.co.uk>
---
 libavfilter/aarch64/vf_bwdif_neon.S | 46 +++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

Comments

Martin Storsjö July 1, 2023, 9:35 p.m. UTC | #1
On Thu, 29 Jun 2023, John Cox wrote:

> Add macros for dual scalar half->single multiply and accumulate
> Add macro for shift, saturate and shorten single to byte
> Add filter constants
>
> Signed-off-by: John Cox <jc@kynesim.co.uk>
> ---
> libavfilter/aarch64/vf_bwdif_neon.S | 46 +++++++++++++++++++++++++++++
> 1 file changed, 46 insertions(+)
>
> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
> index 639ab22998..a8f0ed525a 100644
> --- a/libavfilter/aarch64/vf_bwdif_neon.S
> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
> @@ -23,3 +23,49 @@
>
> #include "libavutil/aarch64/asm.S"
>
> +.macro SQSHRUNN b, s0, s1, s2, s3, n
> +        sqshrun         \s0\().4h, \s0\().4s, #\n - 8
> +        sqshrun2        \s0\().8h, \s1\().4s, #\n - 8
> +        sqshrun         \s1\().4h, \s2\().4s, #\n - 8
> +        sqshrun2        \s1\().8h, \s3\().4s, #\n - 8
> +        uzp2            \b\().16b, \s0\().16b, \s1\().16b
> +.endm
> +
> +.macro SMULL4K a0, a1, a2, a3, s0, s1, k
> +        smull           \a0\().4s, \s0\().4h, \k
> +        smull2          \a1\().4s, \s0\().8h, \k
> +        smull           \a2\().4s, \s1\().4h, \k
> +        smull2          \a3\().4s, \s1\().8h, \k
> +.endm
> +
> +.macro UMULL4K a0, a1, a2, a3, s0, s1, k
> +        umull           \a0\().4s, \s0\().4h, \k
> +        umull2          \a1\().4s, \s0\().8h, \k
> +        umull           \a2\().4s, \s1\().4h, \k
> +        umull2          \a3\().4s, \s1\().8h, \k
> +.endm
> +
> +.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
> +        umlal           \a0\().4s, \s0\().4h, \k
> +        umlal2          \a1\().4s, \s0\().8h, \k
> +        umlal           \a2\().4s, \s1\().4h, \k
> +        umlal2          \a3\().4s, \s1\().8h, \k
> +.endm
> +
> +.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
> +        umlsl           \a0\().4s, \s0\().4h, \k
> +        umlsl2          \a1\().4s, \s0\().8h, \k
> +        umlsl           \a2\().4s, \s1\().4h, \k
> +        umlsl2          \a3\().4s, \s1\().8h, \k
> +.endm
> +
> +// static const uint16_t coef_lf[2] = { 4309, 213 };
> +// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
> +// static const uint16_t coef_sp[2] = { 5077, 981 };
> +
> +        .align 16

Note that .align for arm is power of two; this triggers a 2^16 byte 
alignment here, which certainly isn't intended.

But in general, the usual way of defining constants is with a 
const/endconst block, which places them in the right rdata section instead 
of in the text section. But that then requires you to use a movrel macro 
for accessing the data, instead of a plain ldr instruction.

> +coeffs:
> +        .hword          4309 * 4, 213 * 4               // lf[0]*4 = v0.h[0]
> +        .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
> +        .hword          5077, 981                       // sp[0] = v0.h[6]
> +
> --


// Martin
John Cox July 2, 2023, 10:27 a.m. UTC | #2
On Sun, 2 Jul 2023 00:35:14 +0300 (EEST), you wrote:

>On Thu, 29 Jun 2023, John Cox wrote:
>
>> Add macros for dual scalar half->single multiply and accumulate
>> Add macro for shift, saturate and shorten single to byte
>> Add filter constants
>>
>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>> ---
>> libavfilter/aarch64/vf_bwdif_neon.S | 46 +++++++++++++++++++++++++++++
>> 1 file changed, 46 insertions(+)
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
>> index 639ab22998..a8f0ed525a 100644
>> --- a/libavfilter/aarch64/vf_bwdif_neon.S
>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
>> @@ -23,3 +23,49 @@
>>
>> #include "libavutil/aarch64/asm.S"
>>
>> +.macro SQSHRUNN b, s0, s1, s2, s3, n
>> +        sqshrun         \s0\().4h, \s0\().4s, #\n - 8
>> +        sqshrun2        \s0\().8h, \s1\().4s, #\n - 8
>> +        sqshrun         \s1\().4h, \s2\().4s, #\n - 8
>> +        sqshrun2        \s1\().8h, \s3\().4s, #\n - 8
>> +        uzp2            \b\().16b, \s0\().16b, \s1\().16b
>> +.endm
>> +
>> +.macro SMULL4K a0, a1, a2, a3, s0, s1, k
>> +        smull           \a0\().4s, \s0\().4h, \k
>> +        smull2          \a1\().4s, \s0\().8h, \k
>> +        smull           \a2\().4s, \s1\().4h, \k
>> +        smull2          \a3\().4s, \s1\().8h, \k
>> +.endm
>> +
>> +.macro UMULL4K a0, a1, a2, a3, s0, s1, k
>> +        umull           \a0\().4s, \s0\().4h, \k
>> +        umull2          \a1\().4s, \s0\().8h, \k
>> +        umull           \a2\().4s, \s1\().4h, \k
>> +        umull2          \a3\().4s, \s1\().8h, \k
>> +.endm
>> +
>> +.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
>> +        umlal           \a0\().4s, \s0\().4h, \k
>> +        umlal2          \a1\().4s, \s0\().8h, \k
>> +        umlal           \a2\().4s, \s1\().4h, \k
>> +        umlal2          \a3\().4s, \s1\().8h, \k
>> +.endm
>> +
>> +.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
>> +        umlsl           \a0\().4s, \s0\().4h, \k
>> +        umlsl2          \a1\().4s, \s0\().8h, \k
>> +        umlsl           \a2\().4s, \s1\().4h, \k
>> +        umlsl2          \a3\().4s, \s1\().8h, \k
>> +.endm
>> +
>> +// static const uint16_t coef_lf[2] = { 4309, 213 };
>> +// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
>> +// static const uint16_t coef_sp[2] = { 5077, 981 };
>> +
>> +        .align 16
>
>Note that .align for arm is power of two; this triggers a 2^16 byte 
>alignment here, which certainly isn't intended.

Yikes! I'll swap that for a .balign now I've looked that up

>But in general, the usual way of defining constants is with a 
>const/endconst block, which places them in the right rdata section instead 
>of in the text section. But that then requires you to use a movrel macro 
>for accessing the data, instead of a plain ldr instruction.

Yeah - arm has a history of mixing text & const - I went with the
simpler code. Is this a deal breaker or can I leave it as is?

JC

>> +coeffs:
>> +        .hword          4309 * 4, 213 * 4               // lf[0]*4 = v0.h[0]
>> +        .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>> +        .hword          5077, 981                       // sp[0] = v0.h[6]
>> +
>> --
>
>
>// Martin
Martin Storsjö July 2, 2023, 8:07 p.m. UTC | #3
On Sun, 2 Jul 2023, John Cox wrote:

> On Sun, 2 Jul 2023 00:35:14 +0300 (EEST), you wrote:
>
>> On Thu, 29 Jun 2023, John Cox wrote:
>>
>>> Add macros for dual scalar half->single multiply and accumulate
>>> Add macro for shift, saturate and shorten single to byte
>>> Add filter constants
>>>
>>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>>> ---
>>> libavfilter/aarch64/vf_bwdif_neon.S | 46 +++++++++++++++++++++++++++++
>>> 1 file changed, 46 insertions(+)
>>>
>>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
>>> index 639ab22998..a8f0ed525a 100644
>>> --- a/libavfilter/aarch64/vf_bwdif_neon.S
>>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
>>> @@ -23,3 +23,49 @@
>>>
>>> #include "libavutil/aarch64/asm.S"
>>>
>>> +.macro SQSHRUNN b, s0, s1, s2, s3, n
>>> +        sqshrun         \s0\().4h, \s0\().4s, #\n - 8
>>> +        sqshrun2        \s0\().8h, \s1\().4s, #\n - 8
>>> +        sqshrun         \s1\().4h, \s2\().4s, #\n - 8
>>> +        sqshrun2        \s1\().8h, \s3\().4s, #\n - 8
>>> +        uzp2            \b\().16b, \s0\().16b, \s1\().16b
>>> +.endm
>>> +
>>> +.macro SMULL4K a0, a1, a2, a3, s0, s1, k
>>> +        smull           \a0\().4s, \s0\().4h, \k
>>> +        smull2          \a1\().4s, \s0\().8h, \k
>>> +        smull           \a2\().4s, \s1\().4h, \k
>>> +        smull2          \a3\().4s, \s1\().8h, \k
>>> +.endm
>>> +
>>> +.macro UMULL4K a0, a1, a2, a3, s0, s1, k
>>> +        umull           \a0\().4s, \s0\().4h, \k
>>> +        umull2          \a1\().4s, \s0\().8h, \k
>>> +        umull           \a2\().4s, \s1\().4h, \k
>>> +        umull2          \a3\().4s, \s1\().8h, \k
>>> +.endm
>>> +
>>> +.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
>>> +        umlal           \a0\().4s, \s0\().4h, \k
>>> +        umlal2          \a1\().4s, \s0\().8h, \k
>>> +        umlal           \a2\().4s, \s1\().4h, \k
>>> +        umlal2          \a3\().4s, \s1\().8h, \k
>>> +.endm
>>> +
>>> +.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
>>> +        umlsl           \a0\().4s, \s0\().4h, \k
>>> +        umlsl2          \a1\().4s, \s0\().8h, \k
>>> +        umlsl           \a2\().4s, \s1\().4h, \k
>>> +        umlsl2          \a3\().4s, \s1\().8h, \k
>>> +.endm
>>> +
>>> +// static const uint16_t coef_lf[2] = { 4309, 213 };
>>> +// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
>>> +// static const uint16_t coef_sp[2] = { 5077, 981 };
>>> +
>>> +        .align 16
>>
>> Note that .align for arm is power of two; this triggers a 2^16 byte
>> alignment here, which certainly isn't intended.
>
> Yikes! I'll swap that for a .balign now I've looked that up
>
>> But in general, the usual way of defining constants is with a
>> const/endconst block, which places them in the right rdata section instead
>> of in the text section. But that then requires you to use a movrel macro
>> for accessing the data, instead of a plain ldr instruction.
>
> Yeah - arm has a history of mixing text & const - I went with the
> simpler code. Is this a deal breaker or can I leave it as is?

I wouldn't treat it as a deal breaker as long as it works across all 
platforms - even if consistency with the code style elsewhere is 
preferred, but IIRC there may be issues with MS armasm (after passed 
through gas-preprocessor). IIRC there might be issues with starting out 
with straight up content without the full setup made by the function/const 
macros. OTOH I might have fixed that in gas-preprocessor too...

Last time around, the patchset failed building in that configuration due 
ot the out of range alignment, I'll see how it fares now.

// Martin
Martin Storsjö July 2, 2023, 9:02 p.m. UTC | #4
On Sun, 2 Jul 2023, Martin Storsjö wrote:

> On Sun, 2 Jul 2023, John Cox wrote:
>
>> On Sun, 2 Jul 2023 00:35:14 +0300 (EEST), you wrote:
>> 
>>> On Thu, 29 Jun 2023, John Cox wrote:
>>> 
>>>> Add macros for dual scalar half->single multiply and accumulate
>>>> Add macro for shift, saturate and shorten single to byte
>>>> Add filter constants
>>>> 
>>>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>>>> ---
>>>> libavfilter/aarch64/vf_bwdif_neon.S | 46 +++++++++++++++++++++++++++++
>>>> 1 file changed, 46 insertions(+)
>>>> 
>>>> +
>>>> +        .align 16
>>> 
>>> Note that .align for arm is power of two; this triggers a 2^16 byte
>>> alignment here, which certainly isn't intended.
>> 
>> Yikes! I'll swap that for a .balign now I've looked that up
>> 
>>> But in general, the usual way of defining constants is with a
>>> const/endconst block, which places them in the right rdata section instead
>>> of in the text section. But that then requires you to use a movrel macro
>>> for accessing the data, instead of a plain ldr instruction.
>> 
>> Yeah - arm has a history of mixing text & const - I went with the
>> simpler code. Is this a deal breaker or can I leave it as is?
>
> I wouldn't treat it as a deal breaker as long as it works across all 
> platforms - even if consistency with the code style elsewhere is preferred, 
> but IIRC there may be issues with MS armasm (after passed through 
> gas-preprocessor). IIRC there might be issues with starting out with straight 
> up content without the full setup made by the function/const macros. OTOH I 
> might have fixed that in gas-preprocessor too...
>
> Last time around, the patchset failed building in that configuration due ot 
> the out of range alignment, I'll see how it fares now.

I'm sorry, but I'd just recommend you to go with the const macros.

Your current patch fails because gas-preprocessor, 
https://github.com/ffmpeg/gas-preprocessor, doesn't support the .balign 
directive, it only recognizes .align and .p2align. (Extending it to 
support it would be trivial though.)

If I change your code to ".align 4", I get the following warning:

\home\martin\code\ffmpeg-msvc-arm64\libavfilter\aarch64\vf_bwdif_neon.o.asm(1011) 
: warning A4228: Alignment value exceeds AREA alignment; alignment not 
guaranteed
         ALIGN 16

Since we haven't started any section, apparently armasm defaults to a 
section with 4 byte alignment.

But anyway, regardless of the alignment, it later fails with this error:

\home\martin\code\ffmpeg-msvc-arm64\libavfilter\aarch64\vf_bwdif_neon.o.asm(1051) 
: error A2504: operand 2: Expected address
         ldr             q0, coeffs


So I would request you to just go with the macros we use elsewhere. The 
gas-preprocessor/armasm setup doesn't support/expect any random assembly, 
but the disciplined subset we normally write. In most cases, we 
essentially never write bare directives in the code, but only use the 
macros from asm.S, which are set up to handle portability across all 
supported platforms and their toolchains.

// Martin
John Cox July 3, 2023, 8:31 a.m. UTC | #5
On Mon, 3 Jul 2023 00:02:27 +0300 (EEST), you wrote:

>On Sun, 2 Jul 2023, Martin Storsjö wrote:
>
>> On Sun, 2 Jul 2023, John Cox wrote:
>>
>>> On Sun, 2 Jul 2023 00:35:14 +0300 (EEST), you wrote:
>>> 
>>>> On Thu, 29 Jun 2023, John Cox wrote:
>>>> 
>>>>> Add macros for dual scalar half->single multiply and accumulate
>>>>> Add macro for shift, saturate and shorten single to byte
>>>>> Add filter constants
>>>>> 
>>>>> Signed-off-by: John Cox <jc@kynesim.co.uk>
>>>>> ---
>>>>> libavfilter/aarch64/vf_bwdif_neon.S | 46 +++++++++++++++++++++++++++++
>>>>> 1 file changed, 46 insertions(+)
>>>>> 
>>>>> +
>>>>> +        .align 16
>>>> 
>>>> Note that .align for arm is power of two; this triggers a 2^16 byte
>>>> alignment here, which certainly isn't intended.
>>> 
>>> Yikes! I'll swap that for a .balign now I've looked that up
>>> 
>>>> But in general, the usual way of defining constants is with a
>>>> const/endconst block, which places them in the right rdata section instead
>>>> of in the text section. But that then requires you to use a movrel macro
>>>> for accessing the data, instead of a plain ldr instruction.
>>> 
>>> Yeah - arm has a history of mixing text & const - I went with the
>>> simpler code. Is this a deal breaker or can I leave it as is?
>>
>> I wouldn't treat it as a deal breaker as long as it works across all 
>> platforms - even if consistency with the code style elsewhere is preferred, 
>> but IIRC there may be issues with MS armasm (after passed through 
>> gas-preprocessor). IIRC there might be issues with starting out with straight 
>> up content without the full setup made by the function/const macros. OTOH I 
>> might have fixed that in gas-preprocessor too...
>>
>> Last time around, the patchset failed building in that configuration due ot 
>> the out of range alignment, I'll see how it fares now.
>
>I'm sorry, but I'd just recommend you to go with the const macros.
>
>Your current patch fails because gas-preprocessor, 
>https://github.com/ffmpeg/gas-preprocessor, doesn't support the .balign 
>directive, it only recognizes .align and .p2align. (Extending it to 
>support it would be trivial though.)
>
>If I change your code to ".align 4", I get the following warning:
>
>\home\martin\code\ffmpeg-msvc-arm64\libavfilter\aarch64\vf_bwdif_neon.o.asm(1011) 
>: warning A4228: Alignment value exceeds AREA alignment; alignment not 
>guaranteed
>         ALIGN 16
>
>Since we haven't started any section, apparently armasm defaults to a 
>section with 4 byte alignment.
>
>But anyway, regardless of the alignment, it later fails with this error:
>
>\home\martin\code\ffmpeg-msvc-arm64\libavfilter\aarch64\vf_bwdif_neon.o.asm(1051) 
>: error A2504: operand 2: Expected address
>         ldr             q0, coeffs
>
>
>So I would request you to just go with the macros we use elsewhere. The 
>gas-preprocessor/armasm setup doesn't support/expect any random assembly, 
>but the disciplined subset we normally write. In most cases, we 
>essentially never write bare directives in the code, but only use the 
>macros from asm.S, which are set up to handle portability across all 
>supported platforms and their toolchains.

OK - will do.

JC

>// Martin
diff mbox series

Patch

diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index 639ab22998..a8f0ed525a 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -23,3 +23,49 @@ 
 
 #include "libavutil/aarch64/asm.S"
 
+.macro SQSHRUNN b, s0, s1, s2, s3, n
+        sqshrun         \s0\().4h, \s0\().4s, #\n - 8
+        sqshrun2        \s0\().8h, \s1\().4s, #\n - 8
+        sqshrun         \s1\().4h, \s2\().4s, #\n - 8
+        sqshrun2        \s1\().8h, \s3\().4s, #\n - 8
+        uzp2            \b\().16b, \s0\().16b, \s1\().16b
+.endm
+
+.macro SMULL4K a0, a1, a2, a3, s0, s1, k
+        smull           \a0\().4s, \s0\().4h, \k
+        smull2          \a1\().4s, \s0\().8h, \k
+        smull           \a2\().4s, \s1\().4h, \k
+        smull2          \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMULL4K a0, a1, a2, a3, s0, s1, k
+        umull           \a0\().4s, \s0\().4h, \k
+        umull2          \a1\().4s, \s0\().8h, \k
+        umull           \a2\().4s, \s1\().4h, \k
+        umull2          \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
+        umlal           \a0\().4s, \s0\().4h, \k
+        umlal2          \a1\().4s, \s0\().8h, \k
+        umlal           \a2\().4s, \s1\().4h, \k
+        umlal2          \a3\().4s, \s1\().8h, \k
+.endm
+
+.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
+        umlsl           \a0\().4s, \s0\().4h, \k
+        umlsl2          \a1\().4s, \s0\().8h, \k
+        umlsl           \a2\().4s, \s1\().4h, \k
+        umlsl2          \a3\().4s, \s1\().8h, \k
+.endm
+
+// static const uint16_t coef_lf[2] = { 4309, 213 };
+// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
+// static const uint16_t coef_sp[2] = { 5077, 981 };
+
+        .align 16
+coeffs:
+        .hword          4309 * 4, 213 * 4               // lf[0]*4 = v0.h[0]
+        .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
+        .hword          5077, 981                       // sp[0] = v0.h[6]
+