[FFmpeg-devel] lavfi/nlmeans: fixup aarch64 assembly with clang

Submitted by Jan Ekström on July 26, 2018, 9:03 p.m.

Details

Message ID 20180726210346.28828-1-jeebjp@gmail.com
State New
Headers show

Commit Message

Jan Ekström July 26, 2018, 9:03 p.m.
Clang is more strict about some things.
---
 libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Comments

Clément Bœsch July 27, 2018, 8:14 a.m.
On Fri, Jul 27, 2018 at 12:03:46AM +0300, Jan Ekström wrote:
> Clang is more strict about some things.
> ---
>  libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S b/libavfilter/aarch64/vf_nlmeans_neon.S
> index 6308a428db..ac16157bbd 100644
> --- a/libavfilter/aarch64/vf_nlmeans_neon.S
> +++ b/libavfilter/aarch64/vf_nlmeans_neon.S
> @@ -22,7 +22,7 @@
>  
>  // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
>  .macro acc_sum_store x, xb
> -        dup             v24.4S, v24.4S[3]                               // ...X -> XXXX
> +        dup             v24.4s, v24.s[3]                                // ...X -> XXXX

can you keep the capitalized form?

>          ext             v25.16B, v26.16B, \xb, #12                      // ext(0000,ABCD,12)=0ABC
>          add             v24.4S, v24.4S, \x                              // XXXX+ABCD={X+A,X+B,X+C,X+D}
>          add             v24.4S, v24.4S, v25.4S                          // {X+A,X+B+A,X+C+B,X+D+C}       (+0ABC)
> @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
>          movi            v26.4S, #0                                      // used as zero for the "rotations" in acc_sum_store
>          sub             x3, x3, w6, UXTW                                // s1 padding (s1_linesize - w)
>          sub             x5, x5, w6, UXTW                                // s2 padding (s2_linesize - w)
> -        sub             x9, x0, x1, UXTW #2                             // dst_top
> +        sub             x9, x0, w1, UXTW #2                             // dst_top
>          sub             x1, x1, w6, UXTW                                // dst padding (dst_linesize_32 - w)
>          lsl             x1, x1, #2                                      // dst padding expressed in bytes
>  1:      mov             w10, w6                                         // width copy for each line

LGTM otherwise, thx
Jan Ekström July 28, 2018, 2:45 p.m.
On Fri, Jul 27, 2018 at 11:14 AM, Clément Bœsch <u@pkh.me> wrote:
> On Fri, Jul 27, 2018 at 12:03:46AM +0300, Jan Ekström wrote:
>> Clang is more strict about some things.
>> ---
>>  libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S b/libavfilter/aarch64/vf_nlmeans_neon.S
>> index 6308a428db..ac16157bbd 100644
>> --- a/libavfilter/aarch64/vf_nlmeans_neon.S
>> +++ b/libavfilter/aarch64/vf_nlmeans_neon.S
>> @@ -22,7 +22,7 @@
>>
>>  // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
>>  .macro acc_sum_store x, xb
>> -        dup             v24.4S, v24.4S[3]                               // ...X -> XXXX
>> +        dup             v24.4s, v24.s[3]                                // ...X -> XXXX
>
> can you keep the capitalized form?
>
>>          ext             v25.16B, v26.16B, \xb, #12                      // ext(0000,ABCD,12)=0ABC
>>          add             v24.4S, v24.4S, \x                              // XXXX+ABCD={X+A,X+B,X+C,X+D}
>>          add             v24.4S, v24.4S, v25.4S                          // {X+A,X+B+A,X+C+B,X+D+C}       (+0ABC)
>> @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
>>          movi            v26.4S, #0                                      // used as zero for the "rotations" in acc_sum_store
>>          sub             x3, x3, w6, UXTW                                // s1 padding (s1_linesize - w)
>>          sub             x5, x5, w6, UXTW                                // s2 padding (s2_linesize - w)
>> -        sub             x9, x0, x1, UXTW #2                             // dst_top
>> +        sub             x9, x0, w1, UXTW #2                             // dst_top
>>          sub             x1, x1, w6, UXTW                                // dst padding (dst_linesize_32 - w)
>>          lsl             x1, x1, #2                                      // dst padding expressed in bytes
>>  1:      mov             w10, w6                                         // width copy for each line
>
> LGTM otherwise, thx
>

Fixed the capitalization with the s suffix and used "fix" instead of
"fixup" in the commit message due to IRC review.

Pushed, and default compilation configuration should now be fixed
again with aarch64+clang.

Jan

Patch hide | download patch | download mbox

diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S b/libavfilter/aarch64/vf_nlmeans_neon.S
index 6308a428db..ac16157bbd 100644
--- a/libavfilter/aarch64/vf_nlmeans_neon.S
+++ b/libavfilter/aarch64/vf_nlmeans_neon.S
@@ -22,7 +22,7 @@ 
 
 // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
 .macro acc_sum_store x, xb
-        dup             v24.4S, v24.4S[3]                               // ...X -> XXXX
+        dup             v24.4s, v24.s[3]                                // ...X -> XXXX
         ext             v25.16B, v26.16B, \xb, #12                      // ext(0000,ABCD,12)=0ABC
         add             v24.4S, v24.4S, \x                              // XXXX+ABCD={X+A,X+B,X+C,X+D}
         add             v24.4S, v24.4S, v25.4S                          // {X+A,X+B+A,X+C+B,X+D+C}       (+0ABC)
@@ -37,7 +37,7 @@  function ff_compute_safe_ssd_integral_image_neon, export=1
         movi            v26.4S, #0                                      // used as zero for the "rotations" in acc_sum_store
         sub             x3, x3, w6, UXTW                                // s1 padding (s1_linesize - w)
         sub             x5, x5, w6, UXTW                                // s2 padding (s2_linesize - w)
-        sub             x9, x0, x1, UXTW #2                             // dst_top
+        sub             x9, x0, w1, UXTW #2                             // dst_top
         sub             x1, x1, w6, UXTW                                // dst padding (dst_linesize_32 - w)
         lsl             x1, x1, #2                                      // dst padding expressed in bytes
 1:      mov             w10, w6                                         // width copy for each line