diff mbox series

[FFmpeg-devel] swscale: aarch64: Avoid using the x18 register

Message ID 20200418214842.26470-1-martin@martin.st
State Accepted
Commit 872790b1f9be65d5fba2ddc0357f989001ecbd33
Headers show
Series [FFmpeg-devel] swscale: aarch64: Avoid using the x18 register | expand

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Martin Storsjö April 18, 2020, 9:48 p.m. UTC
x18 is a platform-reserved register on Darwin and Windows, so it must not be used as a scratch register there.

x8/w8 appear to be unused in this function, though (as do x10 and
x14), so there's really no reason to use x18 here - just change
the uses of x18/w18 to x8/w8 instead, without any further rewrites.
---
 libswscale/aarch64/hscale.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

Comments

Martin Storsjö April 19, 2020, 9:10 p.m. UTC | #1
On Sun, 19 Apr 2020, Martin Storsjö wrote:

> The x18 is a reserved platform register on Darwin and Windows.
>
> x8/w8 seems to be unused in this function though (and same about
> x10 and x14), so there's really no reason to use x18 here - just change
> the uses of x18/w18 into x8/w8 instead without any further rewrites.
> ---
> libswscale/aarch64/hscale.S | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
> index 8743183b51..ae73014a25 100644
> --- a/libswscale/aarch64/hscale.S
> +++ b/libswscale/aarch64/hscale.S
> @@ -22,7 +22,7 @@
> 
> function ff_hscale_8_to_15_neon, export=1
>         sbfiz               x7, x6, #1, #32             // filterSize*2 (*2 because int16)
> -1:      ldr                 w18, [x5], #4               // filterPos[idx]
> +1:      ldr                 w8, [x5], #4                // filterPos[idx]
>         ldr                 w0, [x5], #4                // filterPos[idx + 1]
>         ldr                 w11, [x5], #4               // filterPos[idx + 2]
>         ldr                 w9, [x5], #4                // filterPos[idx + 3]
> @@ -34,14 +34,14 @@ function ff_hscale_8_to_15_neon, export=1
>         movi                v1.2D, #0                   // val sum part 2 (for dst[1])
>         movi                v2.2D, #0                   // val sum part 3 (for dst[2])
>         movi                v3.2D, #0                   // val sum part 4 (for dst[3])
> -        add                 x17, x3, w18, UXTW          // srcp + filterPos[0]
> -        add                 x18, x3, w0, UXTW           // srcp + filterPos[1]
> +        add                 x17, x3, w8, UXTW           // srcp + filterPos[0]
> +        add                 x8,  x3, w0, UXTW           // srcp + filterPos[1]
>         add                 x0, x3, w11, UXTW           // srcp + filterPos[2]
>         add                 x11, x3, w9, UXTW           // srcp + filterPos[3]
>         mov                 w15, w6                     // filterSize counter
> 2:      ld1                 {v4.8B}, [x17], #8          // srcp[filterPos[0] + {0..7}]
>         ld1                 {v5.8H}, [x16], #16         // load 8x16-bit filter values, part 1
> -        ld1                 {v6.8B}, [x18], #8          // srcp[filterPos[1] + {0..7}]
> +        ld1                 {v6.8B}, [x8], #8           // srcp[filterPos[1] + {0..7}]
>         ld1                 {v7.8H}, [x12], #16         // load 8x16-bit at filter+filterSize
>         uxtl                v4.8H, v4.8B                // unpack part 1 to 16-bit
>         smlal               v0.4S, v4.4H, v5.4H         // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
> -- 
> 2.17.1

Pushed this one, as it passes the tests, fixes the use of the reserved x18
register, and otherwise seems indisputable.

// Martin
diff mbox series

Patch

diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index 8743183b51..ae73014a25 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -22,7 +22,7 @@ 
 
 function ff_hscale_8_to_15_neon, export=1
         sbfiz               x7, x6, #1, #32             // filterSize*2 (*2 because int16)
-1:      ldr                 w18, [x5], #4               // filterPos[idx]
+1:      ldr                 w8, [x5], #4                // filterPos[idx]
         ldr                 w0, [x5], #4                // filterPos[idx + 1]
         ldr                 w11, [x5], #4               // filterPos[idx + 2]
         ldr                 w9, [x5], #4                // filterPos[idx + 3]
@@ -34,14 +34,14 @@  function ff_hscale_8_to_15_neon, export=1
         movi                v1.2D, #0                   // val sum part 2 (for dst[1])
         movi                v2.2D, #0                   // val sum part 3 (for dst[2])
         movi                v3.2D, #0                   // val sum part 4 (for dst[3])
-        add                 x17, x3, w18, UXTW          // srcp + filterPos[0]
-        add                 x18, x3, w0, UXTW           // srcp + filterPos[1]
+        add                 x17, x3, w8, UXTW           // srcp + filterPos[0]
+        add                 x8,  x3, w0, UXTW           // srcp + filterPos[1]
         add                 x0, x3, w11, UXTW           // srcp + filterPos[2]
         add                 x11, x3, w9, UXTW           // srcp + filterPos[3]
         mov                 w15, w6                     // filterSize counter
 2:      ld1                 {v4.8B}, [x17], #8          // srcp[filterPos[0] + {0..7}]
         ld1                 {v5.8H}, [x16], #16         // load 8x16-bit filter values, part 1
-        ld1                 {v6.8B}, [x18], #8          // srcp[filterPos[1] + {0..7}]
+        ld1                 {v6.8B}, [x8], #8           // srcp[filterPos[1] + {0..7}]
         ld1                 {v7.8H}, [x12], #16         // load 8x16-bit at filter+filterSize
         uxtl                v4.8H, v4.8B                // unpack part 1 to 16-bit
         smlal               v0.4S, v4.4H, v5.4H         // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]