Message ID | 20200418214842.26470-1-martin@martin.st |
---|---|
State | Accepted |
Commit | 872790b1f9be65d5fba2ddc0357f989001ecbd33 |
Headers | show |
Series | [FFmpeg-devel] swscale: aarch64: Avoid using the x18 register | expand |
Context | Check | Description |
---|---|---|
andriy/default | pending | |
andriy/make | success | Make finished |
andriy/make_fate | success | Make fate finished |
On Sun, 19 Apr 2020, Martin Storsjö wrote:

> The x18 is a reserved platform register on Darwin and Windows.
>
> x8/w8 seems to be unused in this function though (and same about
> x10 and x14), so there's really no reason to use x18 here - just change
> the uses of x18/w18 into x8/w8 instead without any further rewrites.
> ---
> libswscale/aarch64/hscale.S | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
> index 8743183b51..ae73014a25 100644
> --- a/libswscale/aarch64/hscale.S
> +++ b/libswscale/aarch64/hscale.S
> @@ -22,7 +22,7 @@
>
> function ff_hscale_8_to_15_neon, export=1
> sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16)
> -1: ldr w18, [x5], #4 // filterPos[idx]
> +1: ldr w8, [x5], #4 // filterPos[idx]
> ldr w0, [x5], #4 // filterPos[idx + 1]
> ldr w11, [x5], #4 // filterPos[idx + 2]
> ldr w9, [x5], #4 // filterPos[idx + 3]
> @@ -34,14 +34,14 @@ function ff_hscale_8_to_15_neon, export=1
> movi v1.2D, #0 // val sum part 2 (for dst[1])
> movi v2.2D, #0 // val sum part 3 (for dst[2])
> movi v3.2D, #0 // val sum part 4 (for dst[3])
> - add x17, x3, w18, UXTW // srcp + filterPos[0]
> - add x18, x3, w0, UXTW // srcp + filterPos[1]
> + add x17, x3, w8, UXTW // srcp + filterPos[0]
> + add x8, x3, w0, UXTW // srcp + filterPos[1]
> add x0, x3, w11, UXTW // srcp + filterPos[2]
> add x11, x3, w9, UXTW // srcp + filterPos[3]
> mov w15, w6 // filterSize counter
> 2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}]
> ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1
> - ld1 {v6.8B}, [x18], #8 // srcp[filterPos[1] + {0..7}]
> + ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}]
> ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize
> uxtl v4.8H, v4.8B // unpack part 1 to 16-bit
> smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
> --
> 2.17.1

Pushed this one, as it passes tests and fixes things and otherwise seems
indisputable.

// Martin
diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S index 8743183b51..ae73014a25 100644 --- a/libswscale/aarch64/hscale.S +++ b/libswscale/aarch64/hscale.S @@ -22,7 +22,7 @@ function ff_hscale_8_to_15_neon, export=1 sbfiz x7, x6, #1, #32 // filterSize*2 (*2 because int16) -1: ldr w18, [x5], #4 // filterPos[idx] +1: ldr w8, [x5], #4 // filterPos[idx] ldr w0, [x5], #4 // filterPos[idx + 1] ldr w11, [x5], #4 // filterPos[idx + 2] ldr w9, [x5], #4 // filterPos[idx + 3] @@ -34,14 +34,14 @@ function ff_hscale_8_to_15_neon, export=1 movi v1.2D, #0 // val sum part 2 (for dst[1]) movi v2.2D, #0 // val sum part 3 (for dst[2]) movi v3.2D, #0 // val sum part 4 (for dst[3]) - add x17, x3, w18, UXTW // srcp + filterPos[0] - add x18, x3, w0, UXTW // srcp + filterPos[1] + add x17, x3, w8, UXTW // srcp + filterPos[0] + add x8, x3, w0, UXTW // srcp + filterPos[1] add x0, x3, w11, UXTW // srcp + filterPos[2] add x11, x3, w9, UXTW // srcp + filterPos[3] mov w15, w6 // filterSize counter 2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}] ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 - ld1 {v6.8B}, [x18], #8 // srcp[filterPos[1] + {0..7}] + ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}] ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize uxtl v4.8H, v4.8B // unpack part 1 to 16-bit smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]