diff mbox series

[FFmpeg-devel,v2] avcodec/ppc/h264dsp: Fix unaligned stores

Message ID AS8P250MB0744CEAADA3D52BA7559931B8F2A2@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM
State Accepted
Commit f9aa5457ffe90a7f1bd8f80fdabbd417a262c94b
Headers show
Series [FFmpeg-devel,v2] avcodec/ppc/h264dsp: Fix unaligned stores | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Andreas Rheinhardt March 13, 2024, 11:30 a.m. UTC
Also fix an effective-type violation.
Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
 libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

Comments

James Almer March 13, 2024, 12:04 p.m. UTC | #1
On 3/13/2024 8:30 AM, Andreas Rheinhardt wrote:
> Also fix an effective-type violation.
> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
>   libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>   1 file changed, 17 insertions(+), 18 deletions(-)
> 
> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
> index c02733dda2..f50f2553a2 100644
> --- a/libavcodec/ppc/h264dsp.c
> +++ b/libavcodec/ppc/h264dsp.c
> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>                                register vec_u8 r0, register vec_u8 r1,
>                                register vec_u8 r2, register vec_u8 r3) {
>       DECLARE_ALIGNED(16, unsigned char, result)[64];
> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
> -    int int_dst_stride = dst_stride/4;
> +    uint32_t *src_int = (uint32_t *)result;
>   
>       vec_st(r0, 0, result);
>       vec_st(r1, 16, result);
>       vec_st(r2, 32, result);
>       vec_st(r3, 48, result);
>       /* FIXME: there has to be a better way!!!! */
> -    *dst_int = *src_int;
> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));

Is there any benefit using AV_RN32A() when src_int is already a pointer 
to a uint32_t?

>   }
>   
>   /** @brief performs a 6x16 transpose of data in src, and stores it to dst
Andreas Rheinhardt March 13, 2024, 12:10 p.m. UTC | #2
James Almer:
> On 3/13/2024 8:30 AM, Andreas Rheinhardt wrote:
>> Also fix an effective-type violation.
>> Exposed by
>> https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
>> ---
>>   libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>>   1 file changed, 17 insertions(+), 18 deletions(-)
>>
>> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
>> index c02733dda2..f50f2553a2 100644
>> --- a/libavcodec/ppc/h264dsp.c
>> +++ b/libavcodec/ppc/h264dsp.c
>> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int
>> dst_stride,
>>                                register vec_u8 r0, register vec_u8 r1,
>>                                register vec_u8 r2, register vec_u8 r3) {
>>       DECLARE_ALIGNED(16, unsigned char, result)[64];
>> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
>> -    int int_dst_stride = dst_stride/4;
>> +    uint32_t *src_int = (uint32_t *)result;
>>         vec_st(r0, 0, result);
>>       vec_st(r1, 16, result);
>>       vec_st(r2, 32, result);
>>       vec_st(r3, 48, result);
>>       /* FIXME: there has to be a better way!!!! */
>> -    *dst_int = *src_int;
>> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
>> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
>> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
>> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
>> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
>> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
>> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
>> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
>> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
>> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
>> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
>> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
>> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
>> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
>> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
>> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
>> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
>> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
>> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
>> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
>> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
>> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
>> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
>> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
>> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
>> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
>> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
>> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
>> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
>> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
>> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
> 
> Is there any benefit using AV_RN32A() when src_int is already a pointer
> to a uint32_t?
> 

Simply reading via src_int[idx] would be a violation of the
effective-type rules (you would read from an array of unsigned char via
an lvalue of type uint32_t).
Alternatively, one could use a union of DECLARE_ALIGNED(16, unsigned
char, result)[64] and uint32_t[16], the former to store these vectors,
the latter to read the values from.

- Andreas
Sean McGovern March 14, 2024, 7:13 p.m. UTC | #3
Andreas:

On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt
<andreas.rheinhardt@outlook.com> wrote:
>
> Also fix an effective-type violation.
> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
>  libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>  1 file changed, 17 insertions(+), 18 deletions(-)
>
> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
> index c02733dda2..f50f2553a2 100644
> --- a/libavcodec/ppc/h264dsp.c
> +++ b/libavcodec/ppc/h264dsp.c
> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>                               register vec_u8 r0, register vec_u8 r1,
>                               register vec_u8 r2, register vec_u8 r3) {
>      DECLARE_ALIGNED(16, unsigned char, result)[64];
> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
> -    int int_dst_stride = dst_stride/4;
> +    uint32_t *src_int = (uint32_t *)result;
>
>      vec_st(r0, 0, result);
>      vec_st(r1, 16, result);
>      vec_st(r2, 32, result);
>      vec_st(r3, 48, result);
>      /* FIXME: there has to be a better way!!!! */
> -    *dst_int = *src_int;
> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
>  }
>
>  /** @brief performs a 6x16 transpose of data in src, and stores it to dst
> --
> 2.40.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

First of all, thank you for looking into this.

Second, do we feel that this change covers the FIXME immediately above
it that exclaims "there has to be a better way!!!!"?
If so, we can remove the comment.

I did not perform a full FATE run as it is expensive on my QEMU setup,
but I can confirm that this fixes the checkasm-h264dsp test under GCC
UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le).

Thanks,
Sean McGovern
James Almer March 14, 2024, 7:23 p.m. UTC | #4
On 3/14/2024 4:13 PM, Sean McGovern wrote:
> Andreas:
> 
> On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt
> <andreas.rheinhardt@outlook.com> wrote:
>>
>> Also fix an effective-type violation.
>> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
>> ---
>>   libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>>   1 file changed, 17 insertions(+), 18 deletions(-)
>>
>> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
>> index c02733dda2..f50f2553a2 100644
>> --- a/libavcodec/ppc/h264dsp.c
>> +++ b/libavcodec/ppc/h264dsp.c
>> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>>                                register vec_u8 r0, register vec_u8 r1,
>>                                register vec_u8 r2, register vec_u8 r3) {
>>       DECLARE_ALIGNED(16, unsigned char, result)[64];
>> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
>> -    int int_dst_stride = dst_stride/4;
>> +    uint32_t *src_int = (uint32_t *)result;
>>
>>       vec_st(r0, 0, result);
>>       vec_st(r1, 16, result);
>>       vec_st(r2, 32, result);
>>       vec_st(r3, 48, result);
>>       /* FIXME: there has to be a better way!!!! */
>> -    *dst_int = *src_int;
>> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
>> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
>> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
>> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
>> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
>> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
>> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
>> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
>> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
>> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
>> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
>> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
>> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
>> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
>> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
>> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
>> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
>> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
>> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
>> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
>> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
>> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
>> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
>> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
>> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
>> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
>> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
>> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
>> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
>> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
>> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
>>   }
>>
>>   /** @brief performs a 6x16 transpose of data in src, and stores it to dst
>> --
>> 2.40.1
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> 
> First of all, thank you for looking into this.
> 
> Second, do we feel that this change covers the FIXME immediately above
> it that exclaims "there has to be a better way!!!!"?
> If so, we can remove the comment.

Doubt it. Even after Andreas' change it's essentially the same as before 
(load four bytes, write four bytes) but without the UB. The FIXME 
probably refers to finding a way to do this with vector intrinsics.

> 
> I did not perform a full FATE run as it is expensive on my QEMU setup,
> but I can confirm that this fixes the checkasm-h264dsp test under GCC
> UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le).
Andreas Rheinhardt March 14, 2024, 7:34 p.m. UTC | #5
Sean McGovern:
> Andreas:
> 
> On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt
> <andreas.rheinhardt@outlook.com> wrote:
>>
>> Also fix an effective-type violation.
>> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
>> ---
>>  libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------
>>  1 file changed, 17 insertions(+), 18 deletions(-)
>>
>> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
>> index c02733dda2..f50f2553a2 100644
>> --- a/libavcodec/ppc/h264dsp.c
>> +++ b/libavcodec/ppc/h264dsp.c
>> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
>>                               register vec_u8 r0, register vec_u8 r1,
>>                               register vec_u8 r2, register vec_u8 r3) {
>>      DECLARE_ALIGNED(16, unsigned char, result)[64];
>> -    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
>> -    int int_dst_stride = dst_stride/4;
>> +    uint32_t *src_int = (uint32_t *)result;
>>
>>      vec_st(r0, 0, result);
>>      vec_st(r1, 16, result);
>>      vec_st(r2, 32, result);
>>      vec_st(r3, 48, result);
>>      /* FIXME: there has to be a better way!!!! */
>> -    *dst_int = *src_int;
>> -    *(dst_int+   int_dst_stride) = *(src_int + 1);
>> -    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
>> -    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
>> -    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
>> -    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
>> -    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
>> -    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
>> -    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
>> -    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
>> -    *(dst_int+10*int_dst_stride) = *(src_int + 10);
>> -    *(dst_int+11*int_dst_stride) = *(src_int + 11);
>> -    *(dst_int+12*int_dst_stride) = *(src_int + 12);
>> -    *(dst_int+13*int_dst_stride) = *(src_int + 13);
>> -    *(dst_int+14*int_dst_stride) = *(src_int + 14);
>> -    *(dst_int+15*int_dst_stride) = *(src_int + 15);
>> +    AV_WN32(dst,                   AV_RN32A(src_int + 0));
>> +    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
>> +    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
>> +    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
>> +    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
>> +    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
>> +    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
>> +    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
>> +    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
>> +    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
>> +    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
>> +    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
>> +    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
>> +    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
>> +    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
>> +    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
>>  }
>>
>>  /** @brief performs a 6x16 transpose of data in src, and stores it to dst
>> --
>> 2.40.1
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> 
> First of all, thank you for looking into this.
> 
> Second, do we feel that this change covers the FIXME immediately above
> it that exclaims "there has to be a better way!!!!"?
> If so, we can remove the comment.

I don't think so. This code comes from a time when FFmpeg did not care
about the effective-type-rules or about alignment due to undefined
behaviour; it only cared about alignment when it led to crashes.
The old discussion confirms this:
https://ffmpeg.org/pipermail/ffmpeg-devel/2007-May/034609.html
https://ffmpeg.org/pipermail/ffmpeg-devel/2007-May/034612.html contains
the following:
"As I said, I submitted this patch in order to have PPC users get some
speed-up now rather than having a hypothetic optimal code when some of
us who work on Altivec sit down and work on it.

I do think it's better to have a committed faster code that leaves
room for improvement than a fastest code that never sees the light."

The fixme relates to this; it was probably considered advantageous to
avoid storing the vectors in stack buffers.

> 
> I did not perform a full FATE run as it is expensive on my QEMU setup,
> but I can confirm that this fixes the checkasm-h264dsp test under GCC
> UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le).
> 

Ok, will apply then. Thanks for testing.

- Andreas
diff mbox series

Patch

diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c
index c02733dda2..f50f2553a2 100644
--- a/libavcodec/ppc/h264dsp.c
+++ b/libavcodec/ppc/h264dsp.c
@@ -401,30 +401,29 @@  static inline void write16x4(uint8_t *dst, int dst_stride,
                              register vec_u8 r0, register vec_u8 r1,
                              register vec_u8 r2, register vec_u8 r3) {
     DECLARE_ALIGNED(16, unsigned char, result)[64];
-    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
-    int int_dst_stride = dst_stride/4;
+    uint32_t *src_int = (uint32_t *)result;
 
     vec_st(r0, 0, result);
     vec_st(r1, 16, result);
     vec_st(r2, 32, result);
     vec_st(r3, 48, result);
     /* FIXME: there has to be a better way!!!! */
-    *dst_int = *src_int;
-    *(dst_int+   int_dst_stride) = *(src_int + 1);
-    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
-    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
-    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
-    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
-    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
-    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
-    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
-    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
-    *(dst_int+10*int_dst_stride) = *(src_int + 10);
-    *(dst_int+11*int_dst_stride) = *(src_int + 11);
-    *(dst_int+12*int_dst_stride) = *(src_int + 12);
-    *(dst_int+13*int_dst_stride) = *(src_int + 13);
-    *(dst_int+14*int_dst_stride) = *(src_int + 14);
-    *(dst_int+15*int_dst_stride) = *(src_int + 15);
+    AV_WN32(dst,                   AV_RN32A(src_int + 0));
+    AV_WN32(dst +      dst_stride, AV_RN32A(src_int + 1));
+    AV_WN32(dst +  2 * dst_stride, AV_RN32A(src_int + 2));
+    AV_WN32(dst +  3 * dst_stride, AV_RN32A(src_int + 3));
+    AV_WN32(dst +  4 * dst_stride, AV_RN32A(src_int + 4));
+    AV_WN32(dst +  5 * dst_stride, AV_RN32A(src_int + 5));
+    AV_WN32(dst +  6 * dst_stride, AV_RN32A(src_int + 6));
+    AV_WN32(dst +  7 * dst_stride, AV_RN32A(src_int + 7));
+    AV_WN32(dst +  8 * dst_stride, AV_RN32A(src_int + 8));
+    AV_WN32(dst +  9 * dst_stride, AV_RN32A(src_int + 9));
+    AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10));
+    AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11));
+    AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12));
+    AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13));
+    AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14));
+    AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15));
 }
 
 /** @brief performs a 6x16 transpose of data in src, and stores it to dst