Message ID | AS8P250MB0744CEAADA3D52BA7559931B8F2A2@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM |
---|---|
State | Accepted |
Commit | f9aa5457ffe90a7f1bd8f80fdabbd417a262c94b |
Headers | show |
Series | [FFmpeg-devel,v2] avcodec/ppc/h264dsp: Fix unaligned stores | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On 3/13/2024 8:30 AM, Andreas Rheinhardt wrote: > Also fix an effective-type violation. > Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu > > Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> > --- > libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------ > 1 file changed, 17 insertions(+), 18 deletions(-) > > diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c > index c02733dda2..f50f2553a2 100644 > --- a/libavcodec/ppc/h264dsp.c > +++ b/libavcodec/ppc/h264dsp.c > @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride, > register vec_u8 r0, register vec_u8 r1, > register vec_u8 r2, register vec_u8 r3) { > DECLARE_ALIGNED(16, unsigned char, result)[64]; > - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; > - int int_dst_stride = dst_stride/4; > + uint32_t *src_int = (uint32_t *)result; > > vec_st(r0, 0, result); > vec_st(r1, 16, result); > vec_st(r2, 32, result); > vec_st(r3, 48, result); > /* FIXME: there has to be a better way!!!! */ > - *dst_int = *src_int; > - *(dst_int+ int_dst_stride) = *(src_int + 1); > - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); > - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); > - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); > - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); > - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); > - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); > - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); > - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); > - *(dst_int+10*int_dst_stride) = *(src_int + 10); > - *(dst_int+11*int_dst_stride) = *(src_int + 11); > - *(dst_int+12*int_dst_stride) = *(src_int + 12); > - *(dst_int+13*int_dst_stride) = *(src_int + 13); > - *(dst_int+14*int_dst_stride) = *(src_int + 14); > - *(dst_int+15*int_dst_stride) = *(src_int + 15); > + AV_WN32(dst, AV_RN32A(src_int + 0)); > + AV_WN32(dst + dst_stride, AV_RN32A(src_int + 1)); > + AV_WN32(dst + 2 * dst_stride, AV_RN32A(src_int + 2)); > + AV_WN32(dst + 3 * dst_stride, AV_RN32A(src_int + 3)); > + AV_WN32(dst + 4 * dst_stride, AV_RN32A(src_int + 4)); > + AV_WN32(dst + 5 * dst_stride, AV_RN32A(src_int + 5)); > + AV_WN32(dst + 6 * dst_stride, AV_RN32A(src_int + 6)); > + AV_WN32(dst + 7 * dst_stride, AV_RN32A(src_int + 7)); > + AV_WN32(dst + 8 * dst_stride, AV_RN32A(src_int + 8)); > + AV_WN32(dst + 9 * dst_stride, AV_RN32A(src_int + 9)); > + AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10)); > + AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11)); > + AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12)); > + AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13)); > + AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14)); > + AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15)); Is there any benefit using AV_RN32A() when src_int is already a pointer to a uint32_t? > } > > /** @brief performs a 6x16 transpose of data in src, and stores it to dst
James Almer: > On 3/13/2024 8:30 AM, Andreas Rheinhardt wrote: >> Also fix an effective-type violation. >> Exposed by >> https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu >> >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> >> --- >> libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------ >> 1 file changed, 17 insertions(+), 18 deletions(-) >> >> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c >> index c02733dda2..f50f2553a2 100644 >> --- a/libavcodec/ppc/h264dsp.c >> +++ b/libavcodec/ppc/h264dsp.c >> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int >> dst_stride, >> register vec_u8 r0, register vec_u8 r1, >> register vec_u8 r2, register vec_u8 r3) { >> DECLARE_ALIGNED(16, unsigned char, result)[64]; >> - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; >> - int int_dst_stride = dst_stride/4; >> + uint32_t *src_int = (uint32_t *)result; >> vec_st(r0, 0, result); >> vec_st(r1, 16, result); >> vec_st(r2, 32, result); >> vec_st(r3, 48, result); >> /* FIXME: there has to be a better way!!!! */ >> - *dst_int = *src_int; >> - *(dst_int+ int_dst_stride) = *(src_int + 1); >> - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); >> - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); >> - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); >> - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); >> - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); >> - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); >> - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); >> - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); >> - *(dst_int+10*int_dst_stride) = *(src_int + 10); >> - *(dst_int+11*int_dst_stride) = *(src_int + 11); >> - *(dst_int+12*int_dst_stride) = *(src_int + 12); >> - *(dst_int+13*int_dst_stride) = *(src_int + 13); >> - *(dst_int+14*int_dst_stride) = *(src_int + 14); >> - *(dst_int+15*int_dst_stride) = *(src_int + 15); >> + AV_WN32(dst, AV_RN32A(src_int + 0)); >> + AV_WN32(dst + dst_stride, AV_RN32A(src_int + 1)); >> + AV_WN32(dst + 2 * dst_stride, AV_RN32A(src_int + 2)); >> + AV_WN32(dst + 3 * dst_stride, AV_RN32A(src_int + 3)); >> + AV_WN32(dst + 4 * dst_stride, AV_RN32A(src_int + 4)); >> + AV_WN32(dst + 5 * dst_stride, AV_RN32A(src_int + 5)); >> + AV_WN32(dst + 6 * dst_stride, AV_RN32A(src_int + 6)); >> + AV_WN32(dst + 7 * dst_stride, AV_RN32A(src_int + 7)); >> + AV_WN32(dst + 8 * dst_stride, AV_RN32A(src_int + 8)); >> + AV_WN32(dst + 9 * dst_stride, AV_RN32A(src_int + 9)); >> + AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10)); >> + AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11)); >> + AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12)); >> + AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13)); >> + AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14)); >> + AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15)); > > Is there any benefit using AV_RN32A() when src_int is already a pointer > to a uint32_t? > Simply reading via src_int[idx] would be a violation of the effective-type rules (you would read from an array of unsigned char via an lvalue of type uint32_t). Alternatively, one could use a union of DECLARE_ALIGNED(16, unsigned char, result)[64] and uint32_t[16], the former to store these vectors, the latter to read the values from. - Andreas
Andreas: On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt <andreas.rheinhardt@outlook.com> wrote: > > Also fix an effective-type violation. > Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu > > Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> > --- > libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------ > 1 file changed, 17 insertions(+), 18 deletions(-) > > diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c > index c02733dda2..f50f2553a2 100644 > --- a/libavcodec/ppc/h264dsp.c > +++ b/libavcodec/ppc/h264dsp.c > @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride, > register vec_u8 r0, register vec_u8 r1, > register vec_u8 r2, register vec_u8 r3) { > DECLARE_ALIGNED(16, unsigned char, result)[64]; > - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; > - int int_dst_stride = dst_stride/4; > + uint32_t *src_int = (uint32_t *)result; > > vec_st(r0, 0, result); > vec_st(r1, 16, result); > vec_st(r2, 32, result); > vec_st(r3, 48, result); > /* FIXME: there has to be a better way!!!! */ > - *dst_int = *src_int; > - *(dst_int+ int_dst_stride) = *(src_int + 1); > - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); > - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); > - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); > - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); > - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); > - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); > - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); > - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); > - *(dst_int+10*int_dst_stride) = *(src_int + 10); > - *(dst_int+11*int_dst_stride) = *(src_int + 11); > - *(dst_int+12*int_dst_stride) = *(src_int + 12); > - *(dst_int+13*int_dst_stride) = *(src_int + 13); > - *(dst_int+14*int_dst_stride) = *(src_int + 14); > - *(dst_int+15*int_dst_stride) = *(src_int + 15); > + AV_WN32(dst, AV_RN32A(src_int + 0)); > + AV_WN32(dst + dst_stride, AV_RN32A(src_int + 1)); > + AV_WN32(dst + 2 * dst_stride, AV_RN32A(src_int + 2)); > + AV_WN32(dst + 3 * dst_stride, AV_RN32A(src_int + 3)); > + AV_WN32(dst + 4 * dst_stride, AV_RN32A(src_int + 4)); > + AV_WN32(dst + 5 * dst_stride, AV_RN32A(src_int + 5)); > + AV_WN32(dst + 6 * dst_stride, AV_RN32A(src_int + 6)); > + AV_WN32(dst + 7 * dst_stride, AV_RN32A(src_int + 7)); > + AV_WN32(dst + 8 * dst_stride, AV_RN32A(src_int + 8)); > + AV_WN32(dst + 9 * dst_stride, AV_RN32A(src_int + 9)); > + AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10)); > + AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11)); > + AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12)); > + AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13)); > + AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14)); > + AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15)); > } > > /** @brief performs a 6x16 transpose of data in src, and stores it to dst > -- > 2.40.1 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". First of all, thank you for looking into this. Second, do we feel that this change covers the FIXME immediately above it that exclaims "there has to be a better way!!!!"? If so, we can remove the comment. I did not perform a full FATE run as it is expensive on my QEMU setup, but I can confirm that this fixes the checkasm-h264dsp test under GCC UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le). Thanks, Sean McGovern
On 3/14/2024 4:13 PM, Sean McGovern wrote: > Andreas: > > On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt > <andreas.rheinhardt@outlook.com> wrote: >> >> Also fix an effective-type violation. >> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu >> >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> >> --- >> libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------ >> 1 file changed, 17 insertions(+), 18 deletions(-) >> >> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c >> index c02733dda2..f50f2553a2 100644 >> --- a/libavcodec/ppc/h264dsp.c >> +++ b/libavcodec/ppc/h264dsp.c >> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride, >> register vec_u8 r0, register vec_u8 r1, >> register vec_u8 r2, register vec_u8 r3) { >> DECLARE_ALIGNED(16, unsigned char, result)[64]; >> - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; >> - int int_dst_stride = dst_stride/4; >> + uint32_t *src_int = (uint32_t *)result; >> >> vec_st(r0, 0, result); >> vec_st(r1, 16, result); >> vec_st(r2, 32, result); >> vec_st(r3, 48, result); >> /* FIXME: there has to be a better way!!!! */ >> - *dst_int = *src_int; >> - *(dst_int+ int_dst_stride) = *(src_int + 1); >> - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); >> - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); >> - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); >> - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); >> - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); >> - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); >> - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); >> - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); >> - *(dst_int+10*int_dst_stride) = *(src_int + 10); >> - *(dst_int+11*int_dst_stride) = *(src_int + 11); >> - *(dst_int+12*int_dst_stride) = *(src_int + 12); >> - *(dst_int+13*int_dst_stride) = *(src_int + 13); >> - *(dst_int+14*int_dst_stride) = *(src_int + 14); >> - *(dst_int+15*int_dst_stride) = *(src_int + 15); >> + AV_WN32(dst, AV_RN32A(src_int + 0)); >> + AV_WN32(dst + dst_stride, AV_RN32A(src_int + 1)); >> + AV_WN32(dst + 2 * dst_stride, AV_RN32A(src_int + 2)); >> + AV_WN32(dst + 3 * dst_stride, AV_RN32A(src_int + 3)); >> + AV_WN32(dst + 4 * dst_stride, AV_RN32A(src_int + 4)); >> + AV_WN32(dst + 5 * dst_stride, AV_RN32A(src_int + 5)); >> + AV_WN32(dst + 6 * dst_stride, AV_RN32A(src_int + 6)); >> + AV_WN32(dst + 7 * dst_stride, AV_RN32A(src_int + 7)); >> + AV_WN32(dst + 8 * dst_stride, AV_RN32A(src_int + 8)); >> + AV_WN32(dst + 9 * dst_stride, AV_RN32A(src_int + 9)); >> + AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10)); >> + AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11)); >> + AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12)); >> + AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13)); >> + AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14)); >> + AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15)); >> } >> >> /** @brief performs a 6x16 transpose of data in src, and stores it to dst >> -- >> 2.40.1 >> >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > > First of all, thank you for looking into this. > > Second, do we feel that this change covers the FIXME immediately above > it that exclaims "there has to be a better way!!!!"? > If so, we can remove the comment. Doubt it. Even after Andreas' change it's essentially the same as before (load four bytes, write four bytes) but without the UB. The FIXME probably refers to finding a way to do this with vector intrinsics. > > I did not perform a full FATE run as it is expensive on my QEMU setup, > but I can confirm that this fixes the checkasm-h264dsp test under GCC > UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le).
Sean McGovern: > Andreas: > > On Wed, Mar 13, 2024 at 7:31 AM Andreas Rheinhardt > <andreas.rheinhardt@outlook.com> wrote: >> >> Also fix an effective-type violation. >> Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu >> >> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> >> --- >> libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------ >> 1 file changed, 17 insertions(+), 18 deletions(-) >> >> diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c >> index c02733dda2..f50f2553a2 100644 >> --- a/libavcodec/ppc/h264dsp.c >> +++ b/libavcodec/ppc/h264dsp.c >> @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride, >> register vec_u8 r0, register vec_u8 r1, >> register vec_u8 r2, register vec_u8 r3) { >> DECLARE_ALIGNED(16, unsigned char, result)[64]; >> - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; >> - int int_dst_stride = dst_stride/4; >> + uint32_t *src_int = (uint32_t *)result; >> >> vec_st(r0, 0, result); >> vec_st(r1, 16, result); >> vec_st(r2, 32, result); >> vec_st(r3, 48, result); >> /* FIXME: there has to be a better way!!!! */ >> - *dst_int = *src_int; >> - *(dst_int+ int_dst_stride) = *(src_int + 1); >> - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); >> - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); >> - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); >> - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); >> - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); >> - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); >> - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); >> - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); >> - *(dst_int+10*int_dst_stride) = *(src_int + 10); >> - *(dst_int+11*int_dst_stride) = *(src_int + 11); >> - *(dst_int+12*int_dst_stride) = *(src_int + 12); >> - *(dst_int+13*int_dst_stride) = *(src_int + 13); >> - *(dst_int+14*int_dst_stride) = *(src_int + 14); >> - *(dst_int+15*int_dst_stride) = *(src_int + 15); >> + AV_WN32(dst, AV_RN32A(src_int + 0)); >> + AV_WN32(dst + dst_stride, AV_RN32A(src_int + 1)); >> + AV_WN32(dst + 2 * dst_stride, AV_RN32A(src_int + 2)); >> + AV_WN32(dst + 3 * dst_stride, AV_RN32A(src_int + 3)); >> + AV_WN32(dst + 4 * dst_stride, AV_RN32A(src_int + 4)); >> + AV_WN32(dst + 5 * dst_stride, AV_RN32A(src_int + 5)); >> + AV_WN32(dst + 6 * dst_stride, AV_RN32A(src_int + 6)); >> + AV_WN32(dst + 7 * dst_stride, AV_RN32A(src_int + 7)); >> + AV_WN32(dst + 8 * dst_stride, AV_RN32A(src_int + 8)); >> + AV_WN32(dst + 9 * dst_stride, AV_RN32A(src_int + 9)); >> + AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10)); >> + AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11)); >> + AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12)); >> + AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13)); >> + AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14)); >> + AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15)); >> } >> >> /** @brief performs a 6x16 transpose of data in src, and stores it to dst >> -- >> 2.40.1 >> >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > > First of all, thank you for looking into this. > > Second, do we feel that this change covers the FIXME immediately above > it that exclaims "there has to be a better way!!!!"? > If so, we can remove the comment. I don't think so. This code comes from a time when FFmpeg did not care about the effective-type-rules or about alignment due to undefined behaviour; it only cared about alignment when it led to crashes. The old discussion confirms this: https://ffmpeg.org/pipermail/ffmpeg-devel/2007-May/034609.html https://ffmpeg.org/pipermail/ffmpeg-devel/2007-May/034612.html contains the following: "As I said, I submitted this patch in order to have PPC users get some speed-up now rather than having a hypothetic optimal code when some of us who work on Altivec sit down and work on it. I do think it's better to have a committed faster code that leaves room for improvement than a fastest code that never sees the light." The fixme relates to this; it was probably considered advantageous to avoid storing the vectors in stack buffers. > > I did not perform a full FATE run as it is expensive on my QEMU setup, > but I can confirm that this fixes the checkasm-h264dsp test under GCC > UBsan there as well as on a POWER7 (ppc64) and a POWER9 (ppc64le). > Ok, will apply then. Thanks for testing. - Andreas
diff --git a/libavcodec/ppc/h264dsp.c b/libavcodec/ppc/h264dsp.c index c02733dda2..f50f2553a2 100644 --- a/libavcodec/ppc/h264dsp.c +++ b/libavcodec/ppc/h264dsp.c @@ -401,30 +401,29 @@ static inline void write16x4(uint8_t *dst, int dst_stride, register vec_u8 r0, register vec_u8 r1, register vec_u8 r2, register vec_u8 r3) { DECLARE_ALIGNED(16, unsigned char, result)[64]; - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; - int int_dst_stride = dst_stride/4; + uint32_t *src_int = (uint32_t *)result; vec_st(r0, 0, result); vec_st(r1, 16, result); vec_st(r2, 32, result); vec_st(r3, 48, result); /* FIXME: there has to be a better way!!!! */ - *dst_int = *src_int; - *(dst_int+ int_dst_stride) = *(src_int + 1); - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); - *(dst_int+10*int_dst_stride) = *(src_int + 10); - *(dst_int+11*int_dst_stride) = *(src_int + 11); - *(dst_int+12*int_dst_stride) = *(src_int + 12); - *(dst_int+13*int_dst_stride) = *(src_int + 13); - *(dst_int+14*int_dst_stride) = *(src_int + 14); - *(dst_int+15*int_dst_stride) = *(src_int + 15); + AV_WN32(dst, AV_RN32A(src_int + 0)); + AV_WN32(dst + dst_stride, AV_RN32A(src_int + 1)); + AV_WN32(dst + 2 * dst_stride, AV_RN32A(src_int + 2)); + AV_WN32(dst + 3 * dst_stride, AV_RN32A(src_int + 3)); + AV_WN32(dst + 4 * dst_stride, AV_RN32A(src_int + 4)); + AV_WN32(dst + 5 * dst_stride, AV_RN32A(src_int + 5)); + AV_WN32(dst + 6 * dst_stride, AV_RN32A(src_int + 6)); + AV_WN32(dst + 7 * dst_stride, AV_RN32A(src_int + 7)); + AV_WN32(dst + 8 * dst_stride, AV_RN32A(src_int + 8)); + AV_WN32(dst + 9 * dst_stride, AV_RN32A(src_int + 9)); + AV_WN32(dst + 10 * dst_stride, AV_RN32A(src_int + 10)); + AV_WN32(dst + 11 * dst_stride, AV_RN32A(src_int + 11)); + AV_WN32(dst + 12 * dst_stride, AV_RN32A(src_int + 12)); + AV_WN32(dst + 13 * dst_stride, AV_RN32A(src_int + 13)); + AV_WN32(dst + 14 * dst_stride, AV_RN32A(src_int + 14)); + AV_WN32(dst + 15 * dst_stride, AV_RN32A(src_int + 15)); } /** @brief performs a 6x16 transpose of data in src, and stores it to dst
Also fix an effective-type violation. Exposed by https://fate.ffmpeg.org/report.cgi?time=20240312011016&slot=ppc-linux-gcc-13.2-ubsan-altivec-qemu Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com> --- libavcodec/ppc/h264dsp.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-)