diff mbox series

[FFmpeg-devel,v2] avcodec/startcode: Avoid unaligned accesses

Message ID AS8P250MB0744B14A56741101CC55826D8F239@AS8P250MB0744.EURP250.PROD.OUTLOOK.COM
State Accepted
Headers show
Series [FFmpeg-devel,v2] avcodec/startcode: Avoid unaligned accesses | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Andreas Rheinhardt Oct. 11, 2022, 10:20 p.m. UTC
Up until now, ff_startcode_find_candidate_c() simply casts
an uint8_t* to uint64_t*/uint32_t* to read 64/32 bits at a time
in case HAVE_FAST_UNALIGNED is true. Yet this ignores the
alignment requirement of these types as well as effective type
rules of the C standard. This commit therefore replaces these
direct accesses with AV_RN64/32; this also improves
readability.

UBSan reported these unaligned accesses which happened in 233
FATE-tests involving H.264 and VC-1 (this has also been reported
in tickets #8138 and #8485); these tests are fixed by this commit.

The output of GCC with -O3 is unchanged for aarch64, alpha, arm,
loongarch, ppc and x64. There was only a slight difference for mips.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
---
This is v2 of https://patchwork.ffmpeg.org/project/ffmpeg/patch/20200122145210.6898-1-andreas.rheinhardt@gmail.com/

Here is the mips code before this change:

startcode_old_O3.o:     file format elf64-tradlittlemips


Disassembly of section .text:

0000000000000000 <ff_startcode_find_candidate_c>:
   0:	18a00029 	blez	a1,a8 <ff_startcode_find_candidate_c+0xa8>
   4:	3c08ff7f 	lui	a4,0xff7f
   8:	3c07ff01 	lui	a3,0xff01
   c:	65087f7f 	daddiu	a4,a4,32639
  10:	64e70101 	daddiu	a3,a3,257
  14:	00084438 	dsll	a4,a4,0x10
  18:	00073c38 	dsll	a3,a3,0x10
  1c:	65087f7f 	daddiu	a4,a4,32639
  20:	64e70101 	daddiu	a3,a3,257
  24:	00084478 	dsll	a4,a4,0x11
  28:	00073df8 	dsll	a3,a3,0x17
  2c:	00803025 	move	a2,a0
  30:	00001025 	move	v0,zero
  34:	3508feff 	ori	a4,a4,0xfeff
  38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
  3c:	34e78080 	ori	a3,a3,0x8080
  40:	24420008 	addiu	v0,v0,8
  44:	0045182a 	slt	v1,v0,a1
  48:	10600015 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
  4c:	00000000 	nop
  50:	dcc30000 	ld	v1,0(a2)
  54:	0068482d 	daddu	a5,v1,a4
  58:	44a30000 	dmtc1	v1,$f0
  5c:	44a90800 	dmtc1	a5,$f1
  60:	4be10002 	pandn	$f0,$f0,$f1
  64:	44230000 	dmfc1	v1,$f0
  68:	00671824 	and	v1,v1,a3
  6c:	1060fff4 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
  70:	64c60008 	daddiu	a2,a2,8
  74:	0045182a 	slt	v1,v0,a1
  78:	10600009 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
  7c:	0082182d 	daddu	v1,a0,v0
  80:	10000005 	b	98 <ff_startcode_find_candidate_c+0x98>
  84:	90640000 	lbu	a0,0(v1)
  88:	24420001 	addiu	v0,v0,1
  8c:	10a20008 	beq	a1,v0,b0 <ff_startcode_find_candidate_c+0xb0>
  90:	00000000 	nop
  94:	90640000 	lbu	a0,0(v1)
  98:	1480fffb 	bnez	a0,88 <ff_startcode_find_candidate_c+0x88>
  9c:	64630001 	daddiu	v1,v1,1
  a0:	03e00008 	jr	ra
  a4:	00000000 	nop
  a8:	03e00008 	jr	ra
  ac:	00001025 	move	v0,zero
  b0:	03e00008 	jr	ra
  b4:	00a01025 	move	v0,a1
	...

And here after this change:

startcode_new_O3.o:     file format elf64-tradlittlemips


Disassembly of section .text:

0000000000000000 <ff_startcode_find_candidate_c>:
   0:	18a0002b 	blez	a1,b0 <ff_startcode_find_candidate_c+0xb0>
   4:	3c08ff7f 	lui	a4,0xff7f
   8:	3c07ff01 	lui	a3,0xff01
   c:	65087f7f 	daddiu	a4,a4,32639
  10:	64e70101 	daddiu	a3,a3,257
  14:	00084438 	dsll	a4,a4,0x10
  18:	00073c38 	dsll	a3,a3,0x10
  1c:	65087f7f 	daddiu	a4,a4,32639
  20:	64e70101 	daddiu	a3,a3,257
  24:	00084478 	dsll	a4,a4,0x11
  28:	00073df8 	dsll	a3,a3,0x17
  2c:	00803025 	move	a2,a0
  30:	00001025 	move	v0,zero
  34:	3508feff 	ori	a4,a4,0xfeff
  38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
  3c:	34e78080 	ori	a3,a3,0x8080
  40:	24420008 	addiu	v0,v0,8
  44:	0045182a 	slt	v1,v0,a1
  48:	10600017 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
  4c:	00000000 	nop
  50:	68c30007 	ldl	v1,7(a2)
  54:	6cc30000 	ldr	v1,0(a2)
  58:	0068482d 	daddu	a5,v1,a4
  5c:	44a30000 	dmtc1	v1,$f0
  60:	44a90800 	dmtc1	a5,$f1
  64:	4be10002 	pandn	$f0,$f0,$f1
  68:	44230000 	dmfc1	v1,$f0
  6c:	00671824 	and	v1,v1,a3
  70:	1060fff3 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
  74:	64c60008 	daddiu	a2,a2,8
  78:	0045182a 	slt	v1,v0,a1
  7c:	1060000a 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
  80:	0082182d 	daddu	v1,a0,v0
  84:	10000006 	b	a0 <ff_startcode_find_candidate_c+0xa0>
  88:	90640000 	lbu	a0,0(v1)
  8c:	00000000 	nop
  90:	24420001 	addiu	v0,v0,1
  94:	10a20008 	beq	a1,v0,b8 <ff_startcode_find_candidate_c+0xb8>
  98:	00000000 	nop
  9c:	90640000 	lbu	a0,0(v1)
  a0:	1480fffb 	bnez	a0,90 <ff_startcode_find_candidate_c+0x90>
  a4:	64630001 	daddiu	v1,v1,1
  a8:	03e00008 	jr	ra
  ac:	00000000 	nop
  b0:	03e00008 	jr	ra
  b4:	00001025 	move	v0,zero
  b8:	03e00008 	jr	ra
  bc:	00a01025 	move	v0,a1

As one can see, the difference is that an ld has been replaced
by a pair of ldl and ldr. I don't know the performance implications
of this.

 libavcodec/startcode.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

Comments

Anton Khirnov Oct. 14, 2022, 6:42 a.m. UTC | #1
Quoting Andreas Rheinhardt (2022-10-12 00:20:23)
> Up until now, ff_startcode_find_candidate_c() simply casts
> an uint8_t* to uint64_t*/uint32_t* to read 64/32 bits at a time
> in case HAVE_FAST_UNALIGNED is true. Yet this ignores the
> alignment requirement of these types as well as effective type
> rules of the C standard. This commit therefore replaces these
> direct accesses with AV_RN64/32; this also improves
> readability.
> 
> UBSan reported these unaligned accesses which happened in 233
> FATE-tests involving H.264 and VC-1 (this has also been reported
> in tickets #8138 and #8485); these tests are fixed by this commit.
> 
> The output of GCC with -O3 is unchanged for aarch64, alpha, arm,
> loongarch, ppc and x64. There was only a slight difference for mips.
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
> This is v2 of https://patchwork.ffmpeg.org/project/ffmpeg/patch/20200122145210.6898-1-andreas.rheinhardt@gmail.com/
> 
> Here is the mips code before this change:
> 
> startcode_old_O3.o:     file format elf64-tradlittlemips
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <ff_startcode_find_candidate_c>:
>    0:	18a00029 	blez	a1,a8 <ff_startcode_find_candidate_c+0xa8>
>    4:	3c08ff7f 	lui	a4,0xff7f
>    8:	3c07ff01 	lui	a3,0xff01
>    c:	65087f7f 	daddiu	a4,a4,32639
>   10:	64e70101 	daddiu	a3,a3,257
>   14:	00084438 	dsll	a4,a4,0x10
>   18:	00073c38 	dsll	a3,a3,0x10
>   1c:	65087f7f 	daddiu	a4,a4,32639
>   20:	64e70101 	daddiu	a3,a3,257
>   24:	00084478 	dsll	a4,a4,0x11
>   28:	00073df8 	dsll	a3,a3,0x17
>   2c:	00803025 	move	a2,a0
>   30:	00001025 	move	v0,zero
>   34:	3508feff 	ori	a4,a4,0xfeff
>   38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
>   3c:	34e78080 	ori	a3,a3,0x8080
>   40:	24420008 	addiu	v0,v0,8
>   44:	0045182a 	slt	v1,v0,a1
>   48:	10600015 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
>   4c:	00000000 	nop
>   50:	dcc30000 	ld	v1,0(a2)
>   54:	0068482d 	daddu	a5,v1,a4
>   58:	44a30000 	dmtc1	v1,$f0
>   5c:	44a90800 	dmtc1	a5,$f1
>   60:	4be10002 	pandn	$f0,$f0,$f1
>   64:	44230000 	dmfc1	v1,$f0
>   68:	00671824 	and	v1,v1,a3
>   6c:	1060fff4 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
>   70:	64c60008 	daddiu	a2,a2,8
>   74:	0045182a 	slt	v1,v0,a1
>   78:	10600009 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
>   7c:	0082182d 	daddu	v1,a0,v0
>   80:	10000005 	b	98 <ff_startcode_find_candidate_c+0x98>
>   84:	90640000 	lbu	a0,0(v1)
>   88:	24420001 	addiu	v0,v0,1
>   8c:	10a20008 	beq	a1,v0,b0 <ff_startcode_find_candidate_c+0xb0>
>   90:	00000000 	nop
>   94:	90640000 	lbu	a0,0(v1)
>   98:	1480fffb 	bnez	a0,88 <ff_startcode_find_candidate_c+0x88>
>   9c:	64630001 	daddiu	v1,v1,1
>   a0:	03e00008 	jr	ra
>   a4:	00000000 	nop
>   a8:	03e00008 	jr	ra
>   ac:	00001025 	move	v0,zero
>   b0:	03e00008 	jr	ra
>   b4:	00a01025 	move	v0,a1
> 	...
> 
> And here after this change:
> 
> startcode_new_O3.o:     file format elf64-tradlittlemips
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <ff_startcode_find_candidate_c>:
>    0:	18a0002b 	blez	a1,b0 <ff_startcode_find_candidate_c+0xb0>
>    4:	3c08ff7f 	lui	a4,0xff7f
>    8:	3c07ff01 	lui	a3,0xff01
>    c:	65087f7f 	daddiu	a4,a4,32639
>   10:	64e70101 	daddiu	a3,a3,257
>   14:	00084438 	dsll	a4,a4,0x10
>   18:	00073c38 	dsll	a3,a3,0x10
>   1c:	65087f7f 	daddiu	a4,a4,32639
>   20:	64e70101 	daddiu	a3,a3,257
>   24:	00084478 	dsll	a4,a4,0x11
>   28:	00073df8 	dsll	a3,a3,0x17
>   2c:	00803025 	move	a2,a0
>   30:	00001025 	move	v0,zero
>   34:	3508feff 	ori	a4,a4,0xfeff
>   38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
>   3c:	34e78080 	ori	a3,a3,0x8080
>   40:	24420008 	addiu	v0,v0,8
>   44:	0045182a 	slt	v1,v0,a1
>   48:	10600017 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
>   4c:	00000000 	nop
>   50:	68c30007 	ldl	v1,7(a2)
>   54:	6cc30000 	ldr	v1,0(a2)
>   58:	0068482d 	daddu	a5,v1,a4
>   5c:	44a30000 	dmtc1	v1,$f0
>   60:	44a90800 	dmtc1	a5,$f1
>   64:	4be10002 	pandn	$f0,$f0,$f1
>   68:	44230000 	dmfc1	v1,$f0
>   6c:	00671824 	and	v1,v1,a3
>   70:	1060fff3 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
>   74:	64c60008 	daddiu	a2,a2,8
>   78:	0045182a 	slt	v1,v0,a1
>   7c:	1060000a 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
>   80:	0082182d 	daddu	v1,a0,v0
>   84:	10000006 	b	a0 <ff_startcode_find_candidate_c+0xa0>
>   88:	90640000 	lbu	a0,0(v1)
>   8c:	00000000 	nop
>   90:	24420001 	addiu	v0,v0,1
>   94:	10a20008 	beq	a1,v0,b8 <ff_startcode_find_candidate_c+0xb8>
>   98:	00000000 	nop
>   9c:	90640000 	lbu	a0,0(v1)
>   a0:	1480fffb 	bnez	a0,90 <ff_startcode_find_candidate_c+0x90>
>   a4:	64630001 	daddiu	v1,v1,1
>   a8:	03e00008 	jr	ra
>   ac:	00000000 	nop
>   b0:	03e00008 	jr	ra
>   b4:	00001025 	move	v0,zero
>   b8:	03e00008 	jr	ra
>   bc:	00a01025 	move	v0,a1
> 
> As one can see, the difference is that an ld has been replaced
> by a pair of ldl and ldr. I don't know the performance implications
> of this.

google tells me ld is an aligned load, so this change is then correct
whatever its performance implications are.
Andreas Rheinhardt Oct. 14, 2022, 11:42 p.m. UTC | #2
Anton Khirnov:
> Quoting Andreas Rheinhardt (2022-10-12 00:20:23)
>> Up until now, ff_startcode_find_candidate_c() simply casts
>> an uint8_t* to uint64_t*/uint32_t* to read 64/32 bits at a time
>> in case HAVE_FAST_UNALIGNED is true. Yet this ignores the
>> alignment requirement of these types as well as effective type
>> rules of the C standard. This commit therefore replaces these
>> direct accesses with AV_RN64/32; this also improves
>> readability.
>>
>> UBSan reported these unaligned accesses which happened in 233
>> FATE-tests involving H.264 and VC-1 (this has also been reported
>> in tickets #8138 and #8485); these tests are fixed by this commit.
>>
>> The output of GCC with -O3 is unchanged for aarch64, alpha, arm,
>> loongarch, ppc and x64. There was only a slight difference for mips.
>>
>> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
>> ---
>> This is v2 of https://patchwork.ffmpeg.org/project/ffmpeg/patch/20200122145210.6898-1-andreas.rheinhardt@gmail.com/
>>
>> Here is the mips code before this change:
>>
>> startcode_old_O3.o:     file format elf64-tradlittlemips
>>
>>
>> Disassembly of section .text:
>>
>> 0000000000000000 <ff_startcode_find_candidate_c>:
>>    0:	18a00029 	blez	a1,a8 <ff_startcode_find_candidate_c+0xa8>
>>    4:	3c08ff7f 	lui	a4,0xff7f
>>    8:	3c07ff01 	lui	a3,0xff01
>>    c:	65087f7f 	daddiu	a4,a4,32639
>>   10:	64e70101 	daddiu	a3,a3,257
>>   14:	00084438 	dsll	a4,a4,0x10
>>   18:	00073c38 	dsll	a3,a3,0x10
>>   1c:	65087f7f 	daddiu	a4,a4,32639
>>   20:	64e70101 	daddiu	a3,a3,257
>>   24:	00084478 	dsll	a4,a4,0x11
>>   28:	00073df8 	dsll	a3,a3,0x17
>>   2c:	00803025 	move	a2,a0
>>   30:	00001025 	move	v0,zero
>>   34:	3508feff 	ori	a4,a4,0xfeff
>>   38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
>>   3c:	34e78080 	ori	a3,a3,0x8080
>>   40:	24420008 	addiu	v0,v0,8
>>   44:	0045182a 	slt	v1,v0,a1
>>   48:	10600015 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
>>   4c:	00000000 	nop
>>   50:	dcc30000 	ld	v1,0(a2)
>>   54:	0068482d 	daddu	a5,v1,a4
>>   58:	44a30000 	dmtc1	v1,$f0
>>   5c:	44a90800 	dmtc1	a5,$f1
>>   60:	4be10002 	pandn	$f0,$f0,$f1
>>   64:	44230000 	dmfc1	v1,$f0
>>   68:	00671824 	and	v1,v1,a3
>>   6c:	1060fff4 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
>>   70:	64c60008 	daddiu	a2,a2,8
>>   74:	0045182a 	slt	v1,v0,a1
>>   78:	10600009 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
>>   7c:	0082182d 	daddu	v1,a0,v0
>>   80:	10000005 	b	98 <ff_startcode_find_candidate_c+0x98>
>>   84:	90640000 	lbu	a0,0(v1)
>>   88:	24420001 	addiu	v0,v0,1
>>   8c:	10a20008 	beq	a1,v0,b0 <ff_startcode_find_candidate_c+0xb0>
>>   90:	00000000 	nop
>>   94:	90640000 	lbu	a0,0(v1)
>>   98:	1480fffb 	bnez	a0,88 <ff_startcode_find_candidate_c+0x88>
>>   9c:	64630001 	daddiu	v1,v1,1
>>   a0:	03e00008 	jr	ra
>>   a4:	00000000 	nop
>>   a8:	03e00008 	jr	ra
>>   ac:	00001025 	move	v0,zero
>>   b0:	03e00008 	jr	ra
>>   b4:	00a01025 	move	v0,a1
>> 	...
>>
>> And here after this change:
>>
>> startcode_new_O3.o:     file format elf64-tradlittlemips
>>
>>
>> Disassembly of section .text:
>>
>> 0000000000000000 <ff_startcode_find_candidate_c>:
>>    0:	18a0002b 	blez	a1,b0 <ff_startcode_find_candidate_c+0xb0>
>>    4:	3c08ff7f 	lui	a4,0xff7f
>>    8:	3c07ff01 	lui	a3,0xff01
>>    c:	65087f7f 	daddiu	a4,a4,32639
>>   10:	64e70101 	daddiu	a3,a3,257
>>   14:	00084438 	dsll	a4,a4,0x10
>>   18:	00073c38 	dsll	a3,a3,0x10
>>   1c:	65087f7f 	daddiu	a4,a4,32639
>>   20:	64e70101 	daddiu	a3,a3,257
>>   24:	00084478 	dsll	a4,a4,0x11
>>   28:	00073df8 	dsll	a3,a3,0x17
>>   2c:	00803025 	move	a2,a0
>>   30:	00001025 	move	v0,zero
>>   34:	3508feff 	ori	a4,a4,0xfeff
>>   38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
>>   3c:	34e78080 	ori	a3,a3,0x8080
>>   40:	24420008 	addiu	v0,v0,8
>>   44:	0045182a 	slt	v1,v0,a1
>>   48:	10600017 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
>>   4c:	00000000 	nop
>>   50:	68c30007 	ldl	v1,7(a2)
>>   54:	6cc30000 	ldr	v1,0(a2)
>>   58:	0068482d 	daddu	a5,v1,a4
>>   5c:	44a30000 	dmtc1	v1,$f0
>>   60:	44a90800 	dmtc1	a5,$f1
>>   64:	4be10002 	pandn	$f0,$f0,$f1
>>   68:	44230000 	dmfc1	v1,$f0
>>   6c:	00671824 	and	v1,v1,a3
>>   70:	1060fff3 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
>>   74:	64c60008 	daddiu	a2,a2,8
>>   78:	0045182a 	slt	v1,v0,a1
>>   7c:	1060000a 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
>>   80:	0082182d 	daddu	v1,a0,v0
>>   84:	10000006 	b	a0 <ff_startcode_find_candidate_c+0xa0>
>>   88:	90640000 	lbu	a0,0(v1)
>>   8c:	00000000 	nop
>>   90:	24420001 	addiu	v0,v0,1
>>   94:	10a20008 	beq	a1,v0,b8 <ff_startcode_find_candidate_c+0xb8>
>>   98:	00000000 	nop
>>   9c:	90640000 	lbu	a0,0(v1)
>>   a0:	1480fffb 	bnez	a0,90 <ff_startcode_find_candidate_c+0x90>
>>   a4:	64630001 	daddiu	v1,v1,1
>>   a8:	03e00008 	jr	ra
>>   ac:	00000000 	nop
>>   b0:	03e00008 	jr	ra
>>   b4:	00001025 	move	v0,zero
>>   b8:	03e00008 	jr	ra
>>   bc:	00a01025 	move	v0,a1
>>
>> As one can see, the difference is that an ld has been replaced
>> by a pair of ldl and ldr. I don't know the performance implications
>> of this.
> 
> google tells me ld is an aligned load, so this change is then correct
> whatever its performance implications are.
> 

I was about to write the same after reading
https://www.cs.cmu.edu/afs/cs/academic/class/15740-f97/public/doc/mips-isa.pdf,
but then I found out that certain loongson processors allow unaligned
accesses (see
https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg08563.html) and
configure only enables fast_unaligned for
loongson2e|loongson2f|loongson3*); if fast_unaligned would lead to
crashes on these processors, it would probably have been noticed long
ago. So the question of performance implications on such processors is
still open.
If no one bothers to test this in a reasonable time, I will treat this
as sign that no one is interested in the affected arches and apply this.

- Andreas
Andreas Rheinhardt Oct. 15, 2022, 12:04 a.m. UTC | #3
Andreas Rheinhardt:
> Up until now, ff_startcode_find_candidate_c() simply casts
> an uint8_t* to uint64_t*/uint32_t* to read 64/32 bits at a time
> in case HAVE_FAST_UNALIGNED is true. Yet this ignores the
> alignment requirement of these types as well as effective type
> rules of the C standard. This commit therefore replaces these
> direct accesses with AV_RN64/32; this also improves
> readability.
> 
> UBSan reported these unaligned accesses which happened in 233
> FATE-tests involving H.264 and VC-1 (this has also been reported
> in tickets #8138 and #8485); these tests are fixed by this commit.
> 
> The output of GCC with -O3 is unchanged for aarch64, alpha, arm,
> loongarch, ppc and x64. There was only a slight difference for mips.
> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
> This is v2 of https://patchwork.ffmpeg.org/project/ffmpeg/patch/20200122145210.6898-1-andreas.rheinhardt@gmail.com/
> 
> Here is the mips code before this change:
> 
> startcode_old_O3.o:     file format elf64-tradlittlemips
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <ff_startcode_find_candidate_c>:
>    0:	18a00029 	blez	a1,a8 <ff_startcode_find_candidate_c+0xa8>
>    4:	3c08ff7f 	lui	a4,0xff7f
>    8:	3c07ff01 	lui	a3,0xff01
>    c:	65087f7f 	daddiu	a4,a4,32639
>   10:	64e70101 	daddiu	a3,a3,257
>   14:	00084438 	dsll	a4,a4,0x10
>   18:	00073c38 	dsll	a3,a3,0x10
>   1c:	65087f7f 	daddiu	a4,a4,32639
>   20:	64e70101 	daddiu	a3,a3,257
>   24:	00084478 	dsll	a4,a4,0x11
>   28:	00073df8 	dsll	a3,a3,0x17
>   2c:	00803025 	move	a2,a0
>   30:	00001025 	move	v0,zero
>   34:	3508feff 	ori	a4,a4,0xfeff
>   38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
>   3c:	34e78080 	ori	a3,a3,0x8080
>   40:	24420008 	addiu	v0,v0,8
>   44:	0045182a 	slt	v1,v0,a1
>   48:	10600015 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
>   4c:	00000000 	nop
>   50:	dcc30000 	ld	v1,0(a2)
>   54:	0068482d 	daddu	a5,v1,a4
>   58:	44a30000 	dmtc1	v1,$f0
>   5c:	44a90800 	dmtc1	a5,$f1
>   60:	4be10002 	pandn	$f0,$f0,$f1
>   64:	44230000 	dmfc1	v1,$f0
>   68:	00671824 	and	v1,v1,a3
>   6c:	1060fff4 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
>   70:	64c60008 	daddiu	a2,a2,8
>   74:	0045182a 	slt	v1,v0,a1
>   78:	10600009 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
>   7c:	0082182d 	daddu	v1,a0,v0
>   80:	10000005 	b	98 <ff_startcode_find_candidate_c+0x98>
>   84:	90640000 	lbu	a0,0(v1)
>   88:	24420001 	addiu	v0,v0,1
>   8c:	10a20008 	beq	a1,v0,b0 <ff_startcode_find_candidate_c+0xb0>
>   90:	00000000 	nop
>   94:	90640000 	lbu	a0,0(v1)
>   98:	1480fffb 	bnez	a0,88 <ff_startcode_find_candidate_c+0x88>
>   9c:	64630001 	daddiu	v1,v1,1
>   a0:	03e00008 	jr	ra
>   a4:	00000000 	nop
>   a8:	03e00008 	jr	ra
>   ac:	00001025 	move	v0,zero
>   b0:	03e00008 	jr	ra
>   b4:	00a01025 	move	v0,a1
> 	...
> 
> And here after this change:
> 
> startcode_new_O3.o:     file format elf64-tradlittlemips
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <ff_startcode_find_candidate_c>:
>    0:	18a0002b 	blez	a1,b0 <ff_startcode_find_candidate_c+0xb0>
>    4:	3c08ff7f 	lui	a4,0xff7f
>    8:	3c07ff01 	lui	a3,0xff01
>    c:	65087f7f 	daddiu	a4,a4,32639
>   10:	64e70101 	daddiu	a3,a3,257
>   14:	00084438 	dsll	a4,a4,0x10
>   18:	00073c38 	dsll	a3,a3,0x10
>   1c:	65087f7f 	daddiu	a4,a4,32639
>   20:	64e70101 	daddiu	a3,a3,257
>   24:	00084478 	dsll	a4,a4,0x11
>   28:	00073df8 	dsll	a3,a3,0x17
>   2c:	00803025 	move	a2,a0
>   30:	00001025 	move	v0,zero
>   34:	3508feff 	ori	a4,a4,0xfeff
>   38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
>   3c:	34e78080 	ori	a3,a3,0x8080
>   40:	24420008 	addiu	v0,v0,8
>   44:	0045182a 	slt	v1,v0,a1
>   48:	10600017 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
>   4c:	00000000 	nop
>   50:	68c30007 	ldl	v1,7(a2)
>   54:	6cc30000 	ldr	v1,0(a2)
>   58:	0068482d 	daddu	a5,v1,a4
>   5c:	44a30000 	dmtc1	v1,$f0
>   60:	44a90800 	dmtc1	a5,$f1
>   64:	4be10002 	pandn	$f0,$f0,$f1
>   68:	44230000 	dmfc1	v1,$f0
>   6c:	00671824 	and	v1,v1,a3
>   70:	1060fff3 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
>   74:	64c60008 	daddiu	a2,a2,8
>   78:	0045182a 	slt	v1,v0,a1
>   7c:	1060000a 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
>   80:	0082182d 	daddu	v1,a0,v0
>   84:	10000006 	b	a0 <ff_startcode_find_candidate_c+0xa0>
>   88:	90640000 	lbu	a0,0(v1)
>   8c:	00000000 	nop
>   90:	24420001 	addiu	v0,v0,1
>   94:	10a20008 	beq	a1,v0,b8 <ff_startcode_find_candidate_c+0xb8>
>   98:	00000000 	nop
>   9c:	90640000 	lbu	a0,0(v1)
>   a0:	1480fffb 	bnez	a0,90 <ff_startcode_find_candidate_c+0x90>
>   a4:	64630001 	daddiu	v1,v1,1
>   a8:	03e00008 	jr	ra
>   ac:	00000000 	nop
>   b0:	03e00008 	jr	ra
>   b4:	00001025 	move	v0,zero
>   b8:	03e00008 	jr	ra
>   bc:	00a01025 	move	v0,a1
> 
> As one can see, the difference is that an ld has been replaced
> by a pair of ldl and ldr. I don't know the performance implications
> of this.
> 
>  libavcodec/startcode.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
> index 9efdffe8c6..d84f326521 100644
> --- a/libavcodec/startcode.c
> +++ b/libavcodec/startcode.c
> @@ -25,6 +25,7 @@
>   * @author Michael Niedermayer <michaelni@gmx.at>
>   */
>  
> +#include "libavutil/intreadwrite.h"
>  #include "startcode.h"
>  #include "config.h"
>  
> @@ -38,14 +39,14 @@ int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
>       */
>  #if HAVE_FAST_64BIT
>      while (i < size &&
> -            !((~*(const uint64_t *)(buf + i) &
> -                    (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
> +            !((~AV_RN64(buf + i) &
> +                    (AV_RN64(buf + i) - 0x0101010101010101ULL)) &
>                      0x8080808080808080ULL))
>          i += 8;
>  #else
>      while (i < size &&
> -            !((~*(const uint32_t *)(buf + i) &
> -                    (*(const uint32_t *)(buf + i) - 0x01010101U)) &
> +            !((~AV_RN32(buf + i) &
> +                    (AV_RN32(buf + i) - 0x01010101U)) &
>                      0x80808080U))
>          i += 4;
>  #endif

Unfortunately, the list in the commit message is incorrect: configure
enables fast_unaligned for aarch64, ppc, x86, loongarch, certain
loongson mips processors and armv6-8. The above claim that assembly for
arm is unaffected by this change is not correct when one actually deals
with one of the CPUs that are marked as fast unaligned. When compiling
for armv6 the assembly changes from

startcode_old_O3.o:     file format elf32-littlearm


Disassembly of section .text:

00000000 <ff_startcode_find_candidate_c>:
   0:	2900      	cmp	r1, #0
   2:	dd20      	ble.n	46 <ff_startcode_find_candidate_c+0x46>
   4:	f1a0 0c04 	sub.w	ip, r0, #4
   8:	2300      	movs	r3, #0
   a:	b410      	push	{r4}
   c:	e002      	b.n	14 <ff_startcode_find_candidate_c+0x14>
   e:	3304      	adds	r3, #4
  10:	4299      	cmp	r1, r3
  12:	dd14      	ble.n	3e <ff_startcode_find_candidate_c+0x3e>
  14:	f85c 4f04 	ldr.w	r4, [ip, #4]!
  18:	f1a4 3201 	sub.w	r2, r4, #16843009	; 0x1010101
  1c:	ea22 0204 	bic.w	r2, r2, r4
  20:	f012 3f80 	tst.w	r2, #2155905152	; 0x80808080
  24:	d0f3      	beq.n	e <ff_startcode_find_candidate_c+0xe>
  26:	4299      	cmp	r1, r3
  28:	dd09      	ble.n	3e <ff_startcode_find_candidate_c+0x3e>
  2a:	1e5a      	subs	r2, r3, #1
  2c:	4402      	add	r2, r0
  2e:	e002      	b.n	36 <ff_startcode_find_candidate_c+0x36>
  30:	3301      	adds	r3, #1
  32:	4299      	cmp	r1, r3
  34:	d003      	beq.n	3e <ff_startcode_find_candidate_c+0x3e>
  36:	f812 0f01 	ldrb.w	r0, [r2, #1]!
  3a:	2800      	cmp	r0, #0
  3c:	d1f8      	bne.n	30 <ff_startcode_find_candidate_c+0x30>
  3e:	4618      	mov	r0, r3
  40:	f85d 4b04 	ldr.w	r4, [sp], #4
  44:	4770      	bx	lr
  46:	2300      	movs	r3, #0
  48:	4618      	mov	r0, r3
  4a:	4770      	bx	lr

to

startcode_new_O3.o:     file format elf32-littlearm


Disassembly of section .text:

00000000 <ff_startcode_find_candidate_c>:
   0:	2300      	movs	r3, #0
   2:	2900      	cmp	r1, #0
   4:	dc03      	bgt.n	e <ff_startcode_find_candidate_c+0xe>
   6:	e017      	b.n	38 <ff_startcode_find_candidate_c+0x38>
   8:	3304      	adds	r3, #4
   a:	4299      	cmp	r1, r3
   c:	dd14      	ble.n	38 <ff_startcode_find_candidate_c+0x38>
   e:	f850 c003 	ldr.w	ip, [r0, r3]
  12:	f1ac 3201 	sub.w	r2, ip, #16843009	; 0x1010101
  16:	ea22 020c 	bic.w	r2, r2, ip
  1a:	f012 3f80 	tst.w	r2, #2155905152	; 0x80808080
  1e:	d0f3      	beq.n	8 <ff_startcode_find_candidate_c+0x8>
  20:	4299      	cmp	r1, r3
  22:	dd09      	ble.n	38 <ff_startcode_find_candidate_c+0x38>
  24:	1e5a      	subs	r2, r3, #1
  26:	4402      	add	r2, r0
  28:	e002      	b.n	30 <ff_startcode_find_candidate_c+0x30>
  2a:	3301      	adds	r3, #1
  2c:	4299      	cmp	r1, r3
  2e:	d003      	beq.n	38 <ff_startcode_find_candidate_c+0x38>
  30:	f812 0f01 	ldrb.w	r0, [r2, #1]!
  34:	2800      	cmp	r0, #0
  36:	d1f8      	bne.n	2a <ff_startcode_find_candidate_c+0x2a>
  38:	4618      	mov	r0, r3
  3a:	4770      	bx	lr

Once again, I have no clue about the performance implications of this
and would welcome any feedback on this.

- Andreas
Andreas Rheinhardt Oct. 17, 2022, 2:48 p.m. UTC | #4
Andreas Rheinhardt:
> Up until now, ff_startcode_find_candidate_c() simply casts
> an uint8_t* to uint64_t*/uint32_t* to read 64/32 bits at a time
> in case HAVE_FAST_UNALIGNED is true. Yet this ignores the
> alignment requirement of these types as well as effective type
> rules of the C standard. This commit therefore replaces these
> direct accesses with AV_RN64/32; this also improves
> readability.
> 
> UBSan reported these unaligned accesses which happened in 233
> FATE-tests involving H.264 and VC-1 (this has also been reported
> in tickets #8138 and #8485); these tests are fixed by this commit.
> 
> The output of GCC with -O3 is unchanged for aarch64, alpha, arm,
> loongarch, ppc and x64. There was only a slight difference for mips.

Will apply this patch in two days with the last paragraph replaced by
the following unless there are objections:

    The output of GCC with -O3 is unchanged for aarch64, loongarch,
    ppc and x64 (as well as for arches like alpha for which
    HAVE_FAST_UNALIGNED is never true in the first place).
    There was only a slight difference for mips and arm.
    I don't know about the speed impact of them.

> 
> Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
> ---
> This is v2 of https://patchwork.ffmpeg.org/project/ffmpeg/patch/20200122145210.6898-1-andreas.rheinhardt@gmail.com/
> 
> Here is the mips code before this change:
> 
> startcode_old_O3.o:     file format elf64-tradlittlemips
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <ff_startcode_find_candidate_c>:
>    0:	18a00029 	blez	a1,a8 <ff_startcode_find_candidate_c+0xa8>
>    4:	3c08ff7f 	lui	a4,0xff7f
>    8:	3c07ff01 	lui	a3,0xff01
>    c:	65087f7f 	daddiu	a4,a4,32639
>   10:	64e70101 	daddiu	a3,a3,257
>   14:	00084438 	dsll	a4,a4,0x10
>   18:	00073c38 	dsll	a3,a3,0x10
>   1c:	65087f7f 	daddiu	a4,a4,32639
>   20:	64e70101 	daddiu	a3,a3,257
>   24:	00084478 	dsll	a4,a4,0x11
>   28:	00073df8 	dsll	a3,a3,0x17
>   2c:	00803025 	move	a2,a0
>   30:	00001025 	move	v0,zero
>   34:	3508feff 	ori	a4,a4,0xfeff
>   38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
>   3c:	34e78080 	ori	a3,a3,0x8080
>   40:	24420008 	addiu	v0,v0,8
>   44:	0045182a 	slt	v1,v0,a1
>   48:	10600015 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
>   4c:	00000000 	nop
>   50:	dcc30000 	ld	v1,0(a2)
>   54:	0068482d 	daddu	a5,v1,a4
>   58:	44a30000 	dmtc1	v1,$f0
>   5c:	44a90800 	dmtc1	a5,$f1
>   60:	4be10002 	pandn	$f0,$f0,$f1
>   64:	44230000 	dmfc1	v1,$f0
>   68:	00671824 	and	v1,v1,a3
>   6c:	1060fff4 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
>   70:	64c60008 	daddiu	a2,a2,8
>   74:	0045182a 	slt	v1,v0,a1
>   78:	10600009 	beqz	v1,a0 <ff_startcode_find_candidate_c+0xa0>
>   7c:	0082182d 	daddu	v1,a0,v0
>   80:	10000005 	b	98 <ff_startcode_find_candidate_c+0x98>
>   84:	90640000 	lbu	a0,0(v1)
>   88:	24420001 	addiu	v0,v0,1
>   8c:	10a20008 	beq	a1,v0,b0 <ff_startcode_find_candidate_c+0xb0>
>   90:	00000000 	nop
>   94:	90640000 	lbu	a0,0(v1)
>   98:	1480fffb 	bnez	a0,88 <ff_startcode_find_candidate_c+0x88>
>   9c:	64630001 	daddiu	v1,v1,1
>   a0:	03e00008 	jr	ra
>   a4:	00000000 	nop
>   a8:	03e00008 	jr	ra
>   ac:	00001025 	move	v0,zero
>   b0:	03e00008 	jr	ra
>   b4:	00a01025 	move	v0,a1
> 	...
> 
> And here after this change:
> 
> startcode_new_O3.o:     file format elf64-tradlittlemips
> 
> 
> Disassembly of section .text:
> 
> 0000000000000000 <ff_startcode_find_candidate_c>:
>    0:	18a0002b 	blez	a1,b0 <ff_startcode_find_candidate_c+0xb0>
>    4:	3c08ff7f 	lui	a4,0xff7f
>    8:	3c07ff01 	lui	a3,0xff01
>    c:	65087f7f 	daddiu	a4,a4,32639
>   10:	64e70101 	daddiu	a3,a3,257
>   14:	00084438 	dsll	a4,a4,0x10
>   18:	00073c38 	dsll	a3,a3,0x10
>   1c:	65087f7f 	daddiu	a4,a4,32639
>   20:	64e70101 	daddiu	a3,a3,257
>   24:	00084478 	dsll	a4,a4,0x11
>   28:	00073df8 	dsll	a3,a3,0x17
>   2c:	00803025 	move	a2,a0
>   30:	00001025 	move	v0,zero
>   34:	3508feff 	ori	a4,a4,0xfeff
>   38:	10000005 	b	50 <ff_startcode_find_candidate_c+0x50>
>   3c:	34e78080 	ori	a3,a3,0x8080
>   40:	24420008 	addiu	v0,v0,8
>   44:	0045182a 	slt	v1,v0,a1
>   48:	10600017 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
>   4c:	00000000 	nop
>   50:	68c30007 	ldl	v1,7(a2)
>   54:	6cc30000 	ldr	v1,0(a2)
>   58:	0068482d 	daddu	a5,v1,a4
>   5c:	44a30000 	dmtc1	v1,$f0
>   60:	44a90800 	dmtc1	a5,$f1
>   64:	4be10002 	pandn	$f0,$f0,$f1
>   68:	44230000 	dmfc1	v1,$f0
>   6c:	00671824 	and	v1,v1,a3
>   70:	1060fff3 	beqz	v1,40 <ff_startcode_find_candidate_c+0x40>
>   74:	64c60008 	daddiu	a2,a2,8
>   78:	0045182a 	slt	v1,v0,a1
>   7c:	1060000a 	beqz	v1,a8 <ff_startcode_find_candidate_c+0xa8>
>   80:	0082182d 	daddu	v1,a0,v0
>   84:	10000006 	b	a0 <ff_startcode_find_candidate_c+0xa0>
>   88:	90640000 	lbu	a0,0(v1)
>   8c:	00000000 	nop
>   90:	24420001 	addiu	v0,v0,1
>   94:	10a20008 	beq	a1,v0,b8 <ff_startcode_find_candidate_c+0xb8>
>   98:	00000000 	nop
>   9c:	90640000 	lbu	a0,0(v1)
>   a0:	1480fffb 	bnez	a0,90 <ff_startcode_find_candidate_c+0x90>
>   a4:	64630001 	daddiu	v1,v1,1
>   a8:	03e00008 	jr	ra
>   ac:	00000000 	nop
>   b0:	03e00008 	jr	ra
>   b4:	00001025 	move	v0,zero
>   b8:	03e00008 	jr	ra
>   bc:	00a01025 	move	v0,a1
> 
> As one can see, the difference is that an ld has been replaced
> by a pair of ldl and ldr. I don't know the performance implications
> of this.
> 
>  libavcodec/startcode.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
> index 9efdffe8c6..d84f326521 100644
> --- a/libavcodec/startcode.c
> +++ b/libavcodec/startcode.c
> @@ -25,6 +25,7 @@
>   * @author Michael Niedermayer <michaelni@gmx.at>
>   */
>  
> +#include "libavutil/intreadwrite.h"
>  #include "startcode.h"
>  #include "config.h"
>  
> @@ -38,14 +39,14 @@ int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
>       */
>  #if HAVE_FAST_64BIT
>      while (i < size &&
> -            !((~*(const uint64_t *)(buf + i) &
> -                    (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
> +            !((~AV_RN64(buf + i) &
> +                    (AV_RN64(buf + i) - 0x0101010101010101ULL)) &
>                      0x8080808080808080ULL))
>          i += 8;
>  #else
>      while (i < size &&
> -            !((~*(const uint32_t *)(buf + i) &
> -                    (*(const uint32_t *)(buf + i) - 0x01010101U)) &
> +            !((~AV_RN32(buf + i) &
> +                    (AV_RN32(buf + i) - 0x01010101U)) &
>                      0x80808080U))
>          i += 4;
>  #endif
diff mbox series

Patch

diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
index 9efdffe8c6..d84f326521 100644
--- a/libavcodec/startcode.c
+++ b/libavcodec/startcode.c
@@ -25,6 +25,7 @@ 
  * @author Michael Niedermayer <michaelni@gmx.at>
  */
 
+#include "libavutil/intreadwrite.h"
 #include "startcode.h"
 #include "config.h"
 
@@ -38,14 +39,14 @@  int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
      */
 #if HAVE_FAST_64BIT
     while (i < size &&
-            !((~*(const uint64_t *)(buf + i) &
-                    (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
+            !((~AV_RN64(buf + i) &
+                    (AV_RN64(buf + i) - 0x0101010101010101ULL)) &
                     0x8080808080808080ULL))
         i += 8;
 #else
     while (i < size &&
-            !((~*(const uint32_t *)(buf + i) &
-                    (*(const uint32_t *)(buf + i) - 0x01010101U)) &
+            !((~AV_RN32(buf + i) &
+                    (AV_RN32(buf + i) - 0x01010101U)) &
                     0x80808080U))
         i += 4;
 #endif