diff mbox

[FFmpeg-devel] avutil: fix data race in av_get_cpu_flags().

Message ID 1479771469-60379-2-git-send-email-wtc@google.com
State Superseded
Headers show

Commit Message

Wan-Teh Chang Nov. 21, 2016, 11:37 p.m. UTC
Make the one-time initialization in av_get_cpu_flags() thread-safe.
The fix assumes -1 is an invalid value for cpu flags.

The fix requires new atomic functions to get, set, and compare-and-swap
an integer without a memory barrier.

The data race fix is written by Dmitry Vyukov of Google.

Signed-off-by: Wan-Teh Chang <wtc@google.com>
---
 libavutil/atomic.c       | 40 ++++++++++++++++++++++++++++++++++++++++
 libavutil/atomic.h       | 34 ++++++++++++++++++++++++++++++++--
 libavutil/atomic_gcc.h   | 33 +++++++++++++++++++++++++++++++++
 libavutil/atomic_suncc.h | 19 +++++++++++++++++++
 libavutil/atomic_win32.h | 21 +++++++++++++++++++++
 libavutil/cpu.c          | 41 ++++++++++++++++++++++-------------------
 libavutil/tests/atomic.c | 13 +++++++++++++
 7 files changed, 180 insertions(+), 21 deletions(-)

Comments

Michael Niedermayer Nov. 22, 2016, 12:35 a.m. UTC | #1
On Mon, Nov 21, 2016 at 03:37:49PM -0800, Wan-Teh Chang wrote:
> Make the one-time initialization in av_get_cpu_flags() thread-safe.
> The fix assumes -1 is an invalid value for cpu flags.

please explain the bug / race that occurs and how it is fixed


> 
> The fix requires new atomic functions to get, set, and compare-and-swap
> an integer without a memory barrier.

why ?


> 
> The data race fix is written by Dmitry Vyukov of Google.

Then the author for the git patch should be set accordingly


[...]

> @@ -44,7 +45,20 @@
>  #include <unistd.h>
>  #endif
>  
> -static int flags, checked;
> +static int cpu_flags = -1;
> +
> +static int get_cpu_flags(void)
> +{
> +    if (ARCH_AARCH64)
> +        return ff_get_cpu_flags_aarch64();
> +    if (ARCH_ARM)
> +        return ff_get_cpu_flags_arm();
> +    if (ARCH_PPC)
> +        return ff_get_cpu_flags_ppc();
> +    if (ARCH_X86)
> +        return ff_get_cpu_flags_x86();
> +    /* Not reached. */

src/libavutil/cpu.c: In function ‘get_cpu_flags’:
src/libavutil/cpu.c:61: error: control reaches end of non-void function

[...]
Wan-Teh Chang Nov. 22, 2016, 7:28 p.m. UTC | #2
Hi Michael,

On Mon, Nov 21, 2016 at 4:35 PM, Michael Niedermayer
<michael@niedermayer.cc> wrote:
> On Mon, Nov 21, 2016 at 03:37:49PM -0800, Wan-Teh Chang wrote:
>> Make the one-time initialization in av_get_cpu_flags() thread-safe.
>> The fix assumes -1 is an invalid value for cpu flags.
>
> please explain the bug / race that occurs and how it is fixed

Note: I will include the following explanation in the next version of my patch.

The static variables |flags| and |checked| in libavutil/cpu.c are read
and written using normal load and store instructions. These are
considered as data races. The fix is to use atomic load and store
instructions. The fix also eliminates the |checked| variable because
the invalid value of -1 for |flags| can be used to indicate the same
condition.

Our application runs into the data races because two threads call
avformat_find_stream_info() at the same time.
avformat_find_stream_info() calls av_parser_init(), which may
eventually call av_get_cpu_flag(), depending on the codec.

I just wrote a minimal test case (libavutil/tests/cpu_init.c) that
reproduces the data races. The test program creates two threads that
call av_get_cpu_flags(). When built with --toolchain=clang-tsan, the
test program triggers two ThreadSanitizer warnings:

$ libavutil/tests/cpu_init
==================
WARNING: ThreadSanitizer: data race (pid=66608)
  Read of size 4 at 0x7f7aa8c15d40 by thread T2:
    #0 av_get_cpu_flags
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/cpu.c:78
(exe+0x0000000a8536)
    #1 thread_main
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/tests/cpu_init.c:32
(exe+0x0000000a8494)

  Previous write of size 4 at 0x7f7aa8c15d40 by thread T1:
    #0 av_get_cpu_flags
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/cpu.c:90
(exe+0x0000000a8578)
    #1 thread_main
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/tests/cpu_init.c:32
(exe+0x0000000a8494)

  Thread T2 (tid=66611, running) created by main thread at:
    #0 pthread_create ??:0 (exe+0x00000004d9bb)
    #1 main /usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/tests/cpu_init.c:51
(exe+0x0000000a83cb)

  Thread T1 (tid=66610, running) created by main thread at:
    #0 pthread_create ??:0 (exe+0x00000004d9bb)
    #1 main /usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/tests/cpu_init.c:47
(exe+0x0000000a83a9)

SUMMARY: ThreadSanitizer: data race
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/cpu.c:78
av_get_cpu_flags
==================
==================
WARNING: ThreadSanitizer: data race (pid=66608)
  Read of size 4 at 0x7f7aa8c15d3c by thread T2:
    #0 av_get_cpu_flags
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/cpu.c:79
(exe+0x0000000a854b)
    #1 thread_main
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/tests/cpu_init.c:32
(exe+0x0000000a8494)

  Previous write of size 4 at 0x7f7aa8c15d3c by thread T1:
    #0 av_get_cpu_flags
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/cpu.c:88
(exe+0x0000000a8566)
    #1 thread_main
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/tests/cpu_init.c:32
(exe+0x0000000a8494)

  Thread T2 (tid=66611, running) created by main thread at:
    #0 pthread_create ??:0 (exe+0x00000004d9bb)
    #1 main /usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/tests/cpu_init.c:51
(exe+0x0000000a83cb)

  Thread T1 (tid=66610, running) created by main thread at:
    #0 pthread_create ??:0 (exe+0x00000004d9bb)
    #1 main /usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/tests/cpu_init.c:47
(exe+0x0000000a83a9)

SUMMARY: ThreadSanitizer: data race
/usr/local/google/home/wtc/ffmpeg-cpu-data-race-before/ffmpeg/libavutil/cpu.c:79
av_get_cpu_flags
==================
ThreadSanitizer: reported 2 warnings

>> The fix requires new atomic functions to get, set, and compare-and-swap
>> an integer without a memory barrier.
>
> why ?

The fix needs a compare-and-swap function for an atomic integer. There
is no such function in libavutil/atomic.h.

Although the fix can use avpriv_atomic_int_get() and
avpriv_atomic_int_set(), these two functions execute a memory barrier
with sequentially-consistent ordering, which the fix doesn't need. To
improve performance, I added avpriv_atomic_int_get_relaxed() and
avpriv_atomic_int_set_relaxed()

>> The data race fix is written by Dmitry Vyukov of Google.
>
> Then the author for the git patch should be set accordingly

I will look into this. I may just identify him as a co-author.

>> @@ -44,7 +45,20 @@
>>  #include <unistd.h>
>>  #endif
>>
>> -static int flags, checked;
>> +static int cpu_flags = -1;
>> +
>> +static int get_cpu_flags(void)
>> +{
>> +    if (ARCH_AARCH64)
>> +        return ff_get_cpu_flags_aarch64();
>> +    if (ARCH_ARM)
>> +        return ff_get_cpu_flags_arm();
>> +    if (ARCH_PPC)
>> +        return ff_get_cpu_flags_ppc();
>> +    if (ARCH_X86)
>> +        return ff_get_cpu_flags_x86();
>> +    /* Not reached. */
>
> src/libavutil/cpu.c: In function ‘get_cpu_flags’:
> src/libavutil/cpu.c:61: error: control reaches end of non-void function

Thanks. I will fix this in the next version of my patch.

Wan-Teh Chang
Michael Niedermayer Nov. 22, 2016, 9:22 p.m. UTC | #3
On Tue, Nov 22, 2016 at 11:28:48AM -0800, Wan-Teh Chang wrote:
> Hi Michael,
> 
> On Mon, Nov 21, 2016 at 4:35 PM, Michael Niedermayer
> <michael@niedermayer.cc> wrote:
> > On Mon, Nov 21, 2016 at 03:37:49PM -0800, Wan-Teh Chang wrote:
> >> Make the one-time initialization in av_get_cpu_flags() thread-safe.
> >> The fix assumes -1 is an invalid value for cpu flags.
> >
> > please explain the bug / race that occurs and how it is fixed
> 
> Note: I will include the following explanation in the next version of my patch.
> 
> The static variables |flags| and |checked| in libavutil/cpu.c are read
> and written using normal load and store instructions. These are
> considered as data races. The fix is to use atomic load and store
> instructions. The fix also eliminates the |checked| variable because
> the invalid value of -1 for |flags| can be used to indicate the same
> condition.
> 
> Our application runs into the data races because two threads call
> avformat_find_stream_info() at the same time.
> avformat_find_stream_info() calls av_parser_init(), which may
> eventually call av_get_cpu_flag(), depending on the codec.
> 
> I just wrote a minimal test case (libavutil/tests/cpu_init.c) that
> reproduces the data races. The test program creates two threads that
> call av_get_cpu_flags(). When built with --toolchain=clang-tsan, the
> test program triggers two ThreadSanitizer warnings:

ok, i see th race but do you really need the atomic operations ?
isnt merging the 2 variables into 1 as you do enough ?
(i mean the tools might still complain but would there be an actual
 race remaining?)


[...]
Wan-Teh Chang Nov. 22, 2016, 10 p.m. UTC | #4
Hi Michael,

On Tue, Nov 22, 2016 at 1:22 PM, Michael Niedermayer
<michael@niedermayer.cc> wrote:
>
> ok, i see th race but do you really need the atomic operations ?
> isnt merging the 2 variables into 1 as you do enough ?
> (i mean the tools might still complain but would there be an actual
>  race remaining?)

The atomic operations avoid the "undefined behavior" resulting from
the data races in the C source code. ThreadSanitizer analyzes the C
source code, therefore it must warn about what may be undefined
behavior according to the C standard, even though for a particular
compiler and processor, the generated code is the same.

Here is a small test program that shows gcc generates the same x86_64
assembly code for the normal and atomic (with relaxed memory ordering)
load and store operations:

==========
$ gcc --version
gcc (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

$ cat flags.c
static int flags;

int get_flags_nonatomic(void)
{
    return flags;
}

int get_flags_atomic_relaxed(void)
{
    return __atomic_load_n(&flags, __ATOMIC_RELAXED);
}

void set_flags_nonatomic(int val)
{
    flags = val;
}

void set_flags_atomic_relaxed(int val)
{
    __atomic_store_n(&flags, val, __ATOMIC_RELAXED);
}
$ gcc -Wall -O3 -std=c11 -S -c flags.c
$ cat flags.s
.file "flags.c"
.text
.p2align 4,,15
.globl get_flags_nonatomic
.type get_flags_nonatomic, @function
get_flags_nonatomic:
.LFB0:
.cfi_startproc
movl flags(%rip), %eax
ret
.cfi_endproc
.LFE0:
.size get_flags_nonatomic, .-get_flags_nonatomic
.p2align 4,,15
.globl get_flags_atomic_relaxed
.type get_flags_atomic_relaxed, @function
get_flags_atomic_relaxed:
.LFB1:
.cfi_startproc
movl flags(%rip), %eax
ret
.cfi_endproc
.LFE1:
.size get_flags_atomic_relaxed, .-get_flags_atomic_relaxed
.p2align 4,,15
.globl set_flags_nonatomic
.type set_flags_nonatomic, @function
set_flags_nonatomic:
.LFB2:
.cfi_startproc
movl %edi, flags(%rip)
ret
.cfi_endproc
.LFE2:
.size set_flags_nonatomic, .-set_flags_nonatomic
.p2align 4,,15
.globl set_flags_atomic_relaxed
.type set_flags_atomic_relaxed, @function
set_flags_atomic_relaxed:
.LFB3:
.cfi_startproc
movl %edi, flags(%rip)
ret
.cfi_endproc
.LFE3:
.size set_flags_atomic_relaxed, .-set_flags_atomic_relaxed
.local flags
.comm flags,4,16
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4"
.section .note.GNU-stack,"",@progbits
==========

Wan-Teh Chang
Michael Niedermayer Nov. 22, 2016, 10:57 p.m. UTC | #5
On Tue, Nov 22, 2016 at 02:00:12PM -0800, Wan-Teh Chang wrote:
> Hi Michael,
> 
> On Tue, Nov 22, 2016 at 1:22 PM, Michael Niedermayer
> <michael@niedermayer.cc> wrote:
> >
> > ok, i see th race but do you really need the atomic operations ?
> > isnt merging the 2 variables into 1 as you do enough ?
> > (i mean the tools might still complain but would there be an actual
> >  race remaining?)
> 
> The atomic operations avoid the "undefined behavior" resulting from
> the data races in the C source code. ThreadSanitizer analyzes the C
> source code, therefore it must warn about what may be undefined
> behavior according to the C standard, even though for a particular
> compiler and processor, the generated code is the same.
> 
> Here is a small test program that shows gcc generates the same x86_64
> assembly code for the normal and atomic (with relaxed memory ordering)
> load and store operations:

we have plenty of code that accesses fields without the extra layer
of weak atomics.
For example the progress code in the frame threading.

Can you just merge the varible ?

this also would be easier to backport if anyone wants to backport

If you still want to add weak atomics as in the patch, afterwards on
top iam not against but that is me, i wont apply it if theres no
consensus or clear majority in favour though

[...]
Wan-Teh Chang Nov. 22, 2016, 11:14 p.m. UTC | #6
Hi Michael,

On Tue, Nov 22, 2016 at 2:57 PM, Michael Niedermayer
<michael@niedermayer.cc> wrote:
>
> we have plenty of code that accesses fields without the extra layer
> of weak atomics.
> For example the progress code in the frame threading.

The progress code in the frame threading also has data races as
defined by the C standard, so ThreadSanitizer also warns about it. I
submitted patches for that code to use atomic operations before.

> Can you just merge the varible ?

Yes, I will create a patch to just merge the variables. My goal is to
fix the data races, so I still need to create another patch on top to
use atomic operations.

Wan-Teh Chang
wm4 Nov. 22, 2016, 11:30 p.m. UTC | #7
On Tue, 22 Nov 2016 23:57:15 +0100
Michael Niedermayer <michael@niedermayer.cc> wrote:

> On Tue, Nov 22, 2016 at 02:00:12PM -0800, Wan-Teh Chang wrote:
> > Hi Michael,
> > 
> > On Tue, Nov 22, 2016 at 1:22 PM, Michael Niedermayer
> > <michael@niedermayer.cc> wrote:  
> > >
> > > ok, i see th race but do you really need the atomic operations ?
> > > isnt merging the 2 variables into 1 as you do enough ?
> > > (i mean the tools might still complain but would there be an actual
> > >  race remaining?)  
> > 
> > The atomic operations avoid the "undefined behavior" resulting from
> > the data races in the C source code. ThreadSanitizer analyzes the C
> > source code, therefore it must warn about what may be undefined
> > behavior according to the C standard, even though for a particular
> > compiler and processor, the generated code is the same.
> > 
> > Here is a small test program that shows gcc generates the same x86_64
> > assembly code for the normal and atomic (with relaxed memory ordering)
> > load and store operations:  
> 
> we have plenty of code that accesses fields without the extra layer
> of weak atomics.

> For example the progress code in the frame threading.

Which was recently fixed in Libav AFAIR...

> 
> Can you just merge the varible ?
> 
> this also would be easier to backport if anyone wants to backport
> 
> If you still want to add weak atomics as in the patch, afterwards on
> top iam not against but that is me, i wont apply it if theres no
> consensus or clear majority in favour though
> 
> [...]
>
Wan-Teh Chang Nov. 23, 2016, 7:40 p.m. UTC | #8
On Tue, Nov 22, 2016 at 3:30 PM, wm4 <nfxjfg@googlemail.com> wrote:
> On Tue, 22 Nov 2016 23:57:15 +0100
> Michael Niedermayer <michael@niedermayer.cc> wrote:
>
>> For example the progress code in the frame threading.
>
> Which was recently fixed in Libav AFAIR...

You're right. libav/libavcodec/pthread_frame.c has code similar to my
ffmpeg patch http://ffmpeg.org/pipermail/ffmpeg-devel/2016-March/190454.html,
and much more.

Note: libav/libavcodec/pthread_frame.c uses unnecessary (too strong)
memory barriers in ff_thread_report_progress(). We can fix those when
the code is merged to ffmpeg.

Wan-Teh Chang
wm4 Nov. 24, 2016, 9:56 a.m. UTC | #9
On Wed, 23 Nov 2016 11:40:25 -0800
Wan-Teh Chang <wtc-at-google.com@ffmpeg.org> wrote:

> On Tue, Nov 22, 2016 at 3:30 PM, wm4 <nfxjfg@googlemail.com> wrote:
> > On Tue, 22 Nov 2016 23:57:15 +0100
> > Michael Niedermayer <michael@niedermayer.cc> wrote:
> >  
> >> For example the progress code in the frame threading.  
> >
> > Which was recently fixed in Libav AFAIR...  
> 
> You're right. libav/libavcodec/pthread_frame.c has code similar to my
> ffmpeg patch http://ffmpeg.org/pipermail/ffmpeg-devel/2016-March/190454.html,
> and much more.
> 
> Note: libav/libavcodec/pthread_frame.c uses unnecessary (too strong)
> memory barriers in ff_thread_report_progress(). We can fix those when
> the code is merged to ffmpeg.

You could also just send a patch to them.
Jean-Yves Avenard Nov. 24, 2016, 12:31 p.m. UTC | #10
Hi.

Mozilla also fixed some data races in pthread_frames.c which you may want
to look at. It's very similar to problem you mentioned

We made the code conditional on being in an TSan build.
https://dxr.mozilla.org/mozilla-central/source/media/ffvpx/libavcodec/pthread_frame.c#46

Sorry, using the new gmail app, not allowing me to reply inline. I can only
top post.

Jean-Yves

Le jeu. 24 nov. 2016 à 06:47, Wan-Teh Chang <wtc-at-google.com@ffmpeg.org>
a écrit :

> On Tue, Nov 22, 2016 at 3:30 PM, wm4 <nfxjfg@googlemail.com> wrote:
> > On Tue, 22 Nov 2016 23:57:15 +0100
> > Michael Niedermayer <michael@niedermayer.cc> wrote:
> >
> >> For example the progress code in the frame threading.
> >
> > Which was recently fixed in Libav AFAIR...
>
> You're right. libav/libavcodec/pthread_frame.c has code similar to my
> ffmpeg patch
> http://ffmpeg.org/pipermail/ffmpeg-devel/2016-March/190454.html,
> and much more.
>
> Note: libav/libavcodec/pthread_frame.c uses unnecessary (too strong)
> memory barriers in ff_thread_report_progress(). We can fix those when
> the code is merged to ffmpeg.
>
> Wan-Teh Chang
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
Wan-Teh Chang Dec. 6, 2016, 7:08 p.m. UTC | #11
On Thu, Nov 24, 2016 at 1:56 AM, wm4 <nfxjfg@googlemail.com> wrote:
> On Wed, 23 Nov 2016 11:40:25 -0800
> Wan-Teh Chang <wtc-at-google.com@ffmpeg.org> wrote:
>
>> On Tue, Nov 22, 2016 at 3:30 PM, wm4 <nfxjfg@googlemail.com> wrote:
>> > On Tue, 22 Nov 2016 23:57:15 +0100
>> > Michael Niedermayer <michael@niedermayer.cc> wrote:
>> >
>> >> For example the progress code in the frame threading.
>> >
>> > Which was recently fixed in Libav AFAIR...
>>
>> You're right. libav/libavcodec/pthread_frame.c has code similar to my
>> ffmpeg patch http://ffmpeg.org/pipermail/ffmpeg-devel/2016-March/190454.html,
>> and much more.
>>
>> Note: libav/libavcodec/pthread_frame.c uses unnecessary (too strong)
>> memory barriers in ff_thread_report_progress(). We can fix those when
>> the code is merged to ffmpeg.
>
> You could also just send a patch to them.

Done. I sent a patch to libav-devel last week:
https://lists.libav.org/pipermail/libav-devel/2016-November/080903.html

Wan-Teh Chang
diff mbox

Patch

diff --git a/libavutil/atomic.c b/libavutil/atomic.c
index 64cff25..7bce013 100644
--- a/libavutil/atomic.c
+++ b/libavutil/atomic.c
@@ -40,6 +40,11 @@  int avpriv_atomic_int_get(volatile int *ptr)
     return res;
 }
 
+int avpriv_atomic_int_get_relaxed(volatile int *ptr)
+{
+    return avpriv_atomic_int_get(ptr);
+}
+
 void avpriv_atomic_int_set(volatile int *ptr, int val)
 {
     pthread_mutex_lock(&atomic_lock);
@@ -47,6 +52,11 @@  void avpriv_atomic_int_set(volatile int *ptr, int val)
     pthread_mutex_unlock(&atomic_lock);
 }
 
+void avpriv_atomic_int_set_relaxed(volatile int *ptr, int val)
+{
+    avpriv_atomic_int_set(ptr, val);
+}
+
 int avpriv_atomic_int_add_and_fetch(volatile int *ptr, int inc)
 {
     int res;
@@ -59,6 +69,17 @@  int avpriv_atomic_int_add_and_fetch(volatile int *ptr, int inc)
     return res;
 }
 
+int avpriv_atomic_int_cas_relaxed(volatile int *ptr, int oldval, int newval)
+{
+    int ret;
+    pthread_mutex_lock(&atomic_lock);
+    ret = *ptr;
+    if (ret == oldval)
+        *ptr = newval;
+    pthread_mutex_unlock(&atomic_lock);
+    return ret;
+}
+
 void *avpriv_atomic_ptr_cas(void * volatile *ptr, void *oldval, void *newval)
 {
     void *ret;
@@ -77,17 +98,36 @@  int avpriv_atomic_int_get(volatile int *ptr)
     return *ptr;
 }
 
+int avpriv_atomic_int_get_relaxed(volatile int *ptr)
+{
+    return avpriv_atomic_int_get(ptr);
+}
+
 void avpriv_atomic_int_set(volatile int *ptr, int val)
 {
     *ptr = val;
 }
 
+void avpriv_atomic_int_set_relaxed(volatile int *ptr, int val)
+{
+    avpriv_atomic_int_set(ptr, val);
+}
+
 int avpriv_atomic_int_add_and_fetch(volatile int *ptr, int inc)
 {
     *ptr += inc;
     return *ptr;
 }
 
+int avpriv_atomic_int_cas_relaxed(volatile int *ptr, int oldval, int newval)
+{
+    if (*ptr == oldval) {
+        *ptr = newval;
+        return oldval;
+    }
+    return *ptr;
+}
+
 void *avpriv_atomic_ptr_cas(void * volatile *ptr, void *oldval, void *newval)
 {
     if (*ptr == oldval) {
diff --git a/libavutil/atomic.h b/libavutil/atomic.h
index 15906d2..c5aa1a4 100644
--- a/libavutil/atomic.h
+++ b/libavutil/atomic.h
@@ -40,20 +40,38 @@ 
  *
  * @param ptr atomic integer
  * @return the current value of the atomic integer
- * @note This acts as a memory barrier.
+ * @note This acts as a memory barrier with sequentially-consistent ordering.
  */
 int avpriv_atomic_int_get(volatile int *ptr);
 
 /**
+ * Load the current value stored in an atomic integer.
+ *
+ * @param ptr atomic integer
+ * @return the current value of the atomic integer
+ * @note This does NOT act as a memory barrier.
+ */
+int avpriv_atomic_int_get_relaxed(volatile int *ptr);
+
+/**
  * Store a new value in an atomic integer.
  *
  * @param ptr atomic integer
  * @param val the value to store in the atomic integer
- * @note This acts as a memory barrier.
+ * @note This acts as a memory barrier with sequentially-consistent ordering.
  */
 void avpriv_atomic_int_set(volatile int *ptr, int val);
 
 /**
+ * Store a new value in an atomic integer.
+ *
+ * @param ptr atomic integer
+ * @param val the value to store in the atomic integer
+ * @note This does NOT act as a memory barrier.
+ */
+void avpriv_atomic_int_set_relaxed(volatile int *ptr, int val);
+
+/**
  * Add a value to an atomic integer.
  *
  * @param ptr atomic integer
@@ -65,12 +83,24 @@  void avpriv_atomic_int_set(volatile int *ptr, int val);
 int avpriv_atomic_int_add_and_fetch(volatile int *ptr, int inc);
 
 /**
+ * Atomic integer compare and swap.
+ *
+ * @param ptr pointer to the integer to operate on
+ * @param oldval do the swap if the current value of *ptr equals to oldval
+ * @param newval value to replace *ptr with
+ * @return the value of *ptr before comparison
+ * @note This does NOT act as a memory barrier.
+ */
+int avpriv_atomic_int_cas_relaxed(volatile int *ptr, int oldval, int newval);
+
+/**
  * Atomic pointer compare and swap.
  *
  * @param ptr pointer to the pointer to operate on
  * @param oldval do the swap if the current value of *ptr equals to oldval
  * @param newval value to replace *ptr with
  * @return the value of *ptr before comparison
+ * @note This acts as a memory barrier with sequentially-consistent ordering.
  */
 void *avpriv_atomic_ptr_cas(void * volatile *ptr, void *oldval, void *newval);
 
diff --git a/libavutil/atomic_gcc.h b/libavutil/atomic_gcc.h
index 5f9fc49..15dd508 100644
--- a/libavutil/atomic_gcc.h
+++ b/libavutil/atomic_gcc.h
@@ -36,6 +36,16 @@  static inline int atomic_int_get_gcc(volatile int *ptr)
 #endif
 }
 
+#define avpriv_atomic_int_get_relaxed atomic_int_get_relaxed_gcc
+static inline int atomic_int_get_relaxed_gcc(volatile int *ptr)
+{
+#if HAVE_ATOMIC_COMPARE_EXCHANGE
+    return __atomic_load_n(ptr, __ATOMIC_RELAXED);
+#else
+    return *ptr;
+#endif
+}
+
 #define avpriv_atomic_int_set atomic_int_set_gcc
 static inline void atomic_int_set_gcc(volatile int *ptr, int val)
 {
@@ -47,6 +57,16 @@  static inline void atomic_int_set_gcc(volatile int *ptr, int val)
 #endif
 }
 
+#define avpriv_atomic_int_set_relaxed atomic_int_set_relaxed_gcc
+static inline void atomic_int_set_relaxed_gcc(volatile int *ptr, int val)
+{
+#if HAVE_ATOMIC_COMPARE_EXCHANGE
+    __atomic_store_n(ptr, val, __ATOMIC_RELAXED);
+#else
+    *ptr = val;
+#endif
+}
+
 #define avpriv_atomic_int_add_and_fetch atomic_int_add_and_fetch_gcc
 static inline int atomic_int_add_and_fetch_gcc(volatile int *ptr, int inc)
 {
@@ -57,6 +77,19 @@  static inline int atomic_int_add_and_fetch_gcc(volatile int *ptr, int inc)
 #endif
 }
 
+#define avpriv_atomic_int_cas_relaxed atomic_int_cas_relaxed_gcc
+static inline int atomic_int_cas_relaxed_gcc(volatile int *ptr,
+                                             int oldval, int newval)
+{
+#if HAVE_SYNC_VAL_COMPARE_AND_SWAP
+    return __sync_val_compare_and_swap(ptr, oldval, newval);
+#else
+    __atomic_compare_exchange_n(ptr, &oldval, newval, 0, __ATOMIC_RELAXED,
+                                __ATOMIC_RELAXED);
+    return oldval;
+#endif
+}
+
 #define avpriv_atomic_ptr_cas atomic_ptr_cas_gcc
 static inline void *atomic_ptr_cas_gcc(void * volatile *ptr,
                                        void *oldval, void *newval)
diff --git a/libavutil/atomic_suncc.h b/libavutil/atomic_suncc.h
index a75a37b..0997f23 100644
--- a/libavutil/atomic_suncc.h
+++ b/libavutil/atomic_suncc.h
@@ -31,6 +31,12 @@  static inline int atomic_int_get_suncc(volatile int *ptr)
     return *ptr;
 }
 
+#define avpriv_atomic_int_get_relaxed atomic_int_get_relaxed_suncc
+static inline int atomic_int_get_relaxed_suncc(volatile int *ptr)
+{
+    return *ptr;
+}
+
 #define avpriv_atomic_int_set atomic_int_set_suncc
 static inline void atomic_int_set_suncc(volatile int *ptr, int val)
 {
@@ -38,12 +44,25 @@  static inline void atomic_int_set_suncc(volatile int *ptr, int val)
     __machine_rw_barrier();
 }
 
+#define avpriv_atomic_int_set_relaxed atomic_int_set_relaxed_suncc
+static inline void atomic_int_set_relaxed_suncc(volatile int *ptr, int val)
+{
+    *ptr = val;
+}
+
 #define avpriv_atomic_int_add_and_fetch atomic_int_add_and_fetch_suncc
 static inline int atomic_int_add_and_fetch_suncc(volatile int *ptr, int inc)
 {
     return atomic_add_int_nv(ptr, inc);
 }
 
+#define avpriv_atomic_int_cas_relaxed atomic_int_cas_relaxed_suncc
+static inline int atomic_int_cas_relaxed_suncc(volatile int *ptr,
+                                               int oldval, int newval)
+{
+    return atomic_cas_uint((volatile uint_t *)ptr, oldval, newval);
+}
+
 #define avpriv_atomic_ptr_cas atomic_ptr_cas_suncc
 static inline void *atomic_ptr_cas_suncc(void * volatile *ptr,
                                          void *oldval, void *newval)
diff --git a/libavutil/atomic_win32.h b/libavutil/atomic_win32.h
index f729933..7ea0a56 100644
--- a/libavutil/atomic_win32.h
+++ b/libavutil/atomic_win32.h
@@ -31,6 +31,12 @@  static inline int atomic_int_get_win32(volatile int *ptr)
     return *ptr;
 }
 
+#define avpriv_atomic_int_get_relaxed atomic_int_get_relaxed_win32
+static inline int atomic_int_get_relaxed_win32(volatile int *ptr)
+{
+    return *ptr;
+}
+
 #define avpriv_atomic_int_set atomic_int_set_win32
 static inline void atomic_int_set_win32(volatile int *ptr, int val)
 {
@@ -38,12 +44,27 @@  static inline void atomic_int_set_win32(volatile int *ptr, int val)
     MemoryBarrier();
 }
 
+#define avpriv_atomic_int_set_relaxed atomic_int_set_relaxed_win32
+static inline void atomic_int_set_relaxed_win32(volatile int *ptr, int val)
+{
+    *ptr = val;
+}
+
 #define avpriv_atomic_int_add_and_fetch atomic_int_add_and_fetch_win32
 static inline int atomic_int_add_and_fetch_win32(volatile int *ptr, int inc)
 {
     return inc + InterlockedExchangeAdd(ptr, inc);
 }
 
+#define avpriv_atomic_int_cas_relaxed atomic_int_cas_relaxed_win32
+static inline int atomic_int_cas_relaxed_win32(volatile int *ptr,
+                                               int oldval, int newval)
+{
+    /* InterlockedCompareExchangeNoFence is available on Windows 8 or later
+     * only. */
+    return InterlockedCompareExchange((volatile LONG *)ptr, newval, oldval);
+}
+
 #define avpriv_atomic_ptr_cas atomic_ptr_cas_win32
 static inline void *atomic_ptr_cas_win32(void * volatile *ptr,
                                          void *oldval, void *newval)
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index f5785fc..cbcca4c 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -23,6 +23,7 @@ 
 #include "config.h"
 #include "opt.h"
 #include "common.h"
+#include "atomic.h"
 
 #if HAVE_SCHED_GETAFFINITY
 #ifndef _GNU_SOURCE
@@ -44,7 +45,20 @@ 
 #include <unistd.h>
 #endif
 
-static int flags, checked;
+static int cpu_flags = -1;
+
+static int get_cpu_flags(void)
+{
+    if (ARCH_AARCH64)
+        return ff_get_cpu_flags_aarch64();
+    if (ARCH_ARM)
+        return ff_get_cpu_flags_arm();
+    if (ARCH_PPC)
+        return ff_get_cpu_flags_ppc();
+    if (ARCH_X86)
+        return ff_get_cpu_flags_x86();
+    /* Not reached. */
+}
 
 void av_force_cpu_flags(int arg){
     if (   (arg & ( AV_CPU_FLAG_3DNOW    |
@@ -69,33 +83,22 @@  void av_force_cpu_flags(int arg){
         arg |= AV_CPU_FLAG_MMX;
     }
 
-    flags   = arg;
-    checked = arg != -1;
+    avpriv_atomic_int_set_relaxed(&cpu_flags, arg);
 }
 
 int av_get_cpu_flags(void)
 {
-    if (checked)
-        return flags;
-
-    if (ARCH_AARCH64)
-        flags = ff_get_cpu_flags_aarch64();
-    if (ARCH_ARM)
-        flags = ff_get_cpu_flags_arm();
-    if (ARCH_PPC)
-        flags = ff_get_cpu_flags_ppc();
-    if (ARCH_X86)
-        flags = ff_get_cpu_flags_x86();
-
-    checked = 1;
+    int flags = avpriv_atomic_int_get_relaxed(&cpu_flags);
+    if (flags == -1) {
+        flags = get_cpu_flags();
+        avpriv_atomic_int_cas_relaxed(&cpu_flags, -1, flags);
+    }
     return flags;
 }
 
 void av_set_cpu_flags_mask(int mask)
 {
-    checked       = 0;
-    flags         = av_get_cpu_flags() & mask;
-    checked       = 1;
+    avpriv_atomic_int_set_relaxed(&cpu_flags, get_cpu_flags() & mask);
 }
 
 int av_parse_cpu_flags(const char *s)
diff --git a/libavutil/tests/atomic.c b/libavutil/tests/atomic.c
index c92f220..5a12439 100644
--- a/libavutil/tests/atomic.c
+++ b/libavutil/tests/atomic.c
@@ -26,9 +26,22 @@  int main(void)
 
     res = avpriv_atomic_int_add_and_fetch(&val, 1);
     av_assert0(res == 2);
+    res = avpriv_atomic_int_add_and_fetch(&val, -1);
+    av_assert0(res == 1);
     avpriv_atomic_int_set(&val, 3);
     res = avpriv_atomic_int_get(&val);
     av_assert0(res == 3);
+    avpriv_atomic_int_set_relaxed(&val, 4);
+    res = avpriv_atomic_int_get_relaxed(&val);
+    av_assert0(res == 4);
+    res = avpriv_atomic_int_cas_relaxed(&val, 3, 5);
+    av_assert0(res == 4);
+    res = avpriv_atomic_int_get_relaxed(&val);
+    av_assert0(res == 4);
+    res = avpriv_atomic_int_cas_relaxed(&val, 4, 5);
+    av_assert0(res == 4);
+    res = avpriv_atomic_int_get_relaxed(&val);
+    av_assert0(res == 5);
 
     return 0;
 }