diff mbox series

[FFmpeg-devel] Stop using _explicit atomic operations where not necessary.

Message ID 20210605142740.7996-1-anton@khirnov.net
State New
Headers show
Series [FFmpeg-devel] Stop using _explicit atomic operations where not necessary. | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Anton Khirnov June 5, 2021, 2:27 p.m. UTC
Memory ordering constraints other than the default (sequentially
consistent) can behave in very unintuitive and unexpected ways, and so
should be avoided unless there is a strong (typically performance)
reason for using them. This holds especially for memory_order_relaxed,
which imposes no ordering constraints.
---
 libavformat/fifo.c |  8 ++++----
 libavutil/cpu.c    |  8 ++++----
 libavutil/mem.c    | 10 +++++-----
 3 files changed, 13 insertions(+), 13 deletions(-)

Comments

James Almer June 5, 2021, 2:46 p.m. UTC | #1
On 6/5/2021 11:27 AM, Anton Khirnov wrote:
> Memory ordering constraints other than the default (sequentially
> consistent) can behave in very unintuitive and unexpected ways, and so
> should be avoided unless there is a strong (typically performance)
> reason for using them. This holds especially for memory_order_relaxed,
> which imposes no ordering constraints.

Performance is important for FIFO, though. Could they use a laxer order 
at least for the fetch_sub and fetch_add cases?

Also, commit fed50c4304e, which made cpu_flags an atomic type, states 
that ThreadSanitizer stopped reporting races on tests/cpu_init after 
that change, so maybe memory order is not important for it, only atomicity.
In any case these cpu and mem functions are hardly a bottleneck 
anywhere, and rarely called outside of debugging, so this change is ok.

> ---
>   libavformat/fifo.c |  8 ++++----
>   libavutil/cpu.c    |  8 ++++----
>   libavutil/mem.c    | 10 +++++-----
>   3 files changed, 13 insertions(+), 13 deletions(-)
> 
> diff --git a/libavformat/fifo.c b/libavformat/fifo.c
> index 50656f78b7..2ac14207b8 100644
> --- a/libavformat/fifo.c
> +++ b/libavformat/fifo.c
> @@ -185,7 +185,7 @@ static int fifo_thread_write_packet(FifoThreadContext *ctx, AVPacket *pkt)
>       int ret, s_idx;
>   
>       if (fifo->timeshift && pkt->dts != AV_NOPTS_VALUE)
> -        atomic_fetch_sub_explicit(&fifo->queue_duration, next_duration(avf, pkt, &ctx->last_received_dts), memory_order_relaxed);
> +        atomic_fetch_sub(&fifo->queue_duration, next_duration(avf, pkt, &ctx->last_received_dts));
>   
>       if (ctx->drop_until_keyframe) {
>           if (pkt->flags & AV_PKT_FLAG_KEY) {
> @@ -454,7 +454,7 @@ static void *fifo_consumer_thread(void *data)
>               av_log(avf, AV_LOG_INFO, "FIFO queue flushed\n");
>   
>           if (fifo->timeshift)
> -            while (atomic_load_explicit(&fifo->queue_duration, memory_order_relaxed) < fifo->timeshift)
> +            while (atomic_load(&fifo->queue_duration) < fifo->timeshift)
>                   av_usleep(10000);
>   
>           ret = av_thread_message_queue_recv(queue, &msg, 0);
> @@ -594,7 +594,7 @@ static int fifo_write_packet(AVFormatContext *avf, AVPacket *pkt)
>       }
>   
>       if (fifo->timeshift && pkt && pkt->dts != AV_NOPTS_VALUE)
> -        atomic_fetch_add_explicit(&fifo->queue_duration, next_duration(avf, pkt, &fifo->last_sent_dts), memory_order_relaxed);
> +        atomic_fetch_add(&fifo->queue_duration, next_duration(avf, pkt, &fifo->last_sent_dts));
>   
>       return ret;
>   fail:
> @@ -621,7 +621,7 @@ static int fifo_write_trailer(AVFormatContext *avf)
>               } else {
>                   now += delay;
>               }
> -            atomic_fetch_add_explicit(&fifo->queue_duration, delay, memory_order_relaxed);
> +            atomic_fetch_add(&fifo->queue_duration, delay);
>               elapsed += delay;
>               if (elapsed > fifo->timeshift)
>                   break;
> diff --git a/libavutil/cpu.c b/libavutil/cpu.c
> index 8960415d00..8a6af81ae4 100644
> --- a/libavutil/cpu.c
> +++ b/libavutil/cpu.c
> @@ -89,15 +89,15 @@ void av_force_cpu_flags(int arg){
>           arg |= AV_CPU_FLAG_MMX;
>       }
>   
> -    atomic_store_explicit(&cpu_flags, arg, memory_order_relaxed);
> +    atomic_store(&cpu_flags, arg);
>   }
>   
>   int av_get_cpu_flags(void)
>   {
> -    int flags = atomic_load_explicit(&cpu_flags, memory_order_relaxed);
> +    int flags = atomic_load(&cpu_flags);
>       if (flags == -1) {
>           flags = get_cpu_flags();
> -        atomic_store_explicit(&cpu_flags, flags, memory_order_relaxed);
> +        atomic_store(&cpu_flags, flags);
>       }
>       return flags;
>   }
> @@ -221,7 +221,7 @@ int av_cpu_count(void)
>       nb_cpus = sysinfo.dwNumberOfProcessors;
>   #endif
>   
> -    if (!atomic_exchange_explicit(&printed, 1, memory_order_relaxed))
> +    if (!atomic_exchange(&printed, 1))
>           av_log(NULL, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
>   
>       return nb_cpus;
> diff --git a/libavutil/mem.c b/libavutil/mem.c
> index a52d33d4a6..c216c0314f 100644
> --- a/libavutil/mem.c
> +++ b/libavutil/mem.c
> @@ -72,14 +72,14 @@ void  free(void *ptr);
>   static atomic_size_t max_alloc_size = ATOMIC_VAR_INIT(INT_MAX);
>   
>   void av_max_alloc(size_t max){
> -    atomic_store_explicit(&max_alloc_size, max, memory_order_relaxed);
> +    atomic_store(&max_alloc_size, max);
>   }
>   
>   void *av_malloc(size_t size)
>   {
>       void *ptr = NULL;
>   
> -    if (size > atomic_load_explicit(&max_alloc_size, memory_order_relaxed))
> +    if (size > atomic_load(&max_alloc_size))
>           return NULL;
>   
>   #if HAVE_POSIX_MEMALIGN
> @@ -135,7 +135,7 @@ void *av_malloc(size_t size)
>   void *av_realloc(void *ptr, size_t size)
>   {
>       void *ret;
> -    if (size > atomic_load_explicit(&max_alloc_size, memory_order_relaxed))
> +    if (size > atomic_load(&max_alloc_size))
>           return NULL;
>   
>   #if HAVE_ALIGNED_MALLOC
> @@ -489,7 +489,7 @@ void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size)
>       if (min_size <= *size)
>           return ptr;
>   
> -    max_size = atomic_load_explicit(&max_alloc_size, memory_order_relaxed);
> +    max_size = atomic_load(&max_alloc_size);
>   
>       if (min_size > max_size) {
>           *size = 0;
> @@ -521,7 +521,7 @@ static inline void fast_malloc(void *ptr, unsigned int *size, size_t min_size, i
>           return;
>       }
>   
> -    max_size = atomic_load_explicit(&max_alloc_size, memory_order_relaxed);
> +    max_size = atomic_load(&max_alloc_size);
>   
>       if (min_size > max_size) {
>           av_freep(ptr);
>
Anton Khirnov June 5, 2021, 4:49 p.m. UTC | #2
Quoting James Almer (2021-06-05 16:46:01)
> On 6/5/2021 11:27 AM, Anton Khirnov wrote:
> > Memory ordering constraints other than the default (sequentially
> > consistent) can behave in very unintuitive and unexpected ways, and so
> > should be avoided unless there is a strong (typically performance)
> > reason for using them. This holds especially for memory_order_relaxed,
> > which imposes no ordering constraints.
> 
> Performance is important for FIFO, though. Could they use a laxer order 
> at least for the fetch_sub and fetch_add cases?

Given that it uses AVThreadMessageQueue to send packets to the consumer
thread, and operations on AVThreadMessageQueue involve a full mutex
lock+unlock, I really doubt that atomic operations would be the
bottleneck here.

> 
> Also, commit fed50c4304e, which made cpu_flags an atomic type, states 
> that ThreadSanitizer stopped reporting races on tests/cpu_init after 
> that change, so maybe memory order is not important for it, only atomicity.
> In any case these cpu and mem functions are hardly a bottleneck 
> anywhere, and rarely called outside of debugging, so this change is ok.

AFAIU tsan will only find actual races. With atomics you will not get
races, but the results might not be what you expect.
Andreas Rheinhardt June 6, 2021, 9:13 a.m. UTC | #3
Anton Khirnov:
> Memory ordering constraints other than the default (sequentially
> consistent) can behave in very unintuitive and unexpected ways, and so
> should be avoided unless there is a strong (typically performance)
> reason for using them. This holds especially for memory_order_relaxed,
> which imposes no ordering constraints.

I disagree with this: Adding ordering constraints is wrong, as it can
provide synchronization between different threads and this can mask
races. Providing synchronization as a byproduct should be avoided: Users
should know what they get. And if a user wants an allocation limit in
thread A to be in effect for allocations in thread B, then the user
should provide the synchronization him/herself.

Here is an example of a program that contains a data race that is masked
by this patch: If all allocations happen before av_max_alloc(3), then
tsan will complain about a data race; otherwise, it is silent, as the
write in av_max_alloc() synchronizes with the read in av_malloc(), so
that race = 1 is visible in the main thread later.


#include <pthread.h>
#include <stdio.h>
#include "libavutil/mem.h"


int race;

static void *second_thread(void *arg)
{
    race = 1;
    av_max_alloc(3);
    return NULL;
}

int main()
{
    pthread_t thread;

    pthread_create(&thread, NULL, second_thread, NULL);

    for (int i = 0; i < 150; i++)
        av_malloc(i);

    printf("%d\n", race);
    pthread_join(thread, NULL);
}

- Andreas

> ---
>  libavformat/fifo.c |  8 ++++----
>  libavutil/cpu.c    |  8 ++++----
>  libavutil/mem.c    | 10 +++++-----
>  3 files changed, 13 insertions(+), 13 deletions(-)
> 
> diff --git a/libavformat/fifo.c b/libavformat/fifo.c
> index 50656f78b7..2ac14207b8 100644
> --- a/libavformat/fifo.c
> +++ b/libavformat/fifo.c
> @@ -185,7 +185,7 @@ static int fifo_thread_write_packet(FifoThreadContext *ctx, AVPacket *pkt)
>      int ret, s_idx;
>  
>      if (fifo->timeshift && pkt->dts != AV_NOPTS_VALUE)
> -        atomic_fetch_sub_explicit(&fifo->queue_duration, next_duration(avf, pkt, &ctx->last_received_dts), memory_order_relaxed);
> +        atomic_fetch_sub(&fifo->queue_duration, next_duration(avf, pkt, &ctx->last_received_dts));
>  
>      if (ctx->drop_until_keyframe) {
>          if (pkt->flags & AV_PKT_FLAG_KEY) {
> @@ -454,7 +454,7 @@ static void *fifo_consumer_thread(void *data)
>              av_log(avf, AV_LOG_INFO, "FIFO queue flushed\n");
>  
>          if (fifo->timeshift)
> -            while (atomic_load_explicit(&fifo->queue_duration, memory_order_relaxed) < fifo->timeshift)
> +            while (atomic_load(&fifo->queue_duration) < fifo->timeshift)
>                  av_usleep(10000);
>  
>          ret = av_thread_message_queue_recv(queue, &msg, 0);
> @@ -594,7 +594,7 @@ static int fifo_write_packet(AVFormatContext *avf, AVPacket *pkt)
>      }
>  
>      if (fifo->timeshift && pkt && pkt->dts != AV_NOPTS_VALUE)
> -        atomic_fetch_add_explicit(&fifo->queue_duration, next_duration(avf, pkt, &fifo->last_sent_dts), memory_order_relaxed);
> +        atomic_fetch_add(&fifo->queue_duration, next_duration(avf, pkt, &fifo->last_sent_dts));
>  
>      return ret;
>  fail:
> @@ -621,7 +621,7 @@ static int fifo_write_trailer(AVFormatContext *avf)
>              } else {
>                  now += delay;
>              }
> -            atomic_fetch_add_explicit(&fifo->queue_duration, delay, memory_order_relaxed);
> +            atomic_fetch_add(&fifo->queue_duration, delay);
>              elapsed += delay;
>              if (elapsed > fifo->timeshift)
>                  break;
> diff --git a/libavutil/cpu.c b/libavutil/cpu.c
> index 8960415d00..8a6af81ae4 100644
> --- a/libavutil/cpu.c
> +++ b/libavutil/cpu.c
> @@ -89,15 +89,15 @@ void av_force_cpu_flags(int arg){
>          arg |= AV_CPU_FLAG_MMX;
>      }
>  
> -    atomic_store_explicit(&cpu_flags, arg, memory_order_relaxed);
> +    atomic_store(&cpu_flags, arg);
>  }
>  
>  int av_get_cpu_flags(void)
>  {
> -    int flags = atomic_load_explicit(&cpu_flags, memory_order_relaxed);
> +    int flags = atomic_load(&cpu_flags);
>      if (flags == -1) {
>          flags = get_cpu_flags();
> -        atomic_store_explicit(&cpu_flags, flags, memory_order_relaxed);
> +        atomic_store(&cpu_flags, flags);
>      }
>      return flags;
>  }
> @@ -221,7 +221,7 @@ int av_cpu_count(void)
>      nb_cpus = sysinfo.dwNumberOfProcessors;
>  #endif
>  
> -    if (!atomic_exchange_explicit(&printed, 1, memory_order_relaxed))
> +    if (!atomic_exchange(&printed, 1))
>          av_log(NULL, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
>  
>      return nb_cpus;
> diff --git a/libavutil/mem.c b/libavutil/mem.c
> index a52d33d4a6..c216c0314f 100644
> --- a/libavutil/mem.c
> +++ b/libavutil/mem.c
> @@ -72,14 +72,14 @@ void  free(void *ptr);
>  static atomic_size_t max_alloc_size = ATOMIC_VAR_INIT(INT_MAX);
>  
>  void av_max_alloc(size_t max){
> -    atomic_store_explicit(&max_alloc_size, max, memory_order_relaxed);
> +    atomic_store(&max_alloc_size, max);
>  }
>  
>  void *av_malloc(size_t size)
>  {
>      void *ptr = NULL;
>  
> -    if (size > atomic_load_explicit(&max_alloc_size, memory_order_relaxed))
> +    if (size > atomic_load(&max_alloc_size))
>          return NULL;
>  
>  #if HAVE_POSIX_MEMALIGN
> @@ -135,7 +135,7 @@ void *av_malloc(size_t size)
>  void *av_realloc(void *ptr, size_t size)
>  {
>      void *ret;
> -    if (size > atomic_load_explicit(&max_alloc_size, memory_order_relaxed))
> +    if (size > atomic_load(&max_alloc_size))
>          return NULL;
>  
>  #if HAVE_ALIGNED_MALLOC
> @@ -489,7 +489,7 @@ void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size)
>      if (min_size <= *size)
>          return ptr;
>  
> -    max_size = atomic_load_explicit(&max_alloc_size, memory_order_relaxed);
> +    max_size = atomic_load(&max_alloc_size);
>  
>      if (min_size > max_size) {
>          *size = 0;
> @@ -521,7 +521,7 @@ static inline void fast_malloc(void *ptr, unsigned int *size, size_t min_size, i
>          return;
>      }
>  
> -    max_size = atomic_load_explicit(&max_alloc_size, memory_order_relaxed);
> +    max_size = atomic_load(&max_alloc_size);
>  
>      if (min_size > max_size) {
>          av_freep(ptr);
>
Andreas Rheinhardt June 6, 2021, 11:39 a.m. UTC | #4
James Almer:
> On 6/5/2021 11:27 AM, Anton Khirnov wrote:
>> Memory ordering constraints other than the default (sequentially
>> consistent) can behave in very unintuitive and unexpected ways, and so
>> should be avoided unless there is a strong (typically performance)
>> reason for using them. This holds especially for memory_order_relaxed,
>> which imposes no ordering constraints.
> 
> Performance is important for FIFO, though. Could they use a laxer order
> at least for the fetch_sub and fetch_add cases?
> 
> Also, commit fed50c4304e, which made cpu_flags an atomic type, states
> that ThreadSanitizer stopped reporting races on tests/cpu_init after
> that change, so maybe memory order is not important for it, only atomicity.

This is true: There can by definition be no data race involving an
atomic variable that is exclusively accessed via atomic operations: "The
execution of a program contains a data race if it contains two
conflicting actions in different threads, at least one of which is *not
atomic*, and neither happens before the other." (C11, 5.1.2.4.25)

Therefore fed50c4304eecb352e29ce789cdb96ea84d6162f and
a2a38b160620d91bc3f895dadc4501c589998b9c fixed data races (the latter
also a race condition); using sequentially consistent ordering meanwhile
brings no gain: It may provide synchronization between two threads
calling e.g. av_get_cpu_flags(), but the only effect of this is that
tsan might overlook buggy code that actually contains races. It notably
does not really establish an ordering: If two threads call
av_cpu_count() concurrently, then it is indeterminate which of the two
threads will call av_log (if it has not already been called), but with
this patch it is certain that the second thread synchronizes with the
first thread and sees everything that the first thread has done so far
(which can mask bugs); and if we look at the test in
libavutil/tests/cpu_init.c, then all the possibilities that existed with
memory_order_relaxed also exist with memory_order_seq_cst:
 - Thread 1 can read first, see that cpu_flags is uninitialized and
initialize it, followed by thread 2 seeing and reusing the already
initialized value.
 - Thread 1 can read first, followed by thread 2 reading the value (with
both threads seeing that cpu_flags is uninitialized), followed by thread
1 storing its value, followed by thread 2 overwriting the value again;
the order of writes can also be the other way around.
 - Same as the last one, but with the order of stores reversed.
 - Exchanging the roles of thread 1 and thread 2 in the above scenarios
leads to the remaining cases.

> In any case these cpu and mem functions are hardly a bottleneck
> anywhere, and rarely called outside of debugging, so this change is ok.
> 
Mem functions are rarely called outside of debugging?

>> ---
>>   libavformat/fifo.c |  8 ++++----

The threading in fifo.c is weird: A variable (overflow_flag) that is
guarded by a mutex is marked as volatile; moreover it appears that one
could avoid said mutex altogether by making overflow_flag atomic. And
finally, the worker thread is only destroyed in write_trailer, but if
this is never called, then it will never be destroyed. Furthermore, in
this case the worker thread might wait on a condition variable that is
destroyed in av_thread_message_queue_free(), which is undefined behaviour.
I don't know whether the accesses to queue_duration need a
memory_ordering different than memory_order_relaxed.

>>   libavutil/cpu.c    |  8 ++++----
>>   libavutil/mem.c    | 10 +++++-----
>>   3 files changed, 13 insertions(+), 13 deletions(-)
>>
>> diff --git a/libavformat/fifo.c b/libavformat/fifo.c
>> index 50656f78b7..2ac14207b8 100644
>> --- a/libavformat/fifo.c
>> +++ b/libavformat/fifo.c
>> @@ -185,7 +185,7 @@ static int
>> fifo_thread_write_packet(FifoThreadContext *ctx, AVPacket *pkt)
>>       int ret, s_idx;
>>         if (fifo->timeshift && pkt->dts != AV_NOPTS_VALUE)
>> -        atomic_fetch_sub_explicit(&fifo->queue_duration,
>> next_duration(avf, pkt, &ctx->last_received_dts), memory_order_relaxed);
>> +        atomic_fetch_sub(&fifo->queue_duration, next_duration(avf,
>> pkt, &ctx->last_received_dts));
>>         if (ctx->drop_until_keyframe) {
>>           if (pkt->flags & AV_PKT_FLAG_KEY) {
>> @@ -454,7 +454,7 @@ static void *fifo_consumer_thread(void *data)
>>               av_log(avf, AV_LOG_INFO, "FIFO queue flushed\n");
>>             if (fifo->timeshift)
>> -            while (atomic_load_explicit(&fifo->queue_duration,
>> memory_order_relaxed) < fifo->timeshift)
>> +            while (atomic_load(&fifo->queue_duration) < fifo->timeshift)
>>                   av_usleep(10000);
>>             ret = av_thread_message_queue_recv(queue, &msg, 0);
>> @@ -594,7 +594,7 @@ static int fifo_write_packet(AVFormatContext *avf,
>> AVPacket *pkt)
>>       }
>>         if (fifo->timeshift && pkt && pkt->dts != AV_NOPTS_VALUE)
>> -        atomic_fetch_add_explicit(&fifo->queue_duration,
>> next_duration(avf, pkt, &fifo->last_sent_dts), memory_order_relaxed);
>> +        atomic_fetch_add(&fifo->queue_duration, next_duration(avf,
>> pkt, &fifo->last_sent_dts));
>>         return ret;
>>   fail:
>> @@ -621,7 +621,7 @@ static int fifo_write_trailer(AVFormatContext *avf)
>>               } else {
>>                   now += delay;
>>               }
>> -            atomic_fetch_add_explicit(&fifo->queue_duration, delay,
>> memory_order_relaxed);
>> +            atomic_fetch_add(&fifo->queue_duration, delay);
>>               elapsed += delay;
>>               if (elapsed > fifo->timeshift)
>>                   break;
>> diff --git a/libavutil/cpu.c b/libavutil/cpu.c
>> index 8960415d00..8a6af81ae4 100644
>> --- a/libavutil/cpu.c
>> +++ b/libavutil/cpu.c
>> @@ -89,15 +89,15 @@ void av_force_cpu_flags(int arg){
>>           arg |= AV_CPU_FLAG_MMX;
>>       }
>>   -    atomic_store_explicit(&cpu_flags, arg, memory_order_relaxed);
>> +    atomic_store(&cpu_flags, arg);
>>   }
>>     int av_get_cpu_flags(void)
>>   {
>> -    int flags = atomic_load_explicit(&cpu_flags, memory_order_relaxed);
>> +    int flags = atomic_load(&cpu_flags);
>>       if (flags == -1) {
>>           flags = get_cpu_flags();
>> -        atomic_store_explicit(&cpu_flags, flags, memory_order_relaxed);
>> +        atomic_store(&cpu_flags, flags);
>>       }
>>       return flags;
>>   }
>> @@ -221,7 +221,7 @@ int av_cpu_count(void)
>>       nb_cpus = sysinfo.dwNumberOfProcessors;
>>   #endif
>>   -    if (!atomic_exchange_explicit(&printed, 1, memory_order_relaxed))
>> +    if (!atomic_exchange(&printed, 1))
>>           av_log(NULL, AV_LOG_DEBUG, "detected %d logical cores\n",
>> nb_cpus);
>>         return nb_cpus;
>> diff --git a/libavutil/mem.c b/libavutil/mem.c
>> index a52d33d4a6..c216c0314f 100644
>> --- a/libavutil/mem.c
>> +++ b/libavutil/mem.c
>> @@ -72,14 +72,14 @@ void  free(void *ptr);
>>   static atomic_size_t max_alloc_size = ATOMIC_VAR_INIT(INT_MAX);
>>     void av_max_alloc(size_t max){
>> -    atomic_store_explicit(&max_alloc_size, max, memory_order_relaxed);
>> +    atomic_store(&max_alloc_size, max);
>>   }
>>     void *av_malloc(size_t size)
>>   {
>>       void *ptr = NULL;
>>   -    if (size > atomic_load_explicit(&max_alloc_size,
>> memory_order_relaxed))
>> +    if (size > atomic_load(&max_alloc_size))
>>           return NULL;
>>     #if HAVE_POSIX_MEMALIGN
>> @@ -135,7 +135,7 @@ void *av_malloc(size_t size)
>>   void *av_realloc(void *ptr, size_t size)
>>   {
>>       void *ret;
>> -    if (size > atomic_load_explicit(&max_alloc_size,
>> memory_order_relaxed))
>> +    if (size > atomic_load(&max_alloc_size))
>>           return NULL;
>>     #if HAVE_ALIGNED_MALLOC
>> @@ -489,7 +489,7 @@ void *av_fast_realloc(void *ptr, unsigned int
>> *size, size_t min_size)
>>       if (min_size <= *size)
>>           return ptr;
>>   -    max_size = atomic_load_explicit(&max_alloc_size,
>> memory_order_relaxed);
>> +    max_size = atomic_load(&max_alloc_size);
>>         if (min_size > max_size) {
>>           *size = 0;
>> @@ -521,7 +521,7 @@ static inline void fast_malloc(void *ptr, unsigned
>> int *size, size_t min_size, i
>>           return;
>>       }
>>   -    max_size = atomic_load_explicit(&max_alloc_size,
>> memory_order_relaxed);
>> +    max_size = atomic_load(&max_alloc_size);
>>         if (min_size > max_size) {
>>           av_freep(ptr);
>>
>
James Almer June 6, 2021, 12:17 p.m. UTC | #5
On 6/6/2021 8:39 AM, Andreas Rheinhardt wrote:
> James Almer:
>> On 6/5/2021 11:27 AM, Anton Khirnov wrote:
>>> Memory ordering constraints other than the default (sequentially
>>> consistent) can behave in very unintuitive and unexpected ways, and so
>>> should be avoided unless there is a strong (typically performance)
>>> reason for using them. This holds especially for memory_order_relaxed,
>>> which imposes no ordering constraints.
>>
>> Performance is important for FIFO, though. Could they use a laxer order
>> at least for the fetch_sub and fetch_add cases?
>>
>> Also, commit fed50c4304e, which made cpu_flags an atomic type, states
>> that ThreadSanitizer stopped reporting races on tests/cpu_init after
>> that change, so maybe memory order is not important for it, only atomicity.
> 
> This is true: There can by definition be no data race involving an
> atomic variable that is exclusively accessed via atomic operations: "The
> execution of a program contains a data race if it contains two
> conflicting actions in different threads, at least one of which is *not
> atomic*, and neither happens before the other." (C11, 5.1.2.4.25)
> 
> Therefore fed50c4304eecb352e29ce789cdb96ea84d6162f and
> a2a38b160620d91bc3f895dadc4501c589998b9c fixed data races (the latter
> also a race condition); using sequentially consistent ordering meanwhile
> brings no gain: It may provide synchronization between two threads
> calling e.g. av_get_cpu_flags(), but the only effect of this is that
> tsan might overlook buggy code that actually contains races. It notably
> does not really establish an ordering: If two threads call
> av_cpu_count() concurrently, then it is indeterminate which of the two
> threads will call av_log (if it has not already been called), but with
> this patch it is certain that the second thread synchronizes with the
> first thread and sees everything that the first thread has done so far
> (which can mask bugs); and if we look at the test in
> libavutil/tests/cpu_init.c, then all the possibilities that existed with
> memory_order_relaxed also exist with memory_order_seq_cst:
>   - Thread 1 can read first, see that cpu_flags is uninitialized and
> initialize it, followed by thread 2 seeing and reusing the already
> initialized value.
>   - Thread 1 can read first, followed by thread 2 reading the value (with
> both threads seeing that cpu_flags is uninitialized), followed by thread
> 1 storing its value, followed by thread 2 overwriting the value again;
> the order of writes can also be the other way around.
>   - Same as the last one, but with the order of stores reversed.
>   - Exchanging the roles of thread 1 and thread 2 in the above scenarios
> leads to the remaining cases.
> 
>> In any case these cpu and mem functions are hardly a bottleneck
>> anywhere, and rarely called outside of debugging, so this change is ok.
>>
> Mem functions are rarely called outside of debugging?

I was specifically thinking about av_max_alloc(), but overlooked 
the fact that max_alloc_size is read by av_malloc() and co, which are 
obviously not for debug only. Sorry.
Anton Khirnov June 6, 2021, 12:54 p.m. UTC | #6
Quoting Andreas Rheinhardt (2021-06-06 11:13:00)
> Anton Khirnov:
> > Memory ordering constraints other than the default (sequentially
> > consistent) can behave in very unintuitive and unexpected ways, and so
> > should be avoided unless there is a strong (typically performance)
> > reason for using them. This holds especially for memory_order_relaxed,
> > which imposes no ordering constraints.
> 
> I disagree with this: Adding ordering constraints is wrong, as it can
> provide synchronization between different threads and this can mask
> races. Providing synchronization as a byproduct should be avoided: Users
> should know what they get. And if a user wants an allocation limit in
> thread A to be in effect for allocations in thread B, then the user
> should provide the synchronization him/herself.

I do not agree. C11 default is sequential consistency, therefore our
public API should provide sequential consistency.

My other concern is that people are cargo culting this code around
without understanding what it really does. Again, sequentially
consistent atomics make that safer, since they do what you'd expect them
to.

> 
> Here is an example of a program that contains a data race that is masked
> by this patch: If all allocations happen before av_max_alloc(3), then
> tsan will complain about a data race; otherwise, it is silent, as the
> write in av_max_alloc() synchronizes with the read in av_malloc(), so
> that race = 1 is visible in the main thread later.

This example is highly artificial and IMO not relevant, since
- nothing in the libraries should call av_max_alloc() or other
  functions that modify global state
- it is not our job to look for races in user programs
Andreas Rheinhardt June 6, 2021, 5:20 p.m. UTC | #7
Anton Khirnov:
> Quoting Andreas Rheinhardt (2021-06-06 11:13:00)
>> Anton Khirnov:
>>> Memory ordering constraints other than the default (sequentially
>>> consistent) can behave in very unintuitive and unexpected ways, and so
>>> should be avoided unless there is a strong (typically performance)
>>> reason for using them. This holds especially for memory_order_relaxed,
>>> which imposes no ordering constraints.
>>
>> I disagree with this: Adding ordering constraints is wrong, as it can
>> provide synchronization between different threads and this can mask
>> races. Providing synchronization as a byproduct should be avoided: Users
>> should know what they get. And if a user wants an allocation limit in
>> thread A to be in effect for allocations in thread B, then the user
>> should provide the synchronization him/herself.
> 
> I do not agree. C11 default is sequential consistency, therefore our
> public API should provide sequential consistency.
> 

1. The atomics here are not part of the public API at all: The
documentation of none of the functions involved mentions atomics or
synchronization etc. at all (which means that if a user wants
synchronization between threads, he should take care of it himself). We
could change the implementation to not use atomics at all (e.g. by
wrapping everything in mutexes, which needn't be sequentially consistent).
(Furthermore, in case of libavformat/fifo.c this does not even come
close to affecting the public API, given that we are talking about
synchronization between the user thread and a thread owned by us; in
case of av_cpu_count() the atomics only exist because of an av_log(),
there is no reason for a user to believe that synchronization is even
involved.)
2. I don't buy the argument:
Which memory ordering to use should be decided based upon what one wants
and needs; one should not blindly use one because it is the default or
recommended by some authority (doing so smells like cargo cult). In case
of av_get_cpu_flags()+av_force_cpu_flags(), all one wants is that one
can just load and write the value; since no ordering is required, all
one needs is that every load yields a valid value (i.e. no torn
reads/writes) and that multiple threads can call this function
simultaneously without
races. Atomics provide this even with memory_order_relaxed. In case of
av_cpu_count() all that is desired is that av_log() is only called once;
an atomic_exchange provides this automatically, even with
memory_order_relaxed. The av_max_alloc()+av_malloc/... combination is
just like av_get/force_cpu_flags().


> My other concern is that people are cargo culting this code around
> without understanding what it really does. Again, sequentially
> consistent atomics make that safer, since they do what you'd expect them
> to.
> 

Are you really advocating that we write our code in such a way that
makes it safe for cargo-culters to copy it?

>>
>> Here is an example of a program that contains a data race that is masked
>> by this patch: If all allocations happen before av_max_alloc(3), then
>> tsan will complain about a data race; otherwise, it is silent, as the
>> write in av_max_alloc() synchronizes with the read in av_malloc(), so
>> that race = 1 is visible in the main thread later.
> 
> This example is highly artificial and IMO not relevant, since
> - nothing in the libraries should call av_max_alloc() or other
>   functions that modify global state

Here is an example with a function that we actually use:


#include <pthread.h>
#include <stdio.h>
#include "libavutil/cpu.h"
#include "libavutil/mem.h"

int race;

static void *second_thread(void *arg)
{
    race = 1;
    printf("Second thread CPU count %d\n", av_cpu_count());
    return NULL;
}

int main()
{
    pthread_t thread;

    pthread_create(&thread, NULL, second_thread, NULL);

    /* Do some work/waste some time, exercising the atomic load of
     * max_alloc_size inside av_malloc().  The allocations are
     * intentionally leaked — this is a throwaway race demonstration,
     * not production code. */
    for (int i = 0; i < 150; i++)
        av_malloc(i);

    int count = av_cpu_count();
    /* Unsynchronized read of 'race' — races with the write in
     * second_thread(); a race detector should flag this unless the
     * library's seq_cst atomics happen to mask it. */
    printf("%d, count %d\n", race, count);
    pthread_join(thread, NULL);
}

(The difference between memory_order_seq_cst and using pairs of
memory_order_release-memory_order_acquire (or memory_order_acq_rel) is
that there is a single (i.e. global) total order for all
memory_order_seq_cst operations; so using these is a bit like a
modification of global state.)

> - it is not our job to look for races in user programs
> 
True, but making it more difficult for users to find races is not nice.

- Andreas
Anton Khirnov June 10, 2021, 8:20 a.m. UTC | #8
Quoting Andreas Rheinhardt (2021-06-06 19:20:38)
> Anton Khirnov:
> > Quoting Andreas Rheinhardt (2021-06-06 11:13:00)
> >> Anton Khirnov:
> >>> Memory ordering constraints other than the default (sequentially
> >>> consistent) can behave in very unintuitive and unexpected ways, and so
> >>> should be avoided unless there is a strong (typically performance)
> >>> reason for using them. This holds especially for memory_order_relaxed,
> >>> which imposes no ordering constraints.
> >>
> >> I disagree with this: Adding ordering constraints is wrong, as it can
> >> provide synchronization between different threads and this can mask
> >> races. Providing synchronization as a byproduct should be avoided: Users
> >> should know what they get. And if a user wants an allocation limit in
> >> thread A to be in effect for allocations in thread B, then the user
> >> should provide the synchronization him/herself.
> > 
> > I do not agree. C11 default is sequential consistency, therefore our
> > public API should provide sequential consistency.
> > 
> 
> 1. The atomics here are not part of the public API at all: The
> documentation of none of the functions involved mentions atomics or
> synchronization etc. at all (which means that if a user wants
> synchronization between threads, he should take care of it himself). We
> could change the implementation to not use atomics at all (e.g. by
> wrapping everything in mutexes, which needn't be sequentially consistent).
> (Furthermore, in case of libavformat/fifo.c this does not even come
> close to affecting the public API, given that we are talking about
> synchronization between the user thread and a thread owned by us; in
> case of av_cpu_count() the atomics only exist because of an av_log(),
> there is no reason for a user to believe that synchronization is even
> involved.)
> 2. I don't buy the argument:
> Which memory ordering to use should be decided based upon what one wants
> and needs; one should not blindly use one because it is the default or
> recommended by some authority (doing so smells like cargo cult).

That is an overly idealistic view of the situation. I think very few
people around here have an indepth understanding of concurrency issues,
e.g. of the difference between using memory_order_seq_cst and
memory_order_relaxed. This stuff is complicated and unintuitive,
according to pretty much everyone who ever dealt with it.

And you cannot expect that everyone will study this topic in detail just
because they need to touch an atomic operation. So it seems perfectly
reasonable to me to defer to a suitable authority (like the people who
designed c11 atomics) and use their recommended defaults unless there is
a strong reason to do otherwise.

> In case
> of av_get_cpu_flags()+av_force_cpu_flags(), all one wants is that one
> can just load and write the value; given that there is no restriction
> on the stored value, every load loads a valid value (i.e. no torn
> reads/writes) and one wants that one
> can access this function by multiple threads simultaneously without
> races. Atomics provide this even with memory_order_relaxed. In case of
> av_cpu_count() all that is desired is that av_log() is only called once;
> an atomic_exchange provides this automatically, even with
> memory_order_relaxed. The av_max_alloc()+av_malloc/... combination is
> just like av_get/force_cpu_flags().
> 
> 
> > My other concern is that people are cargo culting this code around
> > without understanding what it really does. Again, sequentially
> > consistent atomics make that safer, since they do what you'd expect them
> > to.
> > 
> 
> Are you really advocating that we write our code in such a way that
> makes it safe for cargo-culters to copy it?

Yes I am.

People copy code around, that's just a fact of life. Two separate people
told me recently that their patches used memory_order_relaxed only
because they based them off some other code that was already using it.

So IMO we should strive to write code that is not only safe and correct,
but also robust against future modifications and being copied around.
That includes not using potentially-unsafe operations without a good
reason.

There is also the question of code readability. When I see
atomic_load(&foo);
then it's just an atomic load. But when I see
atomic_load_explicit(&foo, memory_order_bar);
I also need to think about the reason the author chose to use that
specific memory order.

As for your example program, you could also just remove the av_log()
from av_cpu_count(). Which should be done anyway because av_log(NULL, ...)
is evil.
Andreas Rheinhardt June 12, 2021, 4:04 p.m. UTC | #9
Anton Khirnov:
> Quoting Andreas Rheinhardt (2021-06-06 19:20:38)
>> Anton Khirnov:
>>> Quoting Andreas Rheinhardt (2021-06-06 11:13:00)
>>>> Anton Khirnov:
>>>>> Memory ordering constraints other than the default (sequentially
>>>>> consistent) can behave in very unintuitive and unexpected ways, and so
>>>>> should be avoided unless there is a strong (typically performance)
>>>>> reason for using them. This holds especially for memory_order_relaxed,
>>>>> which imposes no ordering constraints.
>>>>
>>>> I disagree with this: Adding ordering constraints is wrong, as it can
>>>> provide synchronization between different threads and this can mask
>>>> races. Providing synchronization as a byproduct should be avoided: Users
>>>> should know what they get. And if a user wants an allocation limit in
>>>> thread A to be in effect for allocations in thread B, then the user
>>>> should provide the synchronization him/herself.
>>>
>>> I do not agree. C11 default is sequential consistency, therefore our
>>> public API should provide sequential consistency.
>>>
>>
>> 1. The atomics here are not part of the public API at all: The
>> documentation of none of the functions involved mentions atomics or
>> synchronization etc. at all (which means that if a user wants
>> synchronization between threads, he should take care of it himself). We
>> could change the implementation to not use atomics at all (e.g. by
>> wrapping everything in mutexes, which needn't be sequentially consistent).
>> (Furthermore, in case of libavformat/fifo.c this does not even come
>> close to affecting the public API, given that we are talking about
>> synchronization between the user thread and a thread owned by us; in
>> case of av_cpu_count() the atomics only exist because of an av_log(),
>> there is no reason for a user to believe that synchronization is even
>> involved.)
>> 2. I don't buy the argument:
>> Which memory ordering to use should be decided based upon what one wants
>> and needs; one should not blindly use one because it is the default or
>> recommended by some authority (doing so smells like cargo cult).
> 
> That is an overly idealistic view of the situation. I think very few
> people around here have an indepth understanding of concurrency issues,
> e.g. of the difference between using memory_order_seq_cst and
> memory_order_relaxed. This stuff is complicated and unintuitive,
> according to pretty much everyone who ever dealt with it.
> 
> And you cannot expect that everyone will study this topic in detail just
> because they need to touch an atomic operation. So it seems perfectly
> reasonable to me to defer to a suitable authority (like the people who
> designed c11 atomics) and use their recommended defaults unless there is
> a strong reason to do otherwise.
> 
>> In case
>> of av_get_cpu_flags()+av_force_cpu_flags(), all one wants is that one
>> can just load and write the value; given that there is no restriction
>> on the stored value, every load loads a valid value (i.e. no torn
>> reads/writes) and one wants that one
>> can access this function by multiple threads simultaneously without
>> races. Atomics provide this even with memory_order_relaxed. In case of
>> av_cpu_count() all that is desired is that av_log() is only called once;
>> an atomic_exchange provides this automatically, even with
>> memory_order_relaxed. The av_max_alloc()+av_malloc/... combination is
>> just like av_get/force_cpu_flags().
>>
>>
>>> My other concern is that people are cargo culting this code around
>>> without understanding what it really does. Again, sequentially
>>> consistent atomics make that safer, since they do what you'd expect them
>>> to.
>>>
>>
>> Are you really advocating that we write our code in such a way that
>> makes it safe for cargo-culters to copy it?
> 
> Yes I am.
> 
> People copy code around, that's just a fact of life. Two separate people
> told me recently that their patches used memory_order_relaxed only
> because they based them off some other code that was already using it.
> 
> So IMO we should strive to write code that is not only safe and correct,
> but also robust against future modifications and being copied around.
> That includes not using potentially-unsafe operations without a good
> reason.

IMO I gave a good reason for the cases considered: None of these
libavutil functions are intended to provide synchronization; they are
actually incapable of doing so and trying to do so makes no sense and
may mask bugs.

> 
> There is also the question of code readability. When I see
> atomic_load(&foo);
> then it's just an atomic load. But when I see
> atomic_load_explicit(&foo, memory_order_bar);
> I also need to think about the reason the author chose to use that
> specific memory order.

Strange, it's the other way around for me: If I see a
atomic_load/store(), I immediately ask myself: "Is the global total
order of modifications really necessary/intended? Does this code work if
one uses any of our compat/atomics that translates an atomic_load to
something weaker?"

> 
> As for your example program, you could also just remove the av_log()
> from av_cpu_count(). Which should be done anyway because av_log(NULL is
> evil.
> 
Another sample program where a race condition can be masked by your
changes; it has nothing to do with av_log(NULL, ...):

#include <pthread.h>
#include <stdio.h>
#include "libavutil/cpu.h"
#include "libavutil/mem.h"

/* Plain (non-atomic) global written by the second thread and read by the
 * main thread without any synchronization: the DELIBERATE data race this
 * second example demonstrates. */
int race;

/* Second-thread entry point: performs the unsynchronized write to 'race',
 * then calls av_get_cpu_flags().  As with the earlier example, seq_cst
 * atomics inside av_get_cpu_flags()/av_malloc() can accidentally
 * synchronize the threads and hide this race from race detectors. */
static void *second_thread(void *arg)
{
    race = 1;
    printf("Second thread CPU flags %x\n", av_get_cpu_flags());
    return NULL;
}

int main()
{
    pthread_t thread;

    pthread_create(&thread, NULL, second_thread, NULL);

    /* Do some work/waste some time, exercising the atomic load of
     * max_alloc_size inside av_malloc().  The allocations are
     * intentionally leaked — this is a throwaway race demonstration. */
    for (int i = 0; i < 200; i++)
        av_malloc(i);

    int flags = av_get_cpu_flags();
    /* Unsynchronized read of 'race' — races with the write in
     * second_thread(); a race detector should flag this unless the
     * library's seq_cst atomics happen to mask it. */
    printf("%d, flags %x\n", race, flags);
    pthread_join(thread, NULL);
}

(Orthogonally to the discussion about memory_order: The implementation
of av_get_cpu_flags() allows to overwrite cpu_flags that have already
been set, even when the earlier cpu_flags have been set by
av_force_cpu_flags(); in a scenario in which thread A calls
av_get_cpu_flags() and thread B calls av_force_cpu_flags() and later
av_get_cpu_flags(), then thread B might have the legitimate expectation
that the cpu flags it has forced are in effect for its later
av_get_cpu_flags() call if there is no second call to
av_force_cpu_flags(); yet this needn't be so, namely if the write of
thread B happens between the read and the write in thread A. Maybe we
should use atomic_compare_exchange_strong(_explicit) and only overwrite
cpu_flags if it is still -1?)

- Andreas
diff mbox series

Patch

diff --git a/libavformat/fifo.c b/libavformat/fifo.c
index 50656f78b7..2ac14207b8 100644
--- a/libavformat/fifo.c
+++ b/libavformat/fifo.c
@@ -185,7 +185,7 @@  static int fifo_thread_write_packet(FifoThreadContext *ctx, AVPacket *pkt)
     int ret, s_idx;
 
     if (fifo->timeshift && pkt->dts != AV_NOPTS_VALUE)
-        atomic_fetch_sub_explicit(&fifo->queue_duration, next_duration(avf, pkt, &ctx->last_received_dts), memory_order_relaxed);
+        atomic_fetch_sub(&fifo->queue_duration, next_duration(avf, pkt, &ctx->last_received_dts));
 
     if (ctx->drop_until_keyframe) {
         if (pkt->flags & AV_PKT_FLAG_KEY) {
@@ -454,7 +454,7 @@  static void *fifo_consumer_thread(void *data)
             av_log(avf, AV_LOG_INFO, "FIFO queue flushed\n");
 
         if (fifo->timeshift)
-            while (atomic_load_explicit(&fifo->queue_duration, memory_order_relaxed) < fifo->timeshift)
+            while (atomic_load(&fifo->queue_duration) < fifo->timeshift)
                 av_usleep(10000);
 
         ret = av_thread_message_queue_recv(queue, &msg, 0);
@@ -594,7 +594,7 @@  static int fifo_write_packet(AVFormatContext *avf, AVPacket *pkt)
     }
 
     if (fifo->timeshift && pkt && pkt->dts != AV_NOPTS_VALUE)
-        atomic_fetch_add_explicit(&fifo->queue_duration, next_duration(avf, pkt, &fifo->last_sent_dts), memory_order_relaxed);
+        atomic_fetch_add(&fifo->queue_duration, next_duration(avf, pkt, &fifo->last_sent_dts));
 
     return ret;
 fail:
@@ -621,7 +621,7 @@  static int fifo_write_trailer(AVFormatContext *avf)
             } else {
                 now += delay;
             }
-            atomic_fetch_add_explicit(&fifo->queue_duration, delay, memory_order_relaxed);
+            atomic_fetch_add(&fifo->queue_duration, delay);
             elapsed += delay;
             if (elapsed > fifo->timeshift)
                 break;
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 8960415d00..8a6af81ae4 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -89,15 +89,15 @@  void av_force_cpu_flags(int arg){
         arg |= AV_CPU_FLAG_MMX;
     }
 
-    atomic_store_explicit(&cpu_flags, arg, memory_order_relaxed);
+    atomic_store(&cpu_flags, arg);
 }
 
 int av_get_cpu_flags(void)
 {
-    int flags = atomic_load_explicit(&cpu_flags, memory_order_relaxed);
+    int flags = atomic_load(&cpu_flags);
     if (flags == -1) {
         flags = get_cpu_flags();
-        atomic_store_explicit(&cpu_flags, flags, memory_order_relaxed);
+        atomic_store(&cpu_flags, flags);
     }
     return flags;
 }
@@ -221,7 +221,7 @@  int av_cpu_count(void)
     nb_cpus = sysinfo.dwNumberOfProcessors;
 #endif
 
-    if (!atomic_exchange_explicit(&printed, 1, memory_order_relaxed))
+    if (!atomic_exchange(&printed, 1))
         av_log(NULL, AV_LOG_DEBUG, "detected %d logical cores\n", nb_cpus);
 
     return nb_cpus;
diff --git a/libavutil/mem.c b/libavutil/mem.c
index a52d33d4a6..c216c0314f 100644
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@@ -72,14 +72,14 @@  void  free(void *ptr);
 static atomic_size_t max_alloc_size = ATOMIC_VAR_INIT(INT_MAX);
 
 void av_max_alloc(size_t max){
-    atomic_store_explicit(&max_alloc_size, max, memory_order_relaxed);
+    atomic_store(&max_alloc_size, max);
 }
 
 void *av_malloc(size_t size)
 {
     void *ptr = NULL;
 
-    if (size > atomic_load_explicit(&max_alloc_size, memory_order_relaxed))
+    if (size > atomic_load(&max_alloc_size))
         return NULL;
 
 #if HAVE_POSIX_MEMALIGN
@@ -135,7 +135,7 @@  void *av_malloc(size_t size)
 void *av_realloc(void *ptr, size_t size)
 {
     void *ret;
-    if (size > atomic_load_explicit(&max_alloc_size, memory_order_relaxed))
+    if (size > atomic_load(&max_alloc_size))
         return NULL;
 
 #if HAVE_ALIGNED_MALLOC
@@ -489,7 +489,7 @@  void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size)
     if (min_size <= *size)
         return ptr;
 
-    max_size = atomic_load_explicit(&max_alloc_size, memory_order_relaxed);
+    max_size = atomic_load(&max_alloc_size);
 
     if (min_size > max_size) {
         *size = 0;
@@ -521,7 +521,7 @@  static inline void fast_malloc(void *ptr, unsigned int *size, size_t min_size, i
         return;
     }
 
-    max_size = atomic_load_explicit(&max_alloc_size, memory_order_relaxed);
+    max_size = atomic_load(&max_alloc_size);
 
     if (min_size > max_size) {
         av_freep(ptr);