Message ID | 20220810225154.8435-1-timo@rothenpieler.org |
---|---|
State | New |
Headers | show |
Series | None | expand |
On 8/10/2022 7:51 PM, Timo Rothenpieler wrote: > _Float16 support was available on arm/aarch64 for a while, and with gcc > 12 was enabled on x86 as long as SSE2 is supported. > > If the target arch supports f16c, gcc emits fairly efficient assembly, > taking advantage of it. This is the case on x86-64-v3 or higher. > Same goes on arm, which has native float16 support. > On x86, without f16c, it emulates it in software using sse2 instructions. > > This has been shown to perform rather poorly: > > _Float16 full SSE2 emulation: > frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A speed=33.9x > > _Float16 f16c accelerated (Zen2, --cpu=znver2): > frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A speed=78.6x > > classic half2float full software implementation: > frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A speed=64.2x > > Hence an additional check was introduced, that only enables use of > _Float16 on x86 if f16c is being utilized. > > On aarch64, a similar uplift in performance is seen: > > RPi4 half2float full software implementation: > frame= 6088 fps=126 q=-0.0 Lsize=N/A time=00:04:03.48 bitrate=N/A speed=5.06x > > RPi4 _Float16: > frame= 6103 fps=158 q=-0.0 Lsize=N/A time=00:04:04.08 bitrate=N/A speed=6.32x > > Since arm/aarch64 always natively support 16-bit floats, it can always > be considered fast there. > > I'm not aware of any additional platforms that currently support > _Float16. And if there are, they should be considered non-fast until > proven fast. 
> --- > configure | 13 +++++++++++++ > libavutil/float2half.c | 2 ++ > libavutil/float2half.h | 16 ++++++++++++++++ > libavutil/half2float.c | 4 ++++ > libavutil/half2float.h | 16 ++++++++++++++++ > 5 files changed, 51 insertions(+) > > diff --git a/configure b/configure > index 6761d0cb32..6ede9a5a8f 100755 > --- a/configure > +++ b/configure > @@ -2143,6 +2143,8 @@ ARCH_FEATURES=" > fast_64bit > fast_clz > fast_cmov > + fast_float16 > + float16 If HAVE_FLOAT16 is not going to be used, then don't export it here. Leave it as a configure internal variable. > local_aligned > simd_align_16 > simd_align_32 > @@ -5125,6 +5127,8 @@ elif enabled arm; then > ;; > esac > > + test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee > + > elif enabled avr32; then > > case $cpu in > @@ -6229,6 +6233,15 @@ check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval; __sync > check_builtin gmtime_r time.h "time_t *time; struct tm *tm; gmtime_r(time, tm)" > check_builtin localtime_r time.h "time_t *time; struct tm *tm; localtime_r(time, tm)" > > +check_builtin float16 "" "_Float16 f16var" > +if enabled float16; then > + if enabled x86; then > + test_cpp_condition stddef.h "defined(__F16C__)" && enable fast_float16 > + elif enabled arm || enabled aarch64; then > + enable fast_float16 > + fi > +fi > + > case "$custom_allocator" in > jemalloc) > # jemalloc by default does not use a prefix > diff --git a/libavutil/float2half.c b/libavutil/float2half.c > index dba14cef5d..7002612194 100644 > --- a/libavutil/float2half.c > +++ b/libavutil/float2half.c > @@ -20,6 +20,7 @@ > > void ff_init_float2half_tables(float2half_tables *t) > { > +#if !HAVE_FAST_FLOAT16 > for (int i = 0; i < 256; i++) { > int e = i - 127; > > @@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t) > t->shifttable[i|0x100] = 13; > } > } > +#endif > } > diff --git a/libavutil/float2half.h b/libavutil/float2half.h > index b8c9cdfc4f..437666966b 100644 > --- a/libavutil/float2half.h > 
+++ b/libavutil/float2half.h > @@ -20,21 +20,37 @@ > #define AVUTIL_FLOAT2HALF_H > > #include <stdint.h> > +#include "intfloat.h" > + > +#include "config.h" > > typedef struct float2half_tables { > +#if HAVE_FAST_FLOAT16 > + uint8_t dummy; > +#else > uint16_t basetable[512]; > uint8_t shifttable[512]; > +#endif > } float2half_tables; > > void ff_init_float2half_tables(float2half_tables *t); > > static inline uint16_t float2half(uint32_t f, const float2half_tables *t) > { > +#if HAVE_FAST_FLOAT16 > + union { > + _Float16 f; > + uint16_t i; > + } u; > + u.f = av_int2float(f); > + return u.i; > +#else > uint16_t h; > > h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> t->shifttable[(f >> 23) & 0x1ff]); > > return h; > +#endif > } > > #endif /* AVUTIL_FLOAT2HALF_H */ > diff --git a/libavutil/half2float.c b/libavutil/half2float.c > index baac8e4093..ff198a8187 100644 > --- a/libavutil/half2float.c > +++ b/libavutil/half2float.c > @@ -18,6 +18,7 @@ > > #include "libavutil/half2float.h" > > +#if !HAVE_FAST_FLOAT16 > static uint32_t convertmantissa(uint32_t i) > { > int32_t m = i << 13; // Zero pad mantissa bits > @@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i) > > return m | e; // Return combined number > } > +#endif > > void ff_init_half2float_tables(half2float_tables *t) > { > +#if !HAVE_FAST_FLOAT16 > t->mantissatable[0] = 0; > for (int i = 1; i < 1024; i++) > t->mantissatable[i] = convertmantissa(i); > @@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t) > t->offsettable[31] = 2048; > t->offsettable[32] = 0; > t->offsettable[63] = 2048; > +#endif > } > diff --git a/libavutil/half2float.h b/libavutil/half2float.h > index cb58e44a1c..57ee8372fe 100644 > --- a/libavutil/half2float.h > +++ b/libavutil/half2float.h > @@ -20,22 +20,38 @@ > #define AVUTIL_HALF2FLOAT_H > > #include <stdint.h> > +#include "intfloat.h" > + > +#include "config.h" > > typedef struct half2float_tables { > +#if HAVE_FAST_FLOAT16 > + uint8_t dummy; > +#else 
> uint32_t mantissatable[3072]; > uint32_t exponenttable[64]; > uint16_t offsettable[64]; > +#endif > } half2float_tables; > > void ff_init_half2float_tables(half2float_tables *t); > > static inline uint32_t half2float(uint16_t h, const half2float_tables *t) > { > +#if HAVE_FAST_FLOAT16 > + union { > + _Float16 f; > + uint16_t i; > + } u; > + u.i = h; > + return av_float2int(u.f); > +#else > uint32_t f; > > f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + t->exponenttable[h >> 10]; > > return f; > +#endif > } > > #endif /* AVUTIL_HALF2FLOAT_H */
On 11/08/2022 02:14, James Almer wrote: > On 8/10/2022 7:51 PM, Timo Rothenpieler wrote: >> _Float16 support was available on arm/aarch64 for a while, and with gcc >> 12 was enabled on x86 as long as SSE2 is supported. >> >> If the target arch supports f16c, gcc emits fairly efficient assembly, >> taking advantage of it. This is the case on x86-64-v3 or higher. >> Same goes on arm, which has native float16 support. >> On x86, without f16c, it emulates it in software using sse2 instructions. >> >> This has been shown to perform rather poorly: >> >> _Float16 full SSE2 emulation: >> frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A >> speed=33.9x >> >> _Float16 f16c accelerated (Zen2, --cpu=znver2): >> frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A >> speed=78.6x >> >> classic half2float full software implementation: >> frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A >> speed=64.2x >> >> Hence an additional check was introduced, that only enables use of >> _Float16 on x86 if f16c is being utilized. >> >> On aarch64, a similar uplift in performance is seen: >> >> RPi4 half2float full software implementation: >> frame= 6088 fps=126 q=-0.0 Lsize=N/A time=00:04:03.48 bitrate=N/A >> speed=5.06x >> >> RPi4 _Float16: >> frame= 6103 fps=158 q=-0.0 Lsize=N/A time=00:04:04.08 bitrate=N/A >> speed=6.32x >> >> Since arm/aarch64 always natively support 16-bit floats, it can always >> be considered fast there. >> >> I'm not aware of any additional platforms that currently support >> _Float16. And if there are, they should be considered non-fast until >> proven fast. 
>> --- >> configure | 13 +++++++++++++ >> libavutil/float2half.c | 2 ++ >> libavutil/float2half.h | 16 ++++++++++++++++ >> libavutil/half2float.c | 4 ++++ >> libavutil/half2float.h | 16 ++++++++++++++++ >> 5 files changed, 51 insertions(+) >> >> diff --git a/configure b/configure >> index 6761d0cb32..6ede9a5a8f 100755 >> --- a/configure >> +++ b/configure >> @@ -2143,6 +2143,8 @@ ARCH_FEATURES=" >> fast_64bit >> fast_clz >> fast_cmov >> + fast_float16 >> + float16 > > If HAVE_FLOAT16 is not going to be used, then don't export it here. > Leave it as a configure internal variable. > Good point, fixed locally. >> local_aligned >> simd_align_16 >> simd_align_32 >> @@ -5125,6 +5127,8 @@ elif enabled arm; then >> ;; >> esac >> + test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee >> + >> elif enabled avr32; then >> case $cpu in >> @@ -6229,6 +6233,15 @@ check_builtin sync_val_compare_and_swap "" "int >> *ptr; int oldval, newval; __sync >> check_builtin gmtime_r time.h "time_t *time; struct tm *tm; >> gmtime_r(time, tm)" >> check_builtin localtime_r time.h "time_t *time; struct tm *tm; >> localtime_r(time, tm)" >> +check_builtin float16 "" "_Float16 f16var" >> +if enabled float16; then >> + if enabled x86; then >> + test_cpp_condition stddef.h "defined(__F16C__)" && enable >> fast_float16 >> + elif enabled arm || enabled aarch64; then >> + enable fast_float16 >> + fi >> +fi >> + >> case "$custom_allocator" in >> jemalloc) >> # jemalloc by default does not use a prefix >> diff --git a/libavutil/float2half.c b/libavutil/float2half.c >> index dba14cef5d..7002612194 100644 >> --- a/libavutil/float2half.c >> +++ b/libavutil/float2half.c >> @@ -20,6 +20,7 @@ >> void ff_init_float2half_tables(float2half_tables *t) >> { >> +#if !HAVE_FAST_FLOAT16 >> for (int i = 0; i < 256; i++) { >> int e = i - 127; >> @@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t) >> t->shifttable[i|0x100] = 13; >> } >> } >> +#endif >> } >> diff --git a/libavutil/float2half.h 
b/libavutil/float2half.h >> index b8c9cdfc4f..437666966b 100644 >> --- a/libavutil/float2half.h >> +++ b/libavutil/float2half.h >> @@ -20,21 +20,37 @@ >> #define AVUTIL_FLOAT2HALF_H >> #include <stdint.h> >> +#include "intfloat.h" >> + >> +#include "config.h" >> typedef struct float2half_tables { >> +#if HAVE_FAST_FLOAT16 >> + uint8_t dummy; >> +#else >> uint16_t basetable[512]; >> uint8_t shifttable[512]; >> +#endif >> } float2half_tables; >> void ff_init_float2half_tables(float2half_tables *t); >> static inline uint16_t float2half(uint32_t f, const >> float2half_tables *t) >> { >> +#if HAVE_FAST_FLOAT16 >> + union { >> + _Float16 f; >> + uint16_t i; >> + } u; >> + u.f = av_int2float(f); >> + return u.i; >> +#else >> uint16_t h; >> h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> >> t->shifttable[(f >> 23) & 0x1ff]); >> return h; >> +#endif >> } >> #endif /* AVUTIL_FLOAT2HALF_H */ >> diff --git a/libavutil/half2float.c b/libavutil/half2float.c >> index baac8e4093..ff198a8187 100644 >> --- a/libavutil/half2float.c >> +++ b/libavutil/half2float.c >> @@ -18,6 +18,7 @@ >> #include "libavutil/half2float.h" >> +#if !HAVE_FAST_FLOAT16 >> static uint32_t convertmantissa(uint32_t i) >> { >> int32_t m = i << 13; // Zero pad mantissa bits >> @@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i) >> return m | e; // Return combined number >> } >> +#endif >> void ff_init_half2float_tables(half2float_tables *t) >> { >> +#if !HAVE_FAST_FLOAT16 >> t->mantissatable[0] = 0; >> for (int i = 1; i < 1024; i++) >> t->mantissatable[i] = convertmantissa(i); >> @@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t) >> t->offsettable[31] = 2048; >> t->offsettable[32] = 0; >> t->offsettable[63] = 2048; >> +#endif >> } >> diff --git a/libavutil/half2float.h b/libavutil/half2float.h >> index cb58e44a1c..57ee8372fe 100644 >> --- a/libavutil/half2float.h >> +++ b/libavutil/half2float.h >> @@ -20,22 +20,38 @@ >> #define AVUTIL_HALF2FLOAT_H >> #include <stdint.h> 
>> +#include "intfloat.h" >> + >> +#include "config.h" >> typedef struct half2float_tables { >> +#if HAVE_FAST_FLOAT16 >> + uint8_t dummy; >> +#else >> uint32_t mantissatable[3072]; >> uint32_t exponenttable[64]; >> uint16_t offsettable[64]; >> +#endif >> } half2float_tables; >> void ff_init_half2float_tables(half2float_tables *t); >> static inline uint32_t half2float(uint16_t h, const >> half2float_tables *t) >> { >> +#if HAVE_FAST_FLOAT16 >> + union { >> + _Float16 f; >> + uint16_t i; >> + } u; >> + u.i = h; >> + return av_float2int(u.f); >> +#else >> uint32_t f; >> f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + >> t->exponenttable[h >> 10]; >> return f; >> +#endif >> } >> #endif /* AVUTIL_HALF2FLOAT_H */ > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
diff --git a/configure b/configure index 6761d0cb32..6ede9a5a8f 100755 --- a/configure +++ b/configure @@ -2143,6 +2143,8 @@ ARCH_FEATURES=" fast_64bit fast_clz fast_cmov + fast_float16 + float16 local_aligned simd_align_16 simd_align_32 @@ -5125,6 +5127,8 @@ elif enabled arm; then ;; esac + test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee + elif enabled avr32; then case $cpu in @@ -6229,6 +6233,15 @@ check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval; __sync check_builtin gmtime_r time.h "time_t *time; struct tm *tm; gmtime_r(time, tm)" check_builtin localtime_r time.h "time_t *time; struct tm *tm; localtime_r(time, tm)" +check_builtin float16 "" "_Float16 f16var" +if enabled float16; then + if enabled x86; then + test_cpp_condition stddef.h "defined(__F16C__)" && enable fast_float16 + elif enabled arm || enabled aarch64; then + enable fast_float16 + fi +fi + case "$custom_allocator" in jemalloc) # jemalloc by default does not use a prefix diff --git a/libavutil/float2half.c b/libavutil/float2half.c index dba14cef5d..7002612194 100644 --- a/libavutil/float2half.c +++ b/libavutil/float2half.c @@ -20,6 +20,7 @@ void ff_init_float2half_tables(float2half_tables *t) { +#if !HAVE_FAST_FLOAT16 for (int i = 0; i < 256; i++) { int e = i - 127; @@ -50,4 +51,5 @@ void ff_init_float2half_tables(float2half_tables *t) t->shifttable[i|0x100] = 13; } } +#endif } diff --git a/libavutil/float2half.h b/libavutil/float2half.h index b8c9cdfc4f..437666966b 100644 --- a/libavutil/float2half.h +++ b/libavutil/float2half.h @@ -20,21 +20,37 @@ #define AVUTIL_FLOAT2HALF_H #include <stdint.h> +#include "intfloat.h" + +#include "config.h" typedef struct float2half_tables { +#if HAVE_FAST_FLOAT16 + uint8_t dummy; +#else uint16_t basetable[512]; uint8_t shifttable[512]; +#endif } float2half_tables; void ff_init_float2half_tables(float2half_tables *t); static inline uint16_t float2half(uint32_t f, const float2half_tables *t) { +#if HAVE_FAST_FLOAT16 + union { 
+ _Float16 f; + uint16_t i; + } u; + u.f = av_int2float(f); + return u.i; +#else uint16_t h; h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> t->shifttable[(f >> 23) & 0x1ff]); return h; +#endif } #endif /* AVUTIL_FLOAT2HALF_H */ diff --git a/libavutil/half2float.c b/libavutil/half2float.c index baac8e4093..ff198a8187 100644 --- a/libavutil/half2float.c +++ b/libavutil/half2float.c @@ -18,6 +18,7 @@ #include "libavutil/half2float.h" +#if !HAVE_FAST_FLOAT16 static uint32_t convertmantissa(uint32_t i) { int32_t m = i << 13; // Zero pad mantissa bits @@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i) return m | e; // Return combined number } +#endif void ff_init_half2float_tables(half2float_tables *t) { +#if !HAVE_FAST_FLOAT16 t->mantissatable[0] = 0; for (int i = 1; i < 1024; i++) t->mantissatable[i] = convertmantissa(i); @@ -60,4 +63,5 @@ void ff_init_half2float_tables(half2float_tables *t) t->offsettable[31] = 2048; t->offsettable[32] = 0; t->offsettable[63] = 2048; +#endif } diff --git a/libavutil/half2float.h b/libavutil/half2float.h index cb58e44a1c..57ee8372fe 100644 --- a/libavutil/half2float.h +++ b/libavutil/half2float.h @@ -20,22 +20,38 @@ #define AVUTIL_HALF2FLOAT_H #include <stdint.h> +#include "intfloat.h" + +#include "config.h" typedef struct half2float_tables { +#if HAVE_FAST_FLOAT16 + uint8_t dummy; +#else uint32_t mantissatable[3072]; uint32_t exponenttable[64]; uint16_t offsettable[64]; +#endif } half2float_tables; void ff_init_half2float_tables(half2float_tables *t); static inline uint32_t half2float(uint16_t h, const half2float_tables *t) { +#if HAVE_FAST_FLOAT16 + union { + _Float16 f; + uint16_t i; + } u; + u.i = h; + return av_float2int(u.f); +#else uint32_t f; f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + t->exponenttable[h >> 10]; return f; +#endif } #endif /* AVUTIL_HALF2FLOAT_H */