[FFmpeg-devel,10/11] avcodec/blockdsp: add AVX-512 version of clear_block(s)

Submitted by James Darnley on Nov. 9, 2017, 11:58 a.m.

Details

Message ID 20171109115837.32618-11-jdarnley@obe.tv
State New
Headers show

Commit Message

James Darnley Nov. 9, 2017, 11:58 a.m.
From: James Darnley <james.darnley@gmail.com>

Also adjust alignment requirements where nessecary.
---
Whether this patch is committed or not the change to 4xm.c should be picked to
master because the alignment is wrong for the AVX version of this function.  I
assume it hasn't been noticed yet because it manages to be 32-byte aligned
without intervention.

 libavcodec/4xm.c               |  2 +-
 libavcodec/asv.h               |  3 ++-
 libavcodec/bink.c              |  4 ++--
 libavcodec/dnxhdenc.h          |  2 +-
 libavcodec/eamad.c             |  2 +-
 libavcodec/eatqi.c             |  2 +-
 libavcodec/g2meet.c            |  2 +-
 libavcodec/ituh263dec.c        |  2 +-
 libavcodec/mdec.c              |  2 +-
 libavcodec/mimic.c             |  2 +-
 libavcodec/mjpegdec.h          |  3 ++-
 libavcodec/proresdec2.c        |  6 +++---
 libavcodec/speedhq.c           |  2 +-
 libavcodec/wmv2.h              |  2 +-
 libavcodec/x86/blockdsp.asm    | 14 ++++++++++++++
 libavcodec/x86/blockdsp_init.c |  8 ++++++++
 libavutil/internal.h           |  6 ++++++
 tests/checkasm/blockdsp.c      |  4 ++--
 18 files changed, 49 insertions(+), 19 deletions(-)

Comments

Martin Vignali Nov. 9, 2017, 7:35 p.m.
2017-11-09 12:58 GMT+01:00 James Darnley <jdarnley@obe.tv>:

> From: James Darnley <james.darnley@gmail.com>
>
> Also adjust alignment requirements where nessecary.
> ---
> Whether this patch is committed or not the change to 4xm.c should be
> picked to
> master because the alignment is wrong for the AVX version of this
> function.  I
> assume it hasn't been noticed yet because it manages to be 32-byte aligned
> without intervention.
>
>
Thanks for fixing, the 4xm, i miss it in the avx patch

Just by curiosity : can you post the checkasm result (i can't test AVX512) ?

lgtm

Martin
James Almer Nov. 10, 2017, 2:36 p.m.
On 11/10/2017 10:28 AM, James Darnley wrote:
> On 2017-11-09 20:35, Martin Vignali wrote:
>> 2017-11-09 12:58 GMT+01:00 James Darnley <jdarnley@obe.tv>:
>>
>>> From: James Darnley <james.darnley@gmail.com>
>>>
>>> Also adjust alignment requirements where nessecary.
>>> ---
>>> Whether this patch is committed or not the change to 4xm.c should be
>>> picked to
>>> master because the alignment is wrong for the AVX version of this
>>> function.  I
>>> assume it hasn't been noticed yet because it manages to be 32-byte aligned
>>> without intervention.
>>>
>>>
>> Thanks for fixing, the 4xm, i miss it in the avx patch
>>
>> Just by curiosity : can you post the checkasm result (i can't test AVX512) ?
> 
> I certainly can.
> 
>> $ ./tests/checkasm/checkasm --bench --test=blockdsp
>> benchmarking with native FFmpeg timers
>> nop: 26.0
>> checkasm: using random seed 402373647
>> MMX:
>>  - blockdsp.blockdsp [OK]
>> SSE:
>>  - blockdsp.blockdsp [OK]
>> AVX:
>>  - blockdsp.blockdsp [OK]
>> AVX-512:
>>  - blockdsp.blockdsp [OK]
>> checkasm: all 8 tests passed
>> blockdsp.clear_block_c: 23.5
>> blockdsp.clear_block_mmx: 11.5
>> blockdsp.clear_block_sse: 5.5
>> blockdsp.clear_block_avx: 3.0
>> blockdsp.clear_block_avx512: 5.0

This sounds like it's not worth adding.

>> blockdsp.clear_blocks_c: 48.0
>> blockdsp.clear_blocks_mmx: 77.0
>> blockdsp.clear_blocks_sse: 38.0
>> blockdsp.clear_blocks_avx: 18.5
>> blockdsp.clear_blocks_avx512: 11.0

This one is better, but a perf run to check how much CPU time is spent
in this function is needed, because I'm not sure it's important enough
to justify having the CPU throttled just to run avx512 code...

Patch hide | download patch | download mbox

diff --git a/libavcodec/4xm.c b/libavcodec/4xm.c
index 5547dfd87f..a97f051232 100644
--- a/libavcodec/4xm.c
+++ b/libavcodec/4xm.c
@@ -145,7 +145,7 @@  typedef struct FourXContext {
     int mv[256];
     VLC pre_vlc;
     int last_dc;
-    DECLARE_ALIGNED(16, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
     void *bitstream_buffer;
     unsigned int bitstream_buffer_size;
     int version;
diff --git a/libavcodec/asv.h b/libavcodec/asv.h
index a1366b6fe4..024ffa349a 100644
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -35,6 +35,7 @@ 
 #include "bswapdsp.h"
 #include "fdctdsp.h"
 #include "idctdsp.h"
+#include "internal.h"
 #include "get_bits.h"
 #include "pixblockdsp.h"
 #include "put_bits.h"
@@ -54,7 +55,7 @@  typedef struct ASV1Context {
     int mb_height;
     int mb_width2;
     int mb_height2;
-    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
     uint16_t intra_matrix[64];
     int q_intra_matrix[64];
     uint8_t *bitstream_buffer;
diff --git a/libavcodec/bink.c b/libavcodec/bink.c
index c4cf617a8b..eee19512f7 100644
--- a/libavcodec/bink.c
+++ b/libavcodec/bink.c
@@ -818,7 +818,7 @@  static int binkb_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
     int v, col[2];
     const uint8_t *scan;
     int xoff, yoff;
-    LOCAL_ALIGNED_32(int16_t, block, [64]);
+    LOCAL_ALIGNED_64(int16_t, block, [64]);
     LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
     int coordmap[64];
     int ybias = is_key ? -15 : 0;
@@ -985,7 +985,7 @@  static int bink_decode_plane(BinkContext *c, AVFrame *frame, GetBitContext *gb,
     uint8_t *dst, *prev, *ref_start, *ref_end;
     int v, col[2];
     const uint8_t *scan;
-    LOCAL_ALIGNED_32(int16_t, block, [64]);
+    LOCAL_ALIGNED_64(int16_t, block, [64]);
     LOCAL_ALIGNED_16(uint8_t, ublock, [64]);
     LOCAL_ALIGNED_16(int32_t, dctblock, [64]);
     int coordmap[64], quant_idx, coef_count, coef_idx[64];
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
index 963821ac81..4567a4aa8a 100644
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -74,7 +74,7 @@  typedef struct DNXHDEncContext {
     unsigned min_padding;
     int intra_quant_bias;
 
-    DECLARE_ALIGNED(32, int16_t, blocks)[12][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, blocks)[12][64];
     DECLARE_ALIGNED(16, uint8_t, edge_buf_y)[512]; // has to hold 16x16 uint16 when depth=10
     DECLARE_ALIGNED(16, uint8_t, edge_buf_uv)[2][512]; // has to hold 16x16 uint16_t when depth=10
 
diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c
index 7f28abbafe..0f638b43f6 100644
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -54,7 +54,7 @@  typedef struct MadContext {
     GetBitContext gb;
     void *bitstream_buf;
     unsigned int bitstream_buf_size;
-    DECLARE_ALIGNED(32, int16_t, block)[64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[64];
     ScanTable scantable;
     uint16_t quant_matrix[64];
     int mb_x;
diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c
index 1a847a35da..9527bc2713 100644
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -51,7 +51,7 @@  typedef struct TqiContext {
     uint16_t intra_matrix[64];
     int last_dc[3];
 
-    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
 } TqiContext;
 
 static av_cold int tqi_decode_init(AVCodecContext *avctx)
diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c
index 842095ba3b..57ff92c002 100644
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -122,7 +122,7 @@  typedef struct JPGContext {
 
     VLC        dc_vlc[2], ac_vlc[2];
     int        prev_dc[3];
-    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
 
     uint8_t    *buf;
 } JPGContext;
diff --git a/libavcodec/ituh263dec.c b/libavcodec/ituh263dec.c
index fc95a532ce..c5079fe30d 100644
--- a/libavcodec/ituh263dec.c
+++ b/libavcodec/ituh263dec.c
@@ -574,7 +574,7 @@  not_coded:
 
 static int h263_skip_b_part(MpegEncContext *s, int cbp)
 {
-    LOCAL_ALIGNED_32(int16_t, dblock, [64]);
+    LOCAL_ALIGNED_64(int16_t, dblock, [64]);
     int i, mbi;
     int bli[6];
 
diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c
index 330b761279..51708cdf34 100644
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -48,7 +48,7 @@  typedef struct MDECContext {
     int mb_width;
     int mb_height;
     int mb_x, mb_y;
-    DECLARE_ALIGNED(32, int16_t, block)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[6][64];
     DECLARE_ALIGNED(16, uint16_t, quant_matrix)[64];
     uint8_t *bitstream_buffer;
     unsigned int bitstream_buffer_size;
diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c
index 1d463e9962..5133860f55 100644
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -49,7 +49,7 @@  typedef struct MimicContext {
 
     ThreadFrame     frames     [16];
 
-    DECLARE_ALIGNED(32, int16_t, dct_block)[64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, dct_block)[64];
 
     GetBitContext   gb;
     ScanTable       scantable;
diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h
index c84a40aa6e..5b1d6c8f94 100644
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -38,6 +38,7 @@ 
 #include "get_bits.h"
 #include "hpeldsp.h"
 #include "idctdsp.h"
+#include "internal.h"
 
 #define MAX_COMPONENTS 4
 
@@ -98,7 +99,7 @@  typedef struct MJpegDecodeContext {
     int got_picture;                                ///< we found a SOF and picture is valid, too.
     int linesize[MAX_COMPONENTS];                   ///< linesize << interlaced
     int8_t *qscale_table;
-    DECLARE_ALIGNED(32, int16_t, block)[64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, block)[64];
     int16_t (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode)
     uint8_t *last_nnz[MAX_COMPONENTS];
     uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
diff --git a/libavcodec/proresdec2.c b/libavcodec/proresdec2.c
index e9e0153ee9..662b92d1b7 100644
--- a/libavcodec/proresdec2.c
+++ b/libavcodec/proresdec2.c
@@ -368,7 +368,7 @@  static int decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
                              const int16_t *qmat)
 {
     ProresContext *ctx = avctx->priv_data;
-    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    LOCAL_ALIGNED_64(int16_t, blocks, [8*4*64]);
     int16_t *block;
     GetBitContext gb;
     int i, blocks_per_slice = slice->mb_count<<2;
@@ -402,7 +402,7 @@  static int decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
                                const int16_t *qmat, int log2_blocks_per_mb)
 {
     ProresContext *ctx = avctx->priv_data;
-    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    LOCAL_ALIGNED_64(int16_t, blocks, [8*4*64]);
     int16_t *block;
     GetBitContext gb;
     int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb;
@@ -485,7 +485,7 @@  static void decode_slice_alpha(ProresContext *ctx,
 {
     GetBitContext gb;
     int i;
-    LOCAL_ALIGNED_32(int16_t, blocks, [8*4*64]);
+    LOCAL_ALIGNED_64(int16_t, blocks, [8*4*64]);
     int16_t *block;
 
     for (i = 0; i < blocks_per_slice<<2; i++)
diff --git a/libavcodec/speedhq.c b/libavcodec/speedhq.c
index 6d3487ca19..1f70602acc 100644
--- a/libavcodec/speedhq.c
+++ b/libavcodec/speedhq.c
@@ -224,7 +224,7 @@  static inline int decode_dct_block(const SHQContext *s, GetBitContext *gb, int l
 {
     const int *quant_matrix = s->quant_matrix;
     const uint8_t *scantable = s->intra_scantable.permutated;
-    LOCAL_ALIGNED_32(int16_t, block, [64]);
+    LOCAL_ALIGNED_64(int16_t, block, [64]);
     int dc_offset;
 
     s->bdsp.clear_block(block);
diff --git a/libavcodec/wmv2.h b/libavcodec/wmv2.h
index 0f459ae5ae..eb3e808ba2 100644
--- a/libavcodec/wmv2.h
+++ b/libavcodec/wmv2.h
@@ -51,7 +51,7 @@  typedef struct Wmv2Context {
     int hshift;
 
     ScanTable abt_scantable[2];
-    DECLARE_ALIGNED(32, int16_t, abt_block2)[6][64];
+    DECLARE_ALIGNED(STRIDE_ALIGN, int16_t, abt_block2)[6][64];
 } Wmv2Context;
 
 void ff_wmv2_common_init(Wmv2Context *w);
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
index 9d0e8a3242..930ef7c833 100644
--- a/libavcodec/x86/blockdsp.asm
+++ b/libavcodec/x86/blockdsp.asm
@@ -52,6 +52,11 @@  CLEAR_BLOCK 1, 8
 INIT_YMM avx
 CLEAR_BLOCK 1, 4
 
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+CLEAR_BLOCK 1, 2
+%endif
+
 ;-----------------------------------------
 ; void ff_clear_blocks(int16_t *blocks);
 ;-----------------------------------------
@@ -68,9 +73,13 @@  cglobal clear_blocks, 1, 2, %1, blocks, len
     mova  [blocksq+lenq+mmsize*3], m0
     mova  [blocksq+lenq+mmsize*4], m0
     mova  [blocksq+lenq+mmsize*5], m0
+%if mmsize != 64
     mova  [blocksq+lenq+mmsize*6], m0
     mova  [blocksq+lenq+mmsize*7], m0
     add   lenq, mmsize*8
+%else
+    add   lenq, mmsize*6
+%endif
     js .loop
     RET
 %endmacro
@@ -83,3 +92,8 @@  INIT_XMM sse
 CLEAR_BLOCKS 1
 INIT_YMM avx
 CLEAR_BLOCKS 1
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+CLEAR_BLOCKS 1
+%endif
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
index 8b01a447cd..2bcf5320d7 100644
--- a/libavcodec/x86/blockdsp_init.c
+++ b/libavcodec/x86/blockdsp_init.c
@@ -29,9 +29,11 @@ 
 void ff_clear_block_mmx(int16_t *block);
 void ff_clear_block_sse(int16_t *block);
 void ff_clear_block_avx(int16_t *block);
+void ff_clear_block_avx512(int16_t *block);
 void ff_clear_blocks_mmx(int16_t *blocks);
 void ff_clear_blocks_sse(int16_t *blocks);
 void ff_clear_blocks_avx(int16_t *blocks);
+void ff_clear_blocks_avx512(int16_t *blocks);
 
 av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
                                   AVCodecContext *avctx)
@@ -56,5 +58,11 @@  av_cold void ff_blockdsp_init_x86(BlockDSPContext *c,
         c->clear_block  = ff_clear_block_avx;
         c->clear_blocks = ff_clear_blocks_avx;
     }
+
+    if (EXTERNAL_AVX512(cpu_flags)) {
+        c->clear_block  = ff_clear_block_avx512;
+        c->clear_blocks = ff_clear_blocks_avx512;
+    }
+
 #endif /* HAVE_X86ASM */
 }
diff --git a/libavutil/internal.h b/libavutil/internal.h
index 6f92f71e8e..40c76a1fc4 100644
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -137,6 +137,12 @@ 
 #   define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_A(32, t, v, __VA_ARGS__,,))
 #endif
 
+#if HAVE_LOCAL_ALIGNED
+#   define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_D(64, t, v, __VA_ARGS__,,))
+#else
+#   define LOCAL_ALIGNED_64(t, v, ...) E1(LOCAL_ALIGNED_A(64, t, v, __VA_ARGS__,,))
+#endif
+
 #define FF_ALLOC_OR_GOTO(ctx, p, size, label)\
 {\
     p = av_malloc(size);\
diff --git a/tests/checkasm/blockdsp.c b/tests/checkasm/blockdsp.c
index c753506b3c..2344dbdbed 100644
--- a/tests/checkasm/blockdsp.c
+++ b/tests/checkasm/blockdsp.c
@@ -53,8 +53,8 @@  do {                                                                \
 
 void checkasm_check_blockdsp(void)
 {
-    LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
-    LOCAL_ALIGNED_32(uint16_t, buf1, [6 * 8 * 8]);
+    LOCAL_ALIGNED_64(uint16_t, buf0, [6 * 8 * 8]);
+    LOCAL_ALIGNED_64(uint16_t, buf1, [6 * 8 * 8]);
 
     AVCodecContext avctx = { 0 };
     BlockDSPContext h;