
[FFmpeg-devel,v4,2/2] dnn_backend_native_layer_conv2d.c: Add multithread function

Message ID 20200904150511.5789-2-xujunzz@sjtu.edu.cn
State Superseded
Headers show
Series [FFmpeg-devel,v4,1/2] dnn_backend_native.c: parse options in native backend | expand

Checks

Context Check Description
andriy/default pending
andriy/make success Make finished
andriy/make_fate success Make fate finished

Commit Message

Xu Jun Sept. 4, 2020, 3:05 p.m. UTC
From: Xu Jun <xujunzz@sjtu.edu.cn>

Use pthread to multithread dnn_execute_layer_conv2d.
Can be tested with command "./ffmpeg_g -i input.png -vf \
format=yuvj420p,dnn_processing=dnn_backend=native:model= \
espcn.model:input=x:output=y:options=conv2d_threads=23 \
 -y sr_native.jpg -benchmark"

before patch: utime=11.238s stime=0.005s rtime=11.248s
after patch:  utime=20.817s stime=0.047s rtime=1.051s
on my 3900X 12c24t @4.2GHz

The increase in utime is expected: HyperThreading doubles the number of
logical cores relative to physical cores, but the extra logical cores
improve compute throughput by less than a factor of two, and utime sums
the runtime of all logical cores. As a result, using a thread count
close to the number of logical cores roughly doubles utime on
HyperThreading CPUs, while rtime improves by less than a further 2x.

Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
---
v2: add check for HAVE_PTHREAD_CANCEL and modify FATE test
dnn-layer-conv2d-test.c
v4: use extern to call dnn_native_class in dnn-layer-conv2d-test.c

 .../dnn/dnn_backend_native_layer_conv2d.c     | 107 ++++++++++++++++--
 tests/dnn/dnn-layer-conv2d-test.c             |  14 ++-
 2 files changed, 108 insertions(+), 13 deletions(-)
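
A rough sanity check on the utime/rtime numbers above (back-of-the-envelope
arithmetic only, assuming the 3900X's 24 logical cores are fully available
to the process):

    rtime speedup     = 11.248 / 1.051  ~ 10.7x
    avg. busy cores   = 20.817 / 1.051  ~ 19.8 logical cores
    utime growth      = 20.817 / 11.238 ~ 1.85x  (close to the 2x described above)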

Comments

Steven Liu Sept. 4, 2020, 10:07 p.m. UTC | #1
<xujunzz@sjtu.edu.cn> wrote on Friday, September 4, 2020 at 11:09 PM:
>
> From: Xu Jun <xujunzz@sjtu.edu.cn>
>
> Use pthread to multithread dnn_execute_layer_conv2d.
> Can be tested with command "./ffmpeg_g -i input.png -vf \
> format=yuvj420p,dnn_processing=dnn_backend=native:model= \
> espcn.model:input=x:output=y:options=conv2d_threads=23 \
>  -y sr_native.jpg -benchmark"
>
> before patch: utime=11.238s stime=0.005s rtime=11.248s
> after patch:  utime=20.817s stime=0.047s rtime=1.051s
> on my 3900X 12c24t @4.2GHz
>
> The increase in utime is expected: HyperThreading doubles the number of
> logical cores relative to physical cores, but the extra logical cores
> improve compute throughput by less than a factor of two, and utime sums
> the runtime of all logical cores. As a result, using a thread count
> close to the number of logical cores roughly doubles utime on
> HyperThreading CPUs, while rtime improves by less than a further 2x.
>
> Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
> ---
> v2: add check for HAVE_PTHREAD_CANCEL and modify FATE test
> dnn-layer-conv2d-test.c
> v4: use extern to call dnn_native_class in dnn-layer-conv2d-test.c
>
>  .../dnn/dnn_backend_native_layer_conv2d.c     | 107 ++++++++++++++++--
>  tests/dnn/dnn-layer-conv2d-test.c             |  14 ++-
>  2 files changed, 108 insertions(+), 13 deletions(-)
>
> diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
> index d079795bf8..4068a13ab4 100644
> --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
> +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
> @@ -19,10 +19,27 @@
>   */
>
>  #include "libavutil/avassert.h"
> +#include "libavutil/thread.h"
> +#include "libavutil/cpu.h"
>  #include "dnn_backend_native_layer_conv2d.h"
>
>  #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
>
> +//struct to pass parameters
> +typedef struct thread_common_param{
> +    DnnOperand *operands;
> +    const int32_t *input_operand_indexes;
> +    int32_t output_operand_index;
> +    const void *parameters;
> +    NativeContext *ctx;
> +    int thread_num;
> +} thread_common_param;
> +
> +typedef struct thread_param{
> +    thread_common_param *thread_common_param;
> +    int thread_index;
> +} thread_param;
> +
>  int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
>  {
>      ConvolutionalParams *conv_params;
> @@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
>      return dnn_size;
>  }
>
> -int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
> -                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
> +static void * dnn_execute_layer_conv2d_thread(void *threadarg)
>  {
> +    //pass parameters
> +    thread_param *thread_param = (struct thread_param *)threadarg;
> +    thread_common_param *thread_common_param = thread_param->thread_common_param;
> +    DnnOperand *operands = thread_common_param->operands;
>      float *output;
> -    int32_t input_operand_index = input_operand_indexes[0];
> +    int32_t input_operand_index = thread_common_param->input_operand_indexes[0];
>      int number = operands[input_operand_index].dims[0];
>      int height = operands[input_operand_index].dims[1];
>      int width = operands[input_operand_index].dims[2];
>      int channel = operands[input_operand_index].dims[3];
>      const float *input = operands[input_operand_index].data;
> -    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters;
> +    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_common_param->parameters);
>
>      int radius = conv_params->kernel_size >> 1;
>      int src_linesize = width * conv_params->input_num;
> @@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
>      int filter_size = conv_params->kernel_size * filter_linesize;
>      int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
>
> -    DnnOperand *output_operand = &operands[output_operand_index];
> +    int thread_stride = (height - pad_size * 2) / thread_common_param->thread_num;
> +    int thread_start = thread_stride * thread_param->thread_index + pad_size;
> +    int thread_end = (thread_param->thread_index == thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride);
> +
> +    DnnOperand *output_operand = &operands[thread_common_param->output_operand_index];
>      output_operand->dims[0] = number;
>      output_operand->dims[1] = height - pad_size * 2;
>      output_operand->dims[2] = width - pad_size * 2;
> @@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
>      output_operand->data_type = operands[input_operand_index].data_type;
>      output_operand->length = calculate_operand_data_length(output_operand);
>      if (output_operand->length <= 0) {
> -        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
> -        return DNN_ERROR;
> +        av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length overflow\n");
> +        return (void *)DNN_ERROR;
>      }
>      output_operand->data = av_realloc(output_operand->data, output_operand->length);
>      if (!output_operand->data) {
> -        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
> -        return DNN_ERROR;
> +        av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
> +        return (void *)DNN_ERROR;
>      }
> +
>      output = output_operand->data;
> +    output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
>
>      av_assert0(channel == conv_params->input_num);
>
> -    for (int y = pad_size; y < height - pad_size; ++y) {
> +    for (int y = thread_start; y < thread_end; ++y) {
>          for (int x = pad_size; x < width - pad_size; ++x) {
>              for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
>                  if (conv_params->has_bias)
> @@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
>              output += conv_params->output_num;
>          }
>      }
> -    return 0;
> +    return (void *)0;
Why do you return (void *)0 here? I see that dnn_execute_layer_conv2d has an int return type.
> +}
> +
> +
> +int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
> +                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
> +{
> +    int thread_num = (ctx->options.conv2d_threads <= 0 || ctx->options.conv2d_threads > av_cpu_count())
> +        ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads);
> +#if HAVE_PTHREAD_CANCEL
> +    pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
> +#endif
> +    thread_param **thread_param = av_malloc(thread_num * sizeof(*thread_param));
> +    void *res;
> +    int error_flag = 0;
> +
> +    //struct used to pass parameters
> +    thread_common_param thread_common_param;
> +    thread_common_param.operands = operands;
> +    thread_common_param.input_operand_indexes = input_operand_indexes;
> +    thread_common_param.output_operand_index = output_operand_index;
> +    thread_common_param.parameters = parameters;
> +    thread_common_param.ctx = ctx;
> +#if HAVE_PTHREAD_CANCEL
> +    thread_common_param.thread_num = thread_num;
> +
> +    //create threads
> +    for (int i = 0; i < thread_num; i++){
> +        thread_param[i] = av_malloc(sizeof(thread_param));
> +        thread_param[i]->thread_common_param = &thread_common_param;
> +        thread_param[i]->thread_index = i;
> +        pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_param[i]);
> +    }
> +
> +    //join threads, res gets function return
> +    for (int i = 0; i < thread_num; i++){
> +        pthread_join(thread_id[i], &res);
> +        if ((int)res != 0)
> +            error_flag = (int)res;
> +    }
> +
> +    //release memory
> +    av_free(thread_id);
> +
> +    for (int i = 0; i < thread_num; i++){
> +        av_free(thread_param[i]);
> +    }
> +#else
> +    thread_common_param.thread_num = 1;
> +    thread_param[0] = av_malloc(sizeof(thread_param));
> +    thread_param[0]->thread_common_param = &thread_common_param;
> +    thread_param[0]->thread_index = 0;
> +    res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]);
> +    if ((int)res != 0)
> +        error_flag = (int)res;
> +    av_free(thread_param[0]);
> +#endif
> +
> +    av_free(thread_param);
> +    return error_flag;
>  }
> diff --git a/tests/dnn/dnn-layer-conv2d-test.c b/tests/dnn/dnn-layer-conv2d-test.c
> index 836839cc64..378a05eafc 100644
> --- a/tests/dnn/dnn-layer-conv2d-test.c
> +++ b/tests/dnn/dnn-layer-conv2d-test.c
> @@ -25,6 +25,8 @@
>
>  #define EPSON 0.00001
>
> +extern const AVClass dnn_native_class;
> +
>  static int test_with_same_dilate(void)
>  {
>      // the input data and expected data are generated with below python code.
> @@ -96,6 +98,10 @@ static int test_with_same_dilate(void)
>      };
>      float bias[2] = { -1.6574852, -0.72915393 };
>
> +    NativeContext ctx;
> +    ctx.class = &dnn_native_class;
> +    ctx.options.conv2d_threads = 1;
> +
>      params.activation = TANH;
>      params.has_bias = 1;
>      params.biases = bias;
> @@ -114,7 +120,7 @@ static int test_with_same_dilate(void)
>      operands[1].data = NULL;
>
>      input_indexes[0] = 0;
> -    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
> +    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
>
>      output = operands[1].data;
>      for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
> @@ -196,6 +202,10 @@ static int test_with_valid(void)
>      };
>      float bias[2] = { -0.4773722, -0.19620377 };
>
> +    NativeContext ctx;
> +    ctx.class = &dnn_native_class;
> +    ctx.options.conv2d_threads = 1;
> +
>      params.activation = TANH;
>      params.has_bias = 1;
>      params.biases = bias;
> @@ -214,7 +224,7 @@ static int test_with_valid(void)
>      operands[1].data = NULL;
>
>      input_indexes[0] = 0;
> -    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
> +    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
>
>      output = operands[1].data;
>      for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
> --
> 2.28.0
>


Thanks
Steven
Xu Jun Sept. 6, 2020, 12:19 p.m. UTC | #2
Hi, Steven

----- Original Message -----
> From: "Steven Liu" <lingjiujianke@gmail.com>
> To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org>
> Sent: Saturday, September 5, 2020 6:07:45 AM
> Subject: Re: [FFmpeg-devel] [PATCH v4 2/2] dnn_backend_native_layer_conv2d.c: Add multithread function

> <xujunzz@sjtu.edu.cn> wrote on Friday, September 4, 2020 at 11:09 PM:
>>
>> From: Xu Jun <xujunzz@sjtu.edu.cn>
>>
>> Use pthread to multithread dnn_execute_layer_conv2d.
>> Can be tested with command "./ffmpeg_g -i input.png -vf \
>> format=yuvj420p,dnn_processing=dnn_backend=native:model= \
>> espcn.model:input=x:output=y:options=conv2d_threads=23 \
>>  -y sr_native.jpg -benchmark"
>>
>> before patch: utime=11.238s stime=0.005s rtime=11.248s
>> after patch:  utime=20.817s stime=0.047s rtime=1.051s
>> on my 3900X 12c24t @4.2GHz
>>
>> The increase in utime is expected: HyperThreading doubles the number of
>> logical cores relative to physical cores, but the extra logical cores
>> improve compute throughput by less than a factor of two, and utime sums
>> the runtime of all logical cores. As a result, using a thread count
>> close to the number of logical cores roughly doubles utime on
>> HyperThreading CPUs, while rtime improves by less than a further 2x.
>>
>> Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn>
>> ---
>> v2: add check for HAVE_PTHREAD_CANCEL and modify FATE test
>> dnn-layer-conv2d-test.c
>> v4: use extern to call dnn_native_class in dnn-layer-conv2d-test.c
>>
>>  .../dnn/dnn_backend_native_layer_conv2d.c     | 107 ++++++++++++++++--
>>  tests/dnn/dnn-layer-conv2d-test.c             |  14 ++-
>>  2 files changed, 108 insertions(+), 13 deletions(-)
>>
>> diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
>> b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
>> index d079795bf8..4068a13ab4 100644
>> --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
>> +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
>> @@ -19,10 +19,27 @@
>>   */
>>
>>  #include "libavutil/avassert.h"
>> +#include "libavutil/thread.h"
>> +#include "libavutil/cpu.h"
>>  #include "dnn_backend_native_layer_conv2d.h"
>>
>>  #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
>>
>> +//struct to pass parameters
>> +typedef struct thread_common_param{
>> +    DnnOperand *operands;
>> +    const int32_t *input_operand_indexes;
>> +    int32_t output_operand_index;
>> +    const void *parameters;
>> +    NativeContext *ctx;
>> +    int thread_num;
>> +} thread_common_param;
>> +
>> +typedef struct thread_param{
>> +    thread_common_param *thread_common_param;
>> +    int thread_index;
>> +} thread_param;
>> +
>>  int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int
>>  file_size, int operands_num)
>>  {
>>      ConvolutionalParams *conv_params;
>> @@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext
>> *model_file_context, int fil
>>      return dnn_size;
>>  }
>>
>> -int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t
>> *input_operand_indexes,
>> -                             int32_t output_operand_index, const void
>> *parameters, NativeContext *ctx)
>> +static void * dnn_execute_layer_conv2d_thread(void *threadarg)
>>  {
>> +    //pass parameters
>> +    thread_param *thread_param = (struct thread_param *)threadarg;
>> +    thread_common_param *thread_common_param =
>> thread_param->thread_common_param;
>> +    DnnOperand *operands = thread_common_param->operands;
>>      float *output;
>> -    int32_t input_operand_index = input_operand_indexes[0];
>> +    int32_t input_operand_index =
>> thread_common_param->input_operand_indexes[0];
>>      int number = operands[input_operand_index].dims[0];
>>      int height = operands[input_operand_index].dims[1];
>>      int width = operands[input_operand_index].dims[2];
>>      int channel = operands[input_operand_index].dims[3];
>>      const float *input = operands[input_operand_index].data;
>> -    const ConvolutionalParams *conv_params = (const ConvolutionalParams
>> *)parameters;
>> +    const ConvolutionalParams *conv_params = (const ConvolutionalParams
>> *)(thread_common_param->parameters);
>>
>>      int radius = conv_params->kernel_size >> 1;
>>      int src_linesize = width * conv_params->input_num;
>> @@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const
>> int32_t *input_operand_
>>      int filter_size = conv_params->kernel_size * filter_linesize;
>>      int pad_size = (conv_params->padding_method == VALID) ?
>>      (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
>>
>> -    DnnOperand *output_operand = &operands[output_operand_index];
>> +    int thread_stride = (height - pad_size * 2) /
>> thread_common_param->thread_num;
>> +    int thread_start = thread_stride * thread_param->thread_index + pad_size;
>> +    int thread_end = (thread_param->thread_index ==
>> thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start +
>> thread_stride);
>> +
>> +    DnnOperand *output_operand =
>> &operands[thread_common_param->output_operand_index];
>>      output_operand->dims[0] = number;
>>      output_operand->dims[1] = height - pad_size * 2;
>>      output_operand->dims[2] = width - pad_size * 2;
>> @@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const
>> int32_t *input_operand_
>>      output_operand->data_type = operands[input_operand_index].data_type;
>>      output_operand->length = calculate_operand_data_length(output_operand);
>>      if (output_operand->length <= 0) {
>> -        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
>> -        return DNN_ERROR;
>> +        av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length
>> overflow\n");
>> +        return (void *)DNN_ERROR;
>>      }
>>      output_operand->data = av_realloc(output_operand->data, output_operand->length);
>>      if (!output_operand->data) {
>> -        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
>> -        return DNN_ERROR;
>> +        av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate
>> memory for output\n");
>> +        return (void *)DNN_ERROR;
>>      }
>> +
>>      output = output_operand->data;
>> +    output += (conv_params->output_num) * (width - 2 * pad_size) *
>> (thread_start - pad_size);
>>
>>      av_assert0(channel == conv_params->input_num);
>>
>> -    for (int y = pad_size; y < height - pad_size; ++y) {
>> +    for (int y = thread_start; y < thread_end; ++y) {
>>          for (int x = pad_size; x < width - pad_size; ++x) {
>>              for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
>>                  if (conv_params->has_bias)
>> @@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const
>> int32_t *input_operand_
>>              output += conv_params->output_num;
>>          }
>>      }
>> -    return 0;
>> +    return (void *)0;
> Why do you return (void *)0 here? I see that dnn_execute_layer_conv2d has an int return type.

Actually, this should return (void *)DNN_SUCCESS to be consistent with the rest of the code.
Thank you for pointing that out!
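
Side note on the cast itself: a common way to carry an integer status
through a pthread worker's void * return value is to widen it through
intptr_t rather than casting the enum value directly. A minimal sketch,
not part of this patch, with do_conv2d_rows() as a hypothetical stand-in
for the real per-thread body:

    #include <stdint.h>
    #include <pthread.h>

    /* stand-in for the real per-thread conv2d body; returns a DNN status */
    static int do_conv2d_rows(void *arg)
    {
        (void)arg;
        return 0; /* DNN_SUCCESS in the real code */
    }

    static void *conv2d_worker(void *arg)
    {
        int status = do_conv2d_rows(arg);
        return (void *)(intptr_t)status;   /* widen the int through intptr_t */
    }

    /* caller side, after pthread_create(&tid, NULL, conv2d_worker, arg) */
    static int join_and_collect(pthread_t tid)
    {
        void *res;
        pthread_join(tid, &res);
        return (int)(intptr_t)res;         /* recover the int status */
    }

Going through intptr_t also avoids the pointer/int size-mismatch warnings
some compilers emit for a plain (int)res cast on 64-bit targets.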

>> +}
>> +
>> +
>> +int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t
>> *input_operand_indexes,
>> +                             int32_t output_operand_index, const void
>> *parameters, NativeContext *ctx)
>> +{
>> +    int thread_num = (ctx->options.conv2d_threads <= 0 ||
>> ctx->options.conv2d_threads > av_cpu_count())
>> +        ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads);
>> +#if HAVE_PTHREAD_CANCEL
>> +    pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
>> +#endif
>> +    thread_param **thread_param = av_malloc(thread_num *
>> sizeof(*thread_param));
>> +    void *res;
>> +    int error_flag = 0;
>> +
>> +    //struct used to pass parameters
>> +    thread_common_param thread_common_param;
>> +    thread_common_param.operands = operands;
>> +    thread_common_param.input_operand_indexes = input_operand_indexes;
>> +    thread_common_param.output_operand_index = output_operand_index;
>> +    thread_common_param.parameters = parameters;
>> +    thread_common_param.ctx = ctx;
>> +#if HAVE_PTHREAD_CANCEL
>> +    thread_common_param.thread_num = thread_num;
>> +
>> +    //create threads
>> +    for (int i = 0; i < thread_num; i++){
>> +        thread_param[i] = av_malloc(sizeof(thread_param));
>> +        thread_param[i]->thread_common_param = &thread_common_param;
>> +        thread_param[i]->thread_index = i;
>> +        pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread,
>> (void *)thread_param[i]);
>> +    }
>> +
>> +    //join threads, res gets function return
>> +    for (int i = 0; i < thread_num; i++){
>> +        pthread_join(thread_id[i], &res);
>> +        if ((int)res != 0)
>> +            error_flag = (int)res;
>> +    }
>> +
>> +    //release memory
>> +    av_free(thread_id);
>> +
>> +    for (int i = 0; i < thread_num; i++){
>> +        av_free(thread_param[i]);
>> +    }
>> +#else
>> +    thread_common_param.thread_num = 1;
>> +    thread_param[0] = av_malloc(sizeof(thread_param));
>> +    thread_param[0]->thread_common_param = &thread_common_param;
>> +    thread_param[0]->thread_index = 0;
>> +    res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]);
>> +    if ((int)res != 0)
>> +        error_flag = (int)res;
>> +    av_free(thread_param[0]);
>> +#endif
>> +
>> +    av_free(thread_param);
>> +    return error_flag;
>>  }
>> diff --git a/tests/dnn/dnn-layer-conv2d-test.c
>> b/tests/dnn/dnn-layer-conv2d-test.c
>> index 836839cc64..378a05eafc 100644
>> --- a/tests/dnn/dnn-layer-conv2d-test.c
>> +++ b/tests/dnn/dnn-layer-conv2d-test.c
>> @@ -25,6 +25,8 @@
>>
>>  #define EPSON 0.00001
>>
>> +extern const AVClass dnn_native_class;
>> +
>>  static int test_with_same_dilate(void)
>>  {
>>      // the input data and expected data are generated with below python code.
>> @@ -96,6 +98,10 @@ static int test_with_same_dilate(void)
>>      };
>>      float bias[2] = { -1.6574852, -0.72915393 };
>>
>> +    NativeContext ctx;
>> +    ctx.class = &dnn_native_class;
>> +    ctx.options.conv2d_threads = 1;
>> +
>>      params.activation = TANH;
>>      params.has_bias = 1;
>>      params.biases = bias;
>> @@ -114,7 +120,7 @@ static int test_with_same_dilate(void)
>>      operands[1].data = NULL;
>>
>>      input_indexes[0] = 0;
>> -    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
>> +    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
>>
>>      output = operands[1].data;
>>      for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
>> @@ -196,6 +202,10 @@ static int test_with_valid(void)
>>      };
>>      float bias[2] = { -0.4773722, -0.19620377 };
>>
>> +    NativeContext ctx;
>> +    ctx.class = &dnn_native_class;
>> +    ctx.options.conv2d_threads = 1;
>> +
>>      params.activation = TANH;
>>      params.has_bias = 1;
>>      params.biases = bias;
>> @@ -214,7 +224,7 @@ static int test_with_valid(void)
>>      operands[1].data = NULL;
>>
>>      input_indexes[0] = 0;
>> -    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
>> +    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
>>
>>      output = operands[1].data;
>>      for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
>> --
>> 2.28.0
>>
> 
> 
> Thanks
> Steven

Thanks
- Xu Jun

Patch

diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
index d079795bf8..4068a13ab4 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -19,10 +19,27 @@ 
  */
 
 #include "libavutil/avassert.h"
+#include "libavutil/thread.h"
+#include "libavutil/cpu.h"
 #include "dnn_backend_native_layer_conv2d.h"
 
 #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
 
+//struct to pass parameters
+typedef struct thread_common_param{
+    DnnOperand *operands;
+    const int32_t *input_operand_indexes;
+    int32_t output_operand_index;
+    const void *parameters;
+    NativeContext *ctx;
+    int thread_num;
+} thread_common_param;
+
+typedef struct thread_param{
+    thread_common_param *thread_common_param;
+    int thread_index;
+} thread_param;
+
 int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
 {
     ConvolutionalParams *conv_params;
@@ -88,17 +105,20 @@  int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
     return dnn_size;
 }
 
-int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
-                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+static void * dnn_execute_layer_conv2d_thread(void *threadarg)
 {
+    //pass parameters
+    thread_param *thread_param = (struct thread_param *)threadarg;
+    thread_common_param *thread_common_param = thread_param->thread_common_param;
+    DnnOperand *operands = thread_common_param->operands;
     float *output;
-    int32_t input_operand_index = input_operand_indexes[0];
+    int32_t input_operand_index = thread_common_param->input_operand_indexes[0];
     int number = operands[input_operand_index].dims[0];
     int height = operands[input_operand_index].dims[1];
     int width = operands[input_operand_index].dims[2];
     int channel = operands[input_operand_index].dims[3];
     const float *input = operands[input_operand_index].data;
-    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters;
+    const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_common_param->parameters);
 
     int radius = conv_params->kernel_size >> 1;
     int src_linesize = width * conv_params->input_num;
@@ -106,7 +126,11 @@  int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     int filter_size = conv_params->kernel_size * filter_linesize;
     int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
 
-    DnnOperand *output_operand = &operands[output_operand_index];
+    int thread_stride = (height - pad_size * 2) / thread_common_param->thread_num;
+    int thread_start = thread_stride * thread_param->thread_index + pad_size;
+    int thread_end = (thread_param->thread_index == thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride);
+
+    DnnOperand *output_operand = &operands[thread_common_param->output_operand_index];
     output_operand->dims[0] = number;
     output_operand->dims[1] = height - pad_size * 2;
     output_operand->dims[2] = width - pad_size * 2;
@@ -114,19 +138,21 @@  int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
     output_operand->data_type = operands[input_operand_index].data_type;
     output_operand->length = calculate_operand_data_length(output_operand);
     if (output_operand->length <= 0) {
-        av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n");
-        return DNN_ERROR;
+        av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length overflow\n");
+        return (void *)DNN_ERROR;
     }
     output_operand->data = av_realloc(output_operand->data, output_operand->length);
     if (!output_operand->data) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
-        return DNN_ERROR;
+        av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n");
+        return (void *)DNN_ERROR;
     }
+
     output = output_operand->data;
+    output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
 
     av_assert0(channel == conv_params->input_num);
 
-    for (int y = pad_size; y < height - pad_size; ++y) {
+    for (int y = thread_start; y < thread_end; ++y) {
         for (int x = pad_size; x < width - pad_size; ++x) {
             for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
                 if (conv_params->has_bias)
@@ -174,5 +200,64 @@  int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
             output += conv_params->output_num;
         }
     }
-    return 0;
+    return (void *)0;
+}
+
+
+int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes,
+                             int32_t output_operand_index, const void *parameters, NativeContext *ctx)
+{
+    int thread_num = (ctx->options.conv2d_threads <= 0 || ctx->options.conv2d_threads > av_cpu_count())
+        ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads);
+#if HAVE_PTHREAD_CANCEL
+    pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t));
+#endif
+    thread_param **thread_param = av_malloc(thread_num * sizeof(*thread_param));
+    void *res;
+    int error_flag = 0;
+
+    //struct used to pass parameters
+    thread_common_param thread_common_param;
+    thread_common_param.operands = operands;
+    thread_common_param.input_operand_indexes = input_operand_indexes;
+    thread_common_param.output_operand_index = output_operand_index;
+    thread_common_param.parameters = parameters;
+    thread_common_param.ctx = ctx;
+#if HAVE_PTHREAD_CANCEL
+    thread_common_param.thread_num = thread_num;
+
+    //create threads
+    for (int i = 0; i < thread_num; i++){
+        thread_param[i] = av_malloc(sizeof(thread_param));
+        thread_param[i]->thread_common_param = &thread_common_param;
+        thread_param[i]->thread_index = i;
+        pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_param[i]);
+    }
+
+    //join threads, res gets function return
+    for (int i = 0; i < thread_num; i++){
+        pthread_join(thread_id[i], &res);
+        if ((int)res != 0)
+            error_flag = (int)res;
+    }
+
+    //release memory
+    av_free(thread_id);
+
+    for (int i = 0; i < thread_num; i++){
+        av_free(thread_param[i]);
+    }
+#else
+    thread_common_param.thread_num = 1;
+    thread_param[0] = av_malloc(sizeof(thread_param));
+    thread_param[0]->thread_common_param = &thread_common_param;
+    thread_param[0]->thread_index = 0;
+    res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]);
+    if ((int)res != 0)
+        error_flag = (int)res;
+    av_free(thread_param[0]);
+#endif
+
+    av_free(thread_param);
+    return error_flag;
 }
diff --git a/tests/dnn/dnn-layer-conv2d-test.c b/tests/dnn/dnn-layer-conv2d-test.c
index 836839cc64..378a05eafc 100644
--- a/tests/dnn/dnn-layer-conv2d-test.c
+++ b/tests/dnn/dnn-layer-conv2d-test.c
@@ -25,6 +25,8 @@ 
 
 #define EPSON 0.00001
 
+extern const AVClass dnn_native_class;
+
 static int test_with_same_dilate(void)
 {
     // the input data and expected data are generated with below python code.
@@ -96,6 +98,10 @@  static int test_with_same_dilate(void)
     };
     float bias[2] = { -1.6574852, -0.72915393 };
 
+    NativeContext ctx;
+    ctx.class = &dnn_native_class;
+    ctx.options.conv2d_threads = 1;
+
     params.activation = TANH;
     params.has_bias = 1;
     params.biases = bias;
@@ -114,7 +120,7 @@  static int test_with_same_dilate(void)
     operands[1].data = NULL;
 
     input_indexes[0] = 0;
-    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
+    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
 
     output = operands[1].data;
     for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
@@ -196,6 +202,10 @@  static int test_with_valid(void)
     };
     float bias[2] = { -0.4773722, -0.19620377 };
 
+    NativeContext ctx;
+    ctx.class = &dnn_native_class;
+    ctx.options.conv2d_threads = 1;
+
     params.activation = TANH;
     params.has_bias = 1;
     params.biases = bias;
@@ -214,7 +224,7 @@  static int test_with_valid(void)
     operands[1].data = NULL;
 
     input_indexes[0] = 0;
-    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL);
+    dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx);
 
     output = operands[1].data;
     for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {
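
For readers skimming the diff above, the per-thread row split in
dnn_execute_layer_conv2d_thread reduces to the arithmetic below (a
self-contained sketch of the same logic with a hypothetical helper name,
not code from the patch):

    /* Rows [pad_size, height - pad_size) are split into thread_num chunks;
     * the last thread also takes any remainder rows. Mirrors the
     * thread_stride / thread_start / thread_end computation in the patch. */
    static void conv2d_thread_rows(int height, int pad_size, int thread_num,
                                   int thread_index, int *start, int *end)
    {
        int rows   = height - 2 * pad_size;
        int stride = rows / thread_num;

        *start = pad_size + stride * thread_index;
        *end   = (thread_index == thread_num - 1) ? height - pad_size
                                                  : *start + stride;
    }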