Message ID | 20200904150511.5789-2-xujunzz@sjtu.edu.cn |
---|---|
State | Superseded |
Headers | show |
Series | [FFmpeg-devel,v4,1/2] dnn_backend_native.c: parse options in native backend | expand |
Context | Check | Description |
---|---|---|
andriy/default | pending | |
andriy/make | success | Make finished |
andriy/make_fate | success | Make fate finished |
<xujunzz@sjtu.edu.cn> 于2020年9月4日周五 下午11:09写道: > > From: Xu Jun <xujunzz@sjtu.edu.cn> > > Use pthread to multithread dnn_execute_layer_conv2d. > Can be tested with command "./ffmpeg_g -i input.png -vf \ > format=yuvj420p,dnn_processing=dnn_backend=native:model= \ > espcn.model:input=x:output=y:options=conv2d_threads=23 \ > -y sr_native.jpg -benchmark" > > before patch: utime=11.238s stime=0.005s rtime=11.248s > after patch: utime=20.817s stime=0.047s rtime=1.051s > on my 3900X 12c24t @4.2GHz > > About the increase of utime, it's because that CPU HyperThreading > technology makes logical cores twice of physical cores while cpu's > counting performance improves less than double. And utime sums > all cpu's logical cores' runtime. As a result, using threads num > near cpu's logical core's number will double utime, while reduce > rtime less than half for HyperThreading CPUs. > > Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn> > --- > v2: add check for HAVE_PTHREAD_CANCEL and modify FATE test > dnn-layer-conv2d-test.c > v4: use extern to call dnn_native_class in dnn-layer-conv2d-test.c > > .../dnn/dnn_backend_native_layer_conv2d.c | 107 ++++++++++++++++-- > tests/dnn/dnn-layer-conv2d-test.c | 14 ++- > 2 files changed, 108 insertions(+), 13 deletions(-) > > diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c > index d079795bf8..4068a13ab4 100644 > --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c > +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c > @@ -19,10 +19,27 @@ > */ > > #include "libavutil/avassert.h" > +#include "libavutil/thread.h" > +#include "libavutil/cpu.h" > #include "dnn_backend_native_layer_conv2d.h" > > #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? 
(w - 1) : (x))) > > +//struct to pass parameters > +typedef struct thread_common_param{ > + DnnOperand *operands; > + const int32_t *input_operand_indexes; > + int32_t output_operand_index; > + const void *parameters; > + NativeContext *ctx; > + int thread_num; > +} thread_common_param; > + > +typedef struct thread_param{ > + thread_common_param *thread_common_param; > + int thread_index; > +} thread_param; > + > int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num) > { > ConvolutionalParams *conv_params; > @@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil > return dnn_size; > } > > -int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes, > - int32_t output_operand_index, const void *parameters, NativeContext *ctx) > +static void * dnn_execute_layer_conv2d_thread(void *threadarg) > { > + //pass parameters > + thread_param *thread_param = (struct thread_param *)threadarg; > + thread_common_param *thread_common_param = thread_param->thread_common_param; > + DnnOperand *operands = thread_common_param->operands; > float *output; > - int32_t input_operand_index = input_operand_indexes[0]; > + int32_t input_operand_index = thread_common_param->input_operand_indexes[0]; > int number = operands[input_operand_index].dims[0]; > int height = operands[input_operand_index].dims[1]; > int width = operands[input_operand_index].dims[2]; > int channel = operands[input_operand_index].dims[3]; > const float *input = operands[input_operand_index].data; > - const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters; > + const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_common_param->parameters); > > int radius = conv_params->kernel_size >> 1; > int src_linesize = width * conv_params->input_num; > @@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ > int 
filter_size = conv_params->kernel_size * filter_linesize; > int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0; > > - DnnOperand *output_operand = &operands[output_operand_index]; > + int thread_stride = (height - pad_size * 2) / thread_common_param->thread_num; > + int thread_start = thread_stride * thread_param->thread_index + pad_size; > + int thread_end = (thread_param->thread_index == thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride); > + > + DnnOperand *output_operand = &operands[thread_common_param->output_operand_index]; > output_operand->dims[0] = number; > output_operand->dims[1] = height - pad_size * 2; > output_operand->dims[2] = width - pad_size * 2; > @@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ > output_operand->data_type = operands[input_operand_index].data_type; > output_operand->length = calculate_operand_data_length(output_operand); > if (output_operand->length <= 0) { > - av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n"); > - return DNN_ERROR; > + av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length overflow\n"); > + return (void *)DNN_ERROR; > } > output_operand->data = av_realloc(output_operand->data, output_operand->length); > if (!output_operand->data) { > - av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n"); > - return DNN_ERROR; > + av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n"); > + return (void *)DNN_ERROR; > } > + > output = output_operand->data; > + output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size); > > av_assert0(channel == conv_params->input_num); > > - for (int y = pad_size; y < height - pad_size; ++y) { > + for (int y = thread_start; y < thread_end; ++y) { > for (int x = pad_size; x < width - pad_size; ++x) { > for (int n_filter = 0; 
n_filter < conv_params->output_num; ++n_filter) { > if (conv_params->has_bias) > @@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ > output += conv_params->output_num; > } > } > - return 0; > + return (void *)0; why do you return a (void *) 0, I saw dnn_execute_layer_conv2d is int type. > +} > + > + > +int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes, > + int32_t output_operand_index, const void *parameters, NativeContext *ctx) > +{ > + int thread_num = (ctx->options.conv2d_threads <= 0 || ctx->options.conv2d_threads > av_cpu_count()) > + ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads); > +#if HAVE_PTHREAD_CANCEL > + pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t)); > +#endif > + thread_param **thread_param = av_malloc(thread_num * sizeof(*thread_param)); > + void *res; > + int error_flag = 0; > + > + //struct used to pass parameters > + thread_common_param thread_common_param; > + thread_common_param.operands = operands; > + thread_common_param.input_operand_indexes = input_operand_indexes; > + thread_common_param.output_operand_index = output_operand_index; > + thread_common_param.parameters = parameters; > + thread_common_param.ctx = ctx; > +#if HAVE_PTHREAD_CANCEL > + thread_common_param.thread_num = thread_num; > + > + //create threads > + for (int i = 0; i < thread_num; i++){ > + thread_param[i] = av_malloc(sizeof(thread_param)); > + thread_param[i]->thread_common_param = &thread_common_param; > + thread_param[i]->thread_index = i; > + pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_param[i]); > + } > + > + //join threads, res gets function return > + for (int i = 0; i < thread_num; i++){ > + pthread_join(thread_id[i], &res); > + if ((int)res != 0) > + error_flag = (int)res; > + } > + > + //release memory > + av_free(thread_id); > + > + for (int i = 0; i < thread_num; i++){ > + av_free(thread_param[i]); > + } > 
+#else > + thread_common_param.thread_num = 1; > + thread_param[0] = av_malloc(sizeof(thread_param)); > + thread_param[0]->thread_common_param = &thread_common_param; > + thread_param[0]->thread_index = 0; > + res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]); > + if ((int)res != 0) > + error_flag = (int)res; > + av_free(thread_param[0]); > +#endif > + > + av_free(thread_param); > + return error_flag; > } > diff --git a/tests/dnn/dnn-layer-conv2d-test.c b/tests/dnn/dnn-layer-conv2d-test.c > index 836839cc64..378a05eafc 100644 > --- a/tests/dnn/dnn-layer-conv2d-test.c > +++ b/tests/dnn/dnn-layer-conv2d-test.c > @@ -25,6 +25,8 @@ > > #define EPSON 0.00001 > > +extern const AVClass dnn_native_class; > + > static int test_with_same_dilate(void) > { > // the input data and expected data are generated with below python code. > @@ -96,6 +98,10 @@ static int test_with_same_dilate(void) > }; > float bias[2] = { -1.6574852, -0.72915393 }; > > + NativeContext ctx; > + ctx.class = &dnn_native_class; > + ctx.options.conv2d_threads = 1; > + > params.activation = TANH; > params.has_bias = 1; > params.biases = bias; > @@ -114,7 +120,7 @@ static int test_with_same_dilate(void) > operands[1].data = NULL; > > input_indexes[0] = 0; > - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL); > + dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx); > > output = operands[1].data; > for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) { > @@ -196,6 +202,10 @@ static int test_with_valid(void) > }; > float bias[2] = { -0.4773722, -0.19620377 }; > > + NativeContext ctx; > + ctx.class = &dnn_native_class; > + ctx.options.conv2d_threads = 1; > + > params.activation = TANH; > params.has_bias = 1; > params.biases = bias; > @@ -214,7 +224,7 @@ static int test_with_valid(void) > operands[1].data = NULL; > > input_indexes[0] = 0; > - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL); > + dnn_execute_layer_conv2d(operands, input_indexes,
1, &params, &ctx); > > output = operands[1].data; > for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) { > -- > 2.28.0 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". Thanks Steven
Hi, Steven ----- Original Message ----- > From: "Steven Liu" <lingjiujianke@gmail.com> > To: "FFmpeg development discussions and patches" <ffmpeg-devel@ffmpeg.org> > Sent: Saturday, September 5, 2020 6:07:45 AM > Subject: Re: [FFmpeg-devel] [PATCH v4 2/2] dnn_backend_native_layer_conv2d.c:Add mutithread function > <xujunzz@sjtu.edu.cn> 于2020年9月4日周五 下午11:09写道: >> >> From: Xu Jun <xujunzz@sjtu.edu.cn> >> >> Use pthread to multithread dnn_execute_layer_conv2d. >> Can be tested with command "./ffmpeg_g -i input.png -vf \ >> format=yuvj420p,dnn_processing=dnn_backend=native:model= \ >> espcn.model:input=x:output=y:options=conv2d_threads=23 \ >> -y sr_native.jpg -benchmark" >> >> before patch: utime=11.238s stime=0.005s rtime=11.248s >> after patch: utime=20.817s stime=0.047s rtime=1.051s >> on my 3900X 12c24t @4.2GHz >> >> About the increase of utime, it's because that CPU HyperThreading >> technology makes logical cores twice of physical cores while cpu's >> counting performance improves less than double. And utime sums >> all cpu's logical cores' runtime. As a result, using threads num >> near cpu's logical core's number will double utime, while reduce >> rtime less than half for HyperThreading CPUs. 
>> >> Signed-off-by: Xu Jun <xujunzz@sjtu.edu.cn> >> --- >> v2: add check for HAVE_PTHREAD_CANCEL and modify FATE test >> dnn-layer-conv2d-test.c >> v4: use extern to call dnn_native_class in dnn-layer-conv2d-test.c >> >> .../dnn/dnn_backend_native_layer_conv2d.c | 107 ++++++++++++++++-- >> tests/dnn/dnn-layer-conv2d-test.c | 14 ++- >> 2 files changed, 108 insertions(+), 13 deletions(-) >> >> diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c >> b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c >> index d079795bf8..4068a13ab4 100644 >> --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c >> +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c >> @@ -19,10 +19,27 @@ >> */ >> >> #include "libavutil/avassert.h" >> +#include "libavutil/thread.h" >> +#include "libavutil/cpu.h" >> #include "dnn_backend_native_layer_conv2d.h" >> >> #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x))) >> >> +//struct to pass parameters >> +typedef struct thread_common_param{ >> + DnnOperand *operands; >> + const int32_t *input_operand_indexes; >> + int32_t output_operand_index; >> + const void *parameters; >> + NativeContext *ctx; >> + int thread_num; >> +} thread_common_param; >> + >> +typedef struct thread_param{ >> + thread_common_param *thread_common_param; >> + int thread_index; >> +} thread_param; >> + >> int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int >> file_size, int operands_num) >> { >> ConvolutionalParams *conv_params; >> @@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext >> *model_file_context, int fil >> return dnn_size; >> } >> >> -int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t >> *input_operand_indexes, >> - int32_t output_operand_index, const void >> *parameters, NativeContext *ctx) >> +static void * dnn_execute_layer_conv2d_thread(void *threadarg) >> { >> + //pass parameters >> + thread_param *thread_param = (struct thread_param *)threadarg; >> + 
thread_common_param *thread_common_param = >> thread_param->thread_common_param; >> + DnnOperand *operands = thread_common_param->operands; >> float *output; >> - int32_t input_operand_index = input_operand_indexes[0]; >> + int32_t input_operand_index = >> thread_common_param->input_operand_indexes[0]; >> int number = operands[input_operand_index].dims[0]; >> int height = operands[input_operand_index].dims[1]; >> int width = operands[input_operand_index].dims[2]; >> int channel = operands[input_operand_index].dims[3]; >> const float *input = operands[input_operand_index].data; >> - const ConvolutionalParams *conv_params = (const ConvolutionalParams >> *)parameters; >> + const ConvolutionalParams *conv_params = (const ConvolutionalParams >> *)(thread_common_param->parameters); >> >> int radius = conv_params->kernel_size >> 1; >> int src_linesize = width * conv_params->input_num; >> @@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const >> int32_t *input_operand_ >> int filter_size = conv_params->kernel_size * filter_linesize; >> int pad_size = (conv_params->padding_method == VALID) ? >> (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0; >> >> - DnnOperand *output_operand = &operands[output_operand_index]; >> + int thread_stride = (height - pad_size * 2) / >> thread_common_param->thread_num; >> + int thread_start = thread_stride * thread_param->thread_index + pad_size; >> + int thread_end = (thread_param->thread_index == >> thread_common_param->thread_num - 1) ? 
(height - pad_size) : (thread_start + >> thread_stride); >> + >> + DnnOperand *output_operand = >> &operands[thread_common_param->output_operand_index]; >> output_operand->dims[0] = number; >> output_operand->dims[1] = height - pad_size * 2; >> output_operand->dims[2] = width - pad_size * 2; >> @@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const >> int32_t *input_operand_ >> output_operand->data_type = operands[input_operand_index].data_type; >> output_operand->length = calculate_operand_data_length(output_operand); >> if (output_operand->length <= 0) { >> - av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n"); >> - return DNN_ERROR; >> + av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length >> overflow\n"); >> + return (void *)DNN_ERROR; >> } >> output_operand->data = av_realloc(output_operand->data, output_operand->length); >> if (!output_operand->data) { >> - av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n"); >> - return DNN_ERROR; >> + av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate >> memory for output\n"); >> + return (void *)DNN_ERROR; >> } >> + >> output = output_operand->data; >> + output += (conv_params->output_num) * (width - 2 * pad_size) * >> (thread_start - pad_size); >> >> av_assert0(channel == conv_params->input_num); >> >> - for (int y = pad_size; y < height - pad_size; ++y) { >> + for (int y = thread_start; y < thread_end; ++y) { >> for (int x = pad_size; x < width - pad_size; ++x) { >> for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) { >> if (conv_params->has_bias) >> @@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const >> int32_t *input_operand_ >> output += conv_params->output_num; >> } >> } >> - return 0; >> + return (void *)0; > why do you return a (void *) 0, I saw dnn_execute_layer_conv2d is int type. Actually this should return a (void *)DNN_SUCCESS to be consistent with other codes. 
Thank you for pointing that out! >> +} >> + >> + >> +int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t >> *input_operand_indexes, >> + int32_t output_operand_index, const void >> *parameters, NativeContext *ctx) >> +{ >> + int thread_num = (ctx->options.conv2d_threads <= 0 || >> ctx->options.conv2d_threads > av_cpu_count()) >> + ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads); >> +#if HAVE_PTHREAD_CANCEL >> + pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t)); >> +#endif >> + thread_param **thread_param = av_malloc(thread_num * >> sizeof(*thread_param)); >> + void *res; >> + int error_flag = 0; >> + >> + //struct used to pass parameters >> + thread_common_param thread_common_param; >> + thread_common_param.operands = operands; >> + thread_common_param.input_operand_indexes = input_operand_indexes; >> + thread_common_param.output_operand_index = output_operand_index; >> + thread_common_param.parameters = parameters; >> + thread_common_param.ctx = ctx; >> +#if HAVE_PTHREAD_CANCEL >> + thread_common_param.thread_num = thread_num; >> + >> + //create threads >> + for (int i = 0; i < thread_num; i++){ >> + thread_param[i] = av_malloc(sizeof(thread_param)); >> + thread_param[i]->thread_common_param = &thread_common_param; >> + thread_param[i]->thread_index = i; >> + pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, >> (void *)thread_param[i]); >> + } >> + >> + //join threads, res gets function return >> + for (int i = 0; i < thread_num; i++){ >> + pthread_join(thread_id[i], &res); >> + if ((int)res != 0) >> + error_flag = (int)res; >> + } >> + >> + //release memory >> + av_free(thread_id); >> + >> + for (int i = 0; i < thread_num; i++){ >> + av_free(thread_param[i]); >> + } >> +#else >> + thread_common_param.thread_num = 1; >> + thread_param[0] = av_malloc(sizeof(thread_param)); >> + thread_param[0]->thread_common_param = &thread_common_param; >> + thread_param[0]->thread_index = 0; >> + res = 
dnn_execute_layer_conv2d_thread((void *)thread_param[0]); >> + if ((int)res != 0) >> + error_flag = (int)res; >> + av_free(thread_param[0]); >> +#endif >> + >> + av_free(thread_param); >> + return error_flag; >> } >> diff --git a/tests/dnn/dnn-layer-conv2d-test.c >> b/tests/dnn/dnn-layer-conv2d-test.c >> index 836839cc64..378a05eafc 100644 >> --- a/tests/dnn/dnn-layer-conv2d-test.c >> +++ b/tests/dnn/dnn-layer-conv2d-test.c >> @@ -25,6 +25,8 @@ >> >> #define EPSON 0.00001 >> >> +extern const AVClass dnn_native_class; >> + >> static int test_with_same_dilate(void) >> { >> // the input data and expected data are generated with below python code. >> @@ -96,6 +98,10 @@ static int test_with_same_dilate(void) >> }; >> float bias[2] = { -1.6574852, -0.72915393 }; >> >> + NativeContext ctx; >> + ctx.class = &dnn_native_class; >> + ctx.options.conv2d_threads = 1; >> + >> params.activation = TANH; >> params.has_bias = 1; >> params.biases = bias; >> @@ -114,7 +120,7 @@ static int test_with_same_dilate(void) >> operands[1].data = NULL; >> >> input_indexes[0] = 0; >> - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL); >> + dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx); >> >> output = operands[1].data; >> for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) { >> @@ -196,6 +202,10 @@ static int test_with_valid(void) >> }; >> float bias[2] = { -0.4773722, -0.19620377 }; >> >> + NativeContext ctx; >> + ctx.class = &dnn_native_class; >> + ctx.options.conv2d_threads = 1; >> + >> params.activation = TANH; >> params.has_bias = 1; >> params.biases = bias; >> @@ -214,7 +224,7 @@ static int test_with_valid(void) >> operands[1].data = NULL; >> >> input_indexes[0] = 0; >> - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL); >> + dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx); >> >> output = operands[1].data; >> for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) { >> -- >> 2.28.0 >> >>
_______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > > > Thanks > Steven > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". Thanks - Xu Jun
diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c index d079795bf8..4068a13ab4 100644 --- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c +++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c @@ -19,10 +19,27 @@ */ #include "libavutil/avassert.h" +#include "libavutil/thread.h" +#include "libavutil/cpu.h" #include "dnn_backend_native_layer_conv2d.h" #define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x))) +//struct to pass parameters +typedef struct thread_common_param{ + DnnOperand *operands; + const int32_t *input_operand_indexes; + int32_t output_operand_index; + const void *parameters; + NativeContext *ctx; + int thread_num; +} thread_common_param; + +typedef struct thread_param{ + thread_common_param *thread_common_param; + int thread_index; +} thread_param; + int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num) { ConvolutionalParams *conv_params; @@ -88,17 +105,20 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil return dnn_size; } -int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes, - int32_t output_operand_index, const void *parameters, NativeContext *ctx) +static void * dnn_execute_layer_conv2d_thread(void *threadarg) { + //pass parameters + thread_param *thread_param = (struct thread_param *)threadarg; + thread_common_param *thread_common_param = thread_param->thread_common_param; + DnnOperand *operands = thread_common_param->operands; float *output; - int32_t input_operand_index = input_operand_indexes[0]; + int32_t input_operand_index = thread_common_param->input_operand_indexes[0]; int number = operands[input_operand_index].dims[0]; int height = operands[input_operand_index].dims[1]; int width = operands[input_operand_index].dims[2]; int channel = operands[input_operand_index].dims[3]; const float *input = operands[input_operand_index].data; - 
const ConvolutionalParams *conv_params = (const ConvolutionalParams *)parameters; + const ConvolutionalParams *conv_params = (const ConvolutionalParams *)(thread_common_param->parameters); int radius = conv_params->kernel_size >> 1; int src_linesize = width * conv_params->input_num; @@ -106,7 +126,11 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ int filter_size = conv_params->kernel_size * filter_linesize; int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0; - DnnOperand *output_operand = &operands[output_operand_index]; + int thread_stride = (height - pad_size * 2) / thread_common_param->thread_num; + int thread_start = thread_stride * thread_param->thread_index + pad_size; + int thread_end = (thread_param->thread_index == thread_common_param->thread_num - 1) ? (height - pad_size) : (thread_start + thread_stride); + + DnnOperand *output_operand = &operands[thread_common_param->output_operand_index]; output_operand->dims[0] = number; output_operand->dims[1] = height - pad_size * 2; output_operand->dims[2] = width - pad_size * 2; @@ -114,19 +138,21 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ output_operand->data_type = operands[input_operand_index].data_type; output_operand->length = calculate_operand_data_length(output_operand); if (output_operand->length <= 0) { - av_log(ctx, AV_LOG_ERROR, "The output data length overflow\n"); - return DNN_ERROR; + av_log(thread_common_param->ctx, AV_LOG_ERROR, "The output data length overflow\n"); + return (void *)DNN_ERROR; } output_operand->data = av_realloc(output_operand->data, output_operand->length); if (!output_operand->data) { - av_log(ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n"); - return DNN_ERROR; + av_log(thread_common_param->ctx, AV_LOG_ERROR, "Failed to reallocate memory for output\n"); + return (void *)DNN_ERROR; } + output = output_operand->data; + 
output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size); av_assert0(channel == conv_params->input_num); - for (int y = pad_size; y < height - pad_size; ++y) { + for (int y = thread_start; y < thread_end; ++y) { for (int x = pad_size; x < width - pad_size; ++x) { for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) { if (conv_params->has_bias) @@ -174,5 +200,64 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_ output += conv_params->output_num; } } - return 0; + return (void *)0; +} + + +int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_indexes, + int32_t output_operand_index, const void *parameters, NativeContext *ctx) +{ + int thread_num = (ctx->options.conv2d_threads <= 0 || ctx->options.conv2d_threads > av_cpu_count()) + ? (av_cpu_count() + 1) : (ctx->options.conv2d_threads); +#if HAVE_PTHREAD_CANCEL + pthread_t *thread_id = av_malloc(thread_num * sizeof(pthread_t)); +#endif + thread_param **thread_param = av_malloc(thread_num * sizeof(*thread_param)); + void *res; + int error_flag = 0; + + //struct used to pass parameters + thread_common_param thread_common_param; + thread_common_param.operands = operands; + thread_common_param.input_operand_indexes = input_operand_indexes; + thread_common_param.output_operand_index = output_operand_index; + thread_common_param.parameters = parameters; + thread_common_param.ctx = ctx; +#if HAVE_PTHREAD_CANCEL + thread_common_param.thread_num = thread_num; + + //create threads + for (int i = 0; i < thread_num; i++){ + thread_param[i] = av_malloc(sizeof(thread_param)); + thread_param[i]->thread_common_param = &thread_common_param; + thread_param[i]->thread_index = i; + pthread_create(&thread_id[i], NULL, dnn_execute_layer_conv2d_thread, (void *)thread_param[i]); + } + + //join threads, res gets function return + for (int i = 0; i < thread_num; i++){ + pthread_join(thread_id[i], &res); + if ((int)res != 0) + 
error_flag = (int)res; + } + + //release memory + av_free(thread_id); + + for (int i = 0; i < thread_num; i++){ + av_free(thread_param[i]); + } +#else + thread_common_param.thread_num = 1; + thread_param[0] = av_malloc(sizeof(thread_param)); + thread_param[0]->thread_common_param = &thread_common_param; + thread_param[0]->thread_index = 0; + res = dnn_execute_layer_conv2d_thread((void *)thread_param[0]); + if ((int)res != 0) + error_flag = (int)res; + av_free(thread_param[0]); +#endif + + av_free(thread_param); + return error_flag; } diff --git a/tests/dnn/dnn-layer-conv2d-test.c b/tests/dnn/dnn-layer-conv2d-test.c index 836839cc64..378a05eafc 100644 --- a/tests/dnn/dnn-layer-conv2d-test.c +++ b/tests/dnn/dnn-layer-conv2d-test.c @@ -25,6 +25,8 @@ #define EPSON 0.00001 +extern const AVClass dnn_native_class; + static int test_with_same_dilate(void) { // the input data and expected data are generated with below python code. @@ -96,6 +98,10 @@ static int test_with_same_dilate(void) }; float bias[2] = { -1.6574852, -0.72915393 }; + NativeContext ctx; + ctx.class = &dnn_native_class; + ctx.options.conv2d_threads = 1; + params.activation = TANH; params.has_bias = 1; params.biases = bias; @@ -114,7 +120,7 @@ static int test_with_same_dilate(void) operands[1].data = NULL; input_indexes[0] = 0; - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL); + dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx); output = operands[1].data; for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) { @@ -196,6 +202,10 @@ static int test_with_valid(void) }; float bias[2] = { -0.4773722, -0.19620377 }; + NativeContext ctx; + ctx.class = &dnn_native_class; + ctx.options.conv2d_threads = 1; + params.activation = TANH; params.has_bias = 1; params.biases = bias; @@ -214,7 +224,7 @@ static int test_with_valid(void) operands[1].data = NULL; input_indexes[0] = 0; - dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, NULL); +
dnn_execute_layer_conv2d(operands, input_indexes, 1, &params, &ctx); output = operands[1].data; for (int i = 0; i < sizeof(expected_output) / sizeof(float); i++) {