diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c index ae53488837..10520cd765 100644 --- a/libavfilter/dnn/dnn_backend_openvino.c +++ b/libavfilter/dnn/dnn_backend_openvino.c @@ -46,6 +46,8 @@ typedef struct OVOptions{ int batch_size; int input_resizable; DNNLayout layout; + float scale; /* "scale" option: each input element is divided by this value */ + float mean; /* "mean" option: this value is subtracted from each input element */ } OVOptions; typedef struct OVContext { @@ -105,6 +107,8 @@ static const AVOption dnn_openvino_options[] = { { "none", "none", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NONE }, 0, 0, FLAGS, "layout"}, { "nchw", "nchw", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NCHW }, 0, 0, FLAGS, "layout"}, { "nhwc", "nhwc", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NHWC }, 0, 0, FLAGS, "layout"}, + { "scale", "Add scale preprocess operation. Divide each element of input by specified value.", OFFSET(options.scale), AV_OPT_TYPE_FLOAT, { .dbl = 0 }, INT_MIN, INT_MAX, FLAGS}, + { "mean", "Add mean preprocess operation. Subtract specified value from each element of input.", OFFSET(options.mean), AV_OPT_TYPE_FLOAT, { .dbl = 0 }, INT_MIN, INT_MAX, FLAGS}, { NULL } }; @@ -209,6 +213,7 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request) ie_blob_t *input_blob = NULL; #endif + memset(&input, 0, sizeof(input)); /* zero every field of input before it is filled in */ lltask = ff_queue_peek_front(ov_model->lltask_queue); av_assert0(lltask); task = lltask->task; @@ -280,6 +285,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request) // all models in openvino open model zoo use BGR as input, // change to be an option when necessary. input.order = DCO_BGR; + // We use preprocess_steps to scale input data, so disable scale and mean here. 
+ input.scale = 1; + input.mean = 0; for (int i = 0; i < ctx->options.batch_size; ++i) { lltask = ff_queue_pop_front(ov_model->lltask_queue); @@ -350,6 +358,7 @@ static void infer_completion_callback(void *args) ov_shape_t output_shape = {0}; ov_element_type_e precision; + memset(&output, 0, sizeof(output)); /* zero every field of output before it is filled in */ status = ov_infer_request_get_output_tensor_by_index(request->infer_request, 0, &output_tensor); if (status != OK) { av_log(ctx, AV_LOG_ERROR, @@ -418,6 +427,8 @@ static void infer_completion_callback(void *args) #endif output.dt = precision_to_datatype(precision); output.layout = ctx->options.layout; + output.scale = ctx->options.scale; /* hand the configured scale on with the output data */ + output.mean = ctx->options.mean; /* hand the configured mean on with the output data */ av_assert0(request->lltask_count >= 1); for (int i = 0; i < request->lltask_count; ++i) { @@ -561,7 +572,9 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char * ie_config_t config = {NULL, NULL, NULL}; char *all_dev_names = NULL; #endif - + // A scale of (nearly) 0 means unset: default to 255 for frame processing and to 1 otherwise. + if (fabsf(ctx->options.scale) < 1e-6f) + ctx->options.scale = ov_model->model->func_type == DFT_PROCESS_FRAME ? 
255 : 1; // batch size if (ctx->options.batch_size <= 0) { ctx->options.batch_size = 1; @@ -628,15 +641,37 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char * goto err; } + status = ov_preprocess_input_tensor_info_set_element_type(input_tensor_info, U8); /* the input tensor is always declared as U8 */ if (ov_model->model->func_type != DFT_PROCESS_FRAME) - //set precision only for detect and classify - status = ov_preprocess_input_tensor_info_set_element_type(input_tensor_info, U8); - status |= ov_preprocess_output_set_element_type(output_tensor_info, F32); + status |= ov_preprocess_output_set_element_type(output_tensor_info, F32); /* func_type other than DFT_PROCESS_FRAME: F32 output */ + else if (fabsf(ctx->options.scale - 1) > 1e-6f || fabsf(ctx->options.mean) > 1e-6f) /* frame processing with scale != 1 or mean != 0 */ + status |= ov_preprocess_output_set_element_type(output_tensor_info, F32); + else /* frame processing with scale == 1 and mean == 0: U8 output */ + status |= ov_preprocess_output_set_element_type(output_tensor_info, U8); if (status != OK) { av_log(ctx, AV_LOG_ERROR, "Failed to set input/output element type\n"); ret = ov2_map_error(status, NULL); goto err; } + // set preprocess steps. 
+ if (fabsf(ctx->options.scale - 1) > 1e-6f || fabsf(ctx->options.mean) > 1e-6f) { /* steps are only added when scale != 1 or mean != 0 */ + ov_preprocess_preprocess_steps_t* input_process_steps = NULL; + status = ov_preprocess_input_info_get_preprocess_steps(ov_model->input_info, &input_process_steps); + if (status != OK) { + av_log(ctx, AV_LOG_ERROR, "Failed to get preprocess steps\n"); + ret = ov2_map_error(status, NULL); + goto err; + } + status = ov_preprocess_preprocess_steps_convert_element_type(input_process_steps, F32); + status |= ov_preprocess_preprocess_steps_mean(input_process_steps, ctx->options.mean); + status |= ov_preprocess_preprocess_steps_scale(input_process_steps, ctx->options.scale); + ov_preprocess_preprocess_steps_free(input_process_steps); /* released before the status check so the error path cannot leak it */ + if (status != OK) { + av_log(ctx, AV_LOG_ERROR, "Failed to set preprocess steps\n"); + ret = ov2_map_error(status, NULL); + goto err; + } + } //update model if(ov_model->ov_model) diff --git a/libavfilter/dnn/dnn_io_proc.c b/libavfilter/dnn/dnn_io_proc.c index dfa0d5e5da..ab656e8ed7 100644 --- a/libavfilter/dnn/dnn_io_proc.c +++ b/libavfilter/dnn/dnn_io_proc.c @@ -24,6 +24,20 @@ #include "libavutil/avassert.h" #include "libavutil/detection_bbox.h" +static int get_datatype_size(DNNDataType dt) /* returns the size in bytes of one element of type dt */ +{ + switch (dt) + { + case DNN_FLOAT: + return sizeof(float); + case DNN_UINT8: + return sizeof(uint8_t); + default: /* only DNN_FLOAT and DNN_UINT8 are handled; any other value trips av_assert0 */ + av_assert0(!"not supported yet."); + return 1; + } +} + int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx) { struct SwsContext *sws_ctx; @@ -33,14 +47,26 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx) void *middle_data = NULL; uint8_t *planar_data[4] = { 0 }; int plane_size = frame->width * frame->height * sizeof(uint8_t); + enum AVPixelFormat src_fmt = AV_PIX_FMT_NONE; /* swscale source format, chosen from dt/scale/mean below */ + int src_datatype_size = get_datatype_size(output->dt); /* bytes per element of output->data */ + int bytewidth = av_image_get_linesize(frame->format, frame->width, 0); if (bytewidth < 0) { return AVERROR(EINVAL); } - if (output->dt != DNN_FLOAT) { - 
avpriv_report_missing_feature(log_ctx, "data type rather than DNN_FLOAT"); + /* scale == 1 and mean == 0 and dt == UINT8: passthrough */ + if (fabsf(output->scale - 1) < 1e-6f && fabsf(output->mean) < 1e-6 && output->dt == DNN_UINT8) + src_fmt = AV_PIX_FMT_GRAY8; + /* (scale == 255 or scale == 0) and mean == 0 and dt == FLOAT: normalization */ + else if ((fabsf(output->scale - 255) < 1e-6f || fabsf(output->scale) < 1e-6f) && + fabsf(output->mean) < 1e-6 && output->dt == DNN_FLOAT) + src_fmt = AV_PIX_FMT_GRAYF32; + else { /* every other dt/scale/mean combination is rejected with ENOSYS */ + av_log(log_ctx, AV_LOG_ERROR, "dnn_process output data doesn't support type: UINT8 " + "scale: %f, mean: %f\n", output->scale, output->mean); return AVERROR(ENOSYS); } + dst_data = (void **)frame->data; linesize[0] = frame->linesize[0]; if (output->layout == DL_NCHW) { @@ -58,7 +84,7 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx) case AV_PIX_FMT_BGR24: sws_ctx = sws_getContext(frame->width * 3, frame->height, - AV_PIX_FMT_GRAYF32, + src_fmt, frame->width * 3, frame->height, AV_PIX_FMT_GRAY8, @@ -66,13 +92,13 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx) if (!sws_ctx) { av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion " "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", - av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32), frame->width * 3, frame->height, + av_get_pix_fmt_name(src_fmt), frame->width * 3, frame->height, av_get_pix_fmt_name(AV_PIX_FMT_GRAY8), frame->width * 3, frame->height); ret = AVERROR(EINVAL); goto err; } sws_scale(sws_ctx, (const uint8_t *[4]){(const uint8_t *)output->data, 0, 0, 0}, - (const int[4]){frame->width * 3 * sizeof(float), 0, 0, 0}, 0, frame->height, + (const int[4]){frame->width * 3 * src_datatype_size, 0, 0, 0}, 0, frame->height, (uint8_t * const*)dst_data, linesize); sws_freeContext(sws_ctx); // convert data from planar to packed @@ -131,13 +157,13 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx) if (!sws_ctx) 
{ av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion " "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", - av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32), frame->width, frame->height, + av_get_pix_fmt_name(src_fmt), frame->width, frame->height, av_get_pix_fmt_name(AV_PIX_FMT_GRAY8), frame->width, frame->height); ret = AVERROR(EINVAL); goto err; } sws_scale(sws_ctx, (const uint8_t *[4]){(const uint8_t *)output->data, 0, 0, 0}, - (const int[4]){frame->width * sizeof(float), 0, 0, 0}, 0, frame->height, + (const int[4]){frame->width * src_datatype_size, 0, 0, 0}, 0, frame->height, (uint8_t * const*)frame->data, frame->linesize); sws_freeContext(sws_ctx); break; @@ -161,12 +187,22 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx) void *middle_data = NULL; uint8_t *planar_data[4] = { 0 }; int plane_size = frame->width * frame->height * sizeof(uint8_t); + enum AVPixelFormat dst_fmt = AV_PIX_FMT_NONE; /* swscale destination format, chosen from dt/scale/mean below */ + int dst_datatype_size = get_datatype_size(input->dt); /* bytes per element written to input->data */ int bytewidth = av_image_get_linesize(frame->format, frame->width, 0); if (bytewidth < 0) { return AVERROR(EINVAL); } - if (input->dt != DNN_FLOAT) { - avpriv_report_missing_feature(log_ctx, "data type rather than DNN_FLOAT"); + /* scale == 1 and mean == 0 and dt == UINT8: passthrough */ + if (fabsf(input->scale - 1) < 1e-6f && fabsf(input->mean) < 1e-6 && input->dt == DNN_UINT8) + dst_fmt = AV_PIX_FMT_GRAY8; + /* (scale == 255 or scale == 0) and mean == 0 and dt == FLOAT: normalization */ + else if ((fabsf(input->scale - 255) < 1e-6f || fabsf(input->scale) < 1e-6f) && + fabsf(input->mean) < 1e-6 && input->dt == DNN_FLOAT) + dst_fmt = AV_PIX_FMT_GRAYF32; + else { /* every other dt/scale/mean combination is rejected with ENOSYS */ + av_log(log_ctx, AV_LOG_ERROR, "dnn_process input data doesn't support type: UINT8 " + "scale: %f, mean: %f\n", input->scale, input->mean); return AVERROR(ENOSYS); } @@ -223,20 +259,20 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx) AV_PIX_FMT_GRAY8, frame->width * 3, frame->height, - 
AV_PIX_FMT_GRAYF32, + dst_fmt, 0, NULL, NULL, NULL); if (!sws_ctx) { av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion " "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", av_get_pix_fmt_name(AV_PIX_FMT_GRAY8), frame->width * 3, frame->height, - av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32),frame->width * 3, frame->height); + av_get_pix_fmt_name(dst_fmt),frame->width * 3, frame->height); ret = AVERROR(EINVAL); goto err; } sws_scale(sws_ctx, (const uint8_t **)src_data, linesize, 0, frame->height, (uint8_t * const [4]){input->data, 0, 0, 0}, - (const int [4]){frame->width * 3 * sizeof(float), 0, 0, 0}); + (const int [4]){frame->width * 3 * dst_datatype_size, 0, 0, 0}); /* destination stride uses the element size of input->dt */ sws_freeContext(sws_ctx); break; case AV_PIX_FMT_GRAYF32: @@ -256,20 +292,20 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx) AV_PIX_FMT_GRAY8, frame->width, frame->height, - AV_PIX_FMT_GRAYF32, + dst_fmt, 0, NULL, NULL, NULL); if (!sws_ctx) { av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion " "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n", av_get_pix_fmt_name(AV_PIX_FMT_GRAY8), frame->width, frame->height, - av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32),frame->width, frame->height); + av_get_pix_fmt_name(dst_fmt),frame->width, frame->height); ret = AVERROR(EINVAL); goto err; } sws_scale(sws_ctx, (const uint8_t **)frame->data, frame->linesize, 0, frame->height, (uint8_t * const [4]){input->data, 0, 0, 0}, - (const int [4]){frame->width * sizeof(float), 0, 0, 0}); + (const int [4]){frame->width * dst_datatype_size, 0, 0, 0}); /* destination stride uses the element size of input->dt */ sws_freeContext(sws_ctx); break; default: @@ -315,6 +351,14 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES); av_assert0(sd); + /* reject the input when (scale != 1 and scale != 0) or mean != 0 */ + if ((fabsf(input->scale - 1) > 1e-6f && fabsf(input->scale) > 1e-6f) || + fabsf(input->mean) > 1e-6f) { + av_log(log_ctx, 
AV_LOG_ERROR, "dnn_classify input data doesn't support " + "scale: %f, mean: %f\n", input->scale, input->mean); + return AVERROR(ENOSYS); + } + if (input->layout == DL_NCHW) { av_log(log_ctx, AV_LOG_ERROR, "dnn_classify input data doesn't support layout: NCHW\n"); return AVERROR(ENOSYS); @@ -373,6 +417,14 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx) int ret = 0; enum AVPixelFormat fmt = get_pixel_format(input); + /* reject the input when (scale != 1 and scale != 0) or mean != 0 */ + if ((fabsf(input->scale - 1) > 1e-6f && fabsf(input->scale) > 1e-6f) || + fabsf(input->mean) > 1e-6f) { + av_log(log_ctx, AV_LOG_ERROR, "dnn_detect input data doesn't support " + "scale: %f, mean: %f\n", input->scale, input->mean); + return AVERROR(ENOSYS); + } + if (input->layout == DL_NCHW) { av_log(log_ctx, AV_LOG_ERROR, "dnn_detect input data doesn't support layout: NCHW\n"); return AVERROR(ENOSYS); diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h index 956a63443a..183d8418b2 100644 --- a/libavfilter/dnn_interface.h +++ b/libavfilter/dnn_interface.h @@ -69,6 +69,8 @@ typedef struct DNNData{ DNNDataType dt; DNNColorOrder order; DNNLayout layout; + float scale; /* the frame<->dnn helpers accept 1 (UINT8 passthrough) or 0/255 (FLOAT normalization) */ + float mean; /* the frame<->dnn helpers only accept a mean of 0 */ } DNNData; typedef struct DNNExecBaseParams {