libavfilter/dnn_interface: use dims to represent shapes
For detect and classify output, width and height make no sense, so change width and height to dims to represent the shape of the tensor. Use layout and dims to get width, height and channel.

Signed-off-by: Wenbin Chen <wenbin.chen@intel.com>
Reviewed-by: Guo Yejun <yejun.guo@intel.com>
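As background (not part of the commit): a minimal, self-contained C sketch of the indexing convention the new fields rely on. The tensor shape is stored once in a dims[4] array together with a layout tag, and width/height/channel are looked up by index instead of being stored as separate fields. The helpers below are illustrative stand-ins for the dnn_get_width/height/channel_idx_by_layout() helpers referenced in the diff; the exact index mapping used here (NCHW: c=1, h=2, w=3; NHWC: h=1, w=2, c=3) is an assumption made for the example.

    #include <stdio.h>

    typedef enum { DL_NONE, DL_NCHW, DL_NHWC } DNNLayout;   /* simplified stand-in */

    static int width_idx(DNNLayout l)   { return l == DL_NCHW ? 3 : 2; }
    static int height_idx(DNNLayout l)  { return l == DL_NCHW ? 2 : 1; }
    static int channel_idx(DNNLayout l) { return l == DL_NCHW ? 1 : 3; }

    int main(void)
    {
        /* The same 1x3x224x224 image described in both layouts. */
        int nchw[4] = { 1, 3, 224, 224 };
        int nhwc[4] = { 1, 224, 224, 3 };

        printf("NCHW: w=%d h=%d c=%d\n",
               nchw[width_idx(DL_NCHW)], nchw[height_idx(DL_NCHW)], nchw[channel_idx(DL_NCHW)]);
        printf("NHWC: w=%d h=%d c=%d\n",
               nhwc[width_idx(DL_NHWC)], nhwc[height_idx(DL_NHWC)], nhwc[channel_idx(DL_NHWC)]);
        return 0;
    }

Both calls print w=224 h=224 c=3; only the index chosen by the layout changes, which is why the backends in the diff below can drop the separate width/height/channels fields.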
libavfilter/dnn/dnn_backend_openvino.c

@@ -253,9 +253,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
         ov_shape_free(&input_shape);
         return ov2_map_error(status, NULL);
     }
-    input.height = dims[1];
-    input.width = dims[2];
-    input.channels = dims[3];
+    for (int i = 0; i < input_shape.rank; i++)
+        input.dims[i] = dims[i];
+    input.layout = DL_NHWC;
     input.dt = precision_to_datatype(precision);
 #else
     status = ie_infer_request_get_blob(request->infer_request, task->input_name, &input_blob);
@@ -278,9 +278,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
         av_log(ctx, AV_LOG_ERROR, "Failed to get input blob buffer\n");
         return DNN_GENERIC_ERROR;
     }
-    input.height = dims.dims[2];
-    input.width = dims.dims[3];
-    input.channels = dims.dims[1];
+    for (int i = 0; i < input_shape.rank; i++)
+        input.dims[i] = dims[i];
+    input.layout = DL_NCHW;
     input.data = blob_buffer.buffer;
     input.dt = precision_to_datatype(precision);
 #endif
@@ -339,8 +339,8 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
             av_assert0(!"should not reach here");
             break;
         }
-        input.data = (uint8_t *)input.data
-                     + input.width * input.height * input.channels * get_datatype_size(input.dt);
+        input.data = (uint8_t *)input.data +
+                     input.dims[1] * input.dims[2] * input.dims[3] * get_datatype_size(input.dt);
     }
 #if HAVE_OPENVINO2
     ov_tensor_free(tensor);
@@ -403,10 +403,11 @@ static void infer_completion_callback(void *args)
             goto end;
         }
         outputs[i].dt = precision_to_datatype(precision);
-
-        outputs[i].channels = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1;
-        outputs[i].height = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1;
-        outputs[i].width = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1;
+        outputs[i].layout = DL_NCHW;
+        outputs[i].dims[0] = 1;
+        outputs[i].dims[1] = output_shape.rank > 2 ? dims[output_shape.rank - 3] : 1;
+        outputs[i].dims[2] = output_shape.rank > 1 ? dims[output_shape.rank - 2] : 1;
+        outputs[i].dims[3] = output_shape.rank > 0 ? dims[output_shape.rank - 1] : 1;
         av_assert0(request->lltask_count <= dims[0]);
         outputs[i].layout = ctx->options.layout;
         outputs[i].scale = ctx->options.scale;
@@ -445,9 +446,9 @@ static void infer_completion_callback(void *args)
         return;
     }
     output.data = blob_buffer.buffer;
-    output.channels = dims.dims[1];
-    output.height = dims.dims[2];
-    output.width = dims.dims[3];
+    output.layout = DL_NCHW;
+    for (int i = 0; i < 4; i++)
+        output.dims[i] = dims.dims[i];
     av_assert0(request->lltask_count <= dims.dims[0]);
     output.dt = precision_to_datatype(precision);
     output.layout = ctx->options.layout;
@@ -469,8 +470,10 @@ static void infer_completion_callback(void *args)
             ff_proc_from_dnn_to_frame(task->out_frame, outputs, ctx);
         }
     } else {
-        task->out_frame->width = outputs[0].width;
-        task->out_frame->height = outputs[0].height;
+        task->out_frame->width =
+            outputs[0].dims[dnn_get_width_idx_by_layout(outputs[0].layout)];
+        task->out_frame->height =
+            outputs[0].dims[dnn_get_height_idx_by_layout(outputs[0].layout)];
     }
     break;
 case DFT_ANALYTICS_DETECT:
@@ -501,7 +504,8 @@ static void infer_completion_callback(void *args)
         av_freep(&request->lltasks[i]);
     for (int i = 0; i < ov_model->nb_outputs; i++)
         outputs[i].data = (uint8_t *)outputs[i].data +
-            outputs[i].width * outputs[i].height * outputs[i].channels * get_datatype_size(outputs[i].dt);
+            outputs[i].dims[1] * outputs[i].dims[2] * outputs[i].dims[3] *
+            get_datatype_size(outputs[i].dt);
     }
 end:
 #if HAVE_OPENVINO2
@@ -1085,7 +1089,6 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
 #if HAVE_OPENVINO2
     ov_shape_t input_shape = {0};
     ov_element_type_e precision;
-    int64_t* dims;
     ov_status_e status;
     if (input_name)
         status = ov_model_const_input_by_name(ov_model->ov_model, input_name, &ov_model->input_port);
@@ -1105,16 +1108,18 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
         av_log(ctx, AV_LOG_ERROR, "Failed to get input port shape.\n");
         return ov2_map_error(status, NULL);
     }
-    dims = input_shape.dims;
-    if (dims[1] <= 3) { // NCHW
-        input->channels = dims[1];
-        input->height = input_resizable ? -1 : dims[2];
-        input->width = input_resizable ? -1 : dims[3];
-    } else { // NHWC
-        input->height = input_resizable ? -1 : dims[1];
-        input->width = input_resizable ? -1 : dims[2];
-        input->channels = dims[3];
+    for (int i = 0; i < 4; i++)
+        input->dims[i] = input_shape.dims[i];
+    if (input_resizable) {
+        input->dims[dnn_get_width_idx_by_layout(input->layout)] = -1;
+        input->dims[dnn_get_height_idx_by_layout(input->layout)] = -1;
     }
+
+    if (input_shape.dims[1] <= 3) // NCHW
+        input->layout = DL_NCHW;
+    else // NHWC
+        input->layout = DL_NHWC;
+
     input->dt = precision_to_datatype(precision);
     ov_shape_free(&input_shape);
     return 0;
@@ -1144,15 +1149,18 @@ static int get_input_ov(void *model, DNNData *input, const char *input_name)
         return DNN_GENERIC_ERROR;
     }
 
-    if (dims[1] <= 3) { // NCHW
-        input->channels = dims[1];
-        input->height = input_resizable ? -1 : dims[2];
-        input->width = input_resizable ? -1 : dims[3];
-    } else { // NHWC
-        input->height = input_resizable ? -1 : dims[1];
-        input->width = input_resizable ? -1 : dims[2];
-        input->channels = dims[3];
+    for (int i = 0; i < 4; i++)
+        input->dims[i] = input_shape.dims[i];
+    if (input_resizable) {
+        input->dims[dnn_get_width_idx_by_layout(input->layout)] = -1;
+        input->dims[dnn_get_height_idx_by_layout(input->layout)] = -1;
     }
+
+    if (input_shape.dims[1] <= 3) // NCHW
+        input->layout = DL_NCHW;
+    else // NHWC
+        input->layout = DL_NHWC;
+
     input->dt = precision_to_datatype(precision);
     return 0;
 }
libavfilter/dnn/dnn_backend_tf.c

@@ -251,7 +251,12 @@ static TF_Tensor *allocate_input_tensor(const DNNData *input)
 {
     TF_DataType dt;
     size_t size;
-    int64_t input_dims[] = {1, input->height, input->width, input->channels};
+    int64_t input_dims[4] = { 0 };
+
+    input_dims[0] = 1;
+    input_dims[1] = input->dims[dnn_get_height_idx_by_layout(input->layout)];
+    input_dims[2] = input->dims[dnn_get_width_idx_by_layout(input->layout)];
+    input_dims[3] = input->dims[dnn_get_channel_idx_by_layout(input->layout)];
     switch (input->dt) {
     case DNN_FLOAT:
         dt = TF_FLOAT;
@@ -310,9 +315,9 @@ static int get_input_tf(void *model, DNNData *input, const char *input_name)
 
     // currently only NHWC is supported
     av_assert0(dims[0] == 1 || dims[0] == -1);
-    input->height = dims[1];
-    input->width = dims[2];
-    input->channels = dims[3];
+    for (int i = 0; i < 4; i++)
+        input->dims[i] = dims[i];
+    input->layout = DL_NHWC;
 
     return 0;
 }
@@ -640,8 +645,8 @@ static int fill_model_input_tf(TFModel *tf_model, TFRequestItem *request) {
     }
 
     infer_request = request->infer_request;
-    input.height = task->in_frame->height;
-    input.width = task->in_frame->width;
+    input.dims[1] = task->in_frame->height;
+    input.dims[2] = task->in_frame->width;
 
     infer_request->tf_input = av_malloc(sizeof(TF_Output));
     if (!infer_request->tf_input) {
@@ -731,9 +736,12 @@ static void infer_completion_callback(void *args) {
     }
 
     for (uint32_t i = 0; i < task->nb_output; ++i) {
-        outputs[i].height = TF_Dim(infer_request->output_tensors[i], 1);
-        outputs[i].width = TF_Dim(infer_request->output_tensors[i], 2);
-        outputs[i].channels = TF_Dim(infer_request->output_tensors[i], 3);
+        outputs[i].dims[dnn_get_height_idx_by_layout(outputs[i].layout)] =
+            TF_Dim(infer_request->output_tensors[i], 1);
+        outputs[i].dims[dnn_get_width_idx_by_layout(outputs[i].layout)] =
+            TF_Dim(infer_request->output_tensors[i], 2);
+        outputs[i].dims[dnn_get_channel_idx_by_layout(outputs[i].layout)] =
+            TF_Dim(infer_request->output_tensors[i], 3);
         outputs[i].data = TF_TensorData(infer_request->output_tensors[i]);
         outputs[i].dt = (DNNDataType)TF_TensorType(infer_request->output_tensors[i]);
     }
@@ -747,8 +755,10 @@ static void infer_completion_callback(void *args) {
             ff_proc_from_dnn_to_frame(task->out_frame, outputs, ctx);
         }
     } else {
-        task->out_frame->width = outputs[0].width;
-        task->out_frame->height = outputs[0].height;
+        task->out_frame->width =
+            outputs[0].dims[dnn_get_width_idx_by_layout(outputs[0].layout)];
+        task->out_frame->height =
+            outputs[0].dims[dnn_get_height_idx_by_layout(outputs[0].layout)];
     }
     break;
 case DFT_ANALYTICS_DETECT:
libavfilter/dnn/dnn_io_proc.c

@@ -70,7 +70,7 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
     dst_data = (void **)frame->data;
     linesize[0] = frame->linesize[0];
     if (output->layout == DL_NCHW) {
-        middle_data = av_malloc(plane_size * output->channels);
+        middle_data = av_malloc(plane_size * output->dims[1]);
         if (!middle_data) {
             ret = AVERROR(ENOMEM);
             goto err;
@@ -209,7 +209,7 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx)
     src_data = (void **)frame->data;
     linesize[0] = frame->linesize[0];
     if (input->layout == DL_NCHW) {
-        middle_data = av_malloc(plane_size * input->channels);
+        middle_data = av_malloc(plane_size * input->dims[1]);
         if (!middle_data) {
             ret = AVERROR(ENOMEM);
             goto err;
@@ -346,6 +346,7 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index
     int ret = 0;
     enum AVPixelFormat fmt;
     int left, top, width, height;
+    int width_idx, height_idx;
     const AVDetectionBBoxHeader *header;
     const AVDetectionBBox *bbox;
     AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
@@ -364,6 +365,9 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index
         return AVERROR(ENOSYS);
     }
 
+    width_idx = dnn_get_width_idx_by_layout(input->layout);
+    height_idx = dnn_get_height_idx_by_layout(input->layout);
+
     header = (const AVDetectionBBoxHeader *)sd->data;
     bbox = av_get_detection_bbox(header, bbox_index);
 
@@ -374,17 +378,20 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index
 
     fmt = get_pixel_format(input);
     sws_ctx = sws_getContext(width, height, frame->format,
-                             input->width, input->height, fmt,
+                             input->dims[width_idx],
+                             input->dims[height_idx], fmt,
                              SWS_FAST_BILINEAR, NULL, NULL, NULL);
     if (!sws_ctx) {
         av_log(log_ctx, AV_LOG_ERROR, "Failed to create scale context for the conversion "
                "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
                av_get_pix_fmt_name(frame->format), width, height,
-               av_get_pix_fmt_name(fmt), input->width, input->height);
+               av_get_pix_fmt_name(fmt),
+               input->dims[width_idx],
+               input->dims[height_idx]);
         return AVERROR(EINVAL);
     }
 
-    ret = av_image_fill_linesizes(linesizes, fmt, input->width);
+    ret = av_image_fill_linesizes(linesizes, fmt, input->dims[width_idx]);
     if (ret < 0) {
         av_log(log_ctx, AV_LOG_ERROR, "unable to get linesizes with av_image_fill_linesizes");
         sws_freeContext(sws_ctx);
@@ -414,7 +421,7 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx)
 {
     struct SwsContext *sws_ctx;
     int linesizes[4];
-    int ret = 0;
+    int ret = 0, width_idx, height_idx;
     enum AVPixelFormat fmt = get_pixel_format(input);
 
     /* (scale != 1 and scale != 0) or mean != 0 */
@@ -430,18 +437,23 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx)
         return AVERROR(ENOSYS);
     }
 
+    width_idx = dnn_get_width_idx_by_layout(input->layout);
+    height_idx = dnn_get_height_idx_by_layout(input->layout);
+
     sws_ctx = sws_getContext(frame->width, frame->height, frame->format,
-                             input->width, input->height, fmt,
+                             input->dims[width_idx],
+                             input->dims[height_idx], fmt,
                              SWS_FAST_BILINEAR, NULL, NULL, NULL);
     if (!sws_ctx) {
         av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion "
                "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
               av_get_pix_fmt_name(frame->format), frame->width, frame->height,
-              av_get_pix_fmt_name(fmt), input->width, input->height);
+              av_get_pix_fmt_name(fmt), input->dims[width_idx],
+              input->dims[height_idx]);
         return AVERROR(EINVAL);
     }
 
-    ret = av_image_fill_linesizes(linesizes, fmt, input->width);
+    ret = av_image_fill_linesizes(linesizes, fmt, input->dims[width_idx]);
     if (ret < 0) {
         av_log(log_ctx, AV_LOG_ERROR, "unable to get linesizes with av_image_fill_linesizes");
         sws_freeContext(sws_ctx);
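Closing note (not part of the commit): the OpenVINO hunks above advance the per-batch data pointer by dims[1] * dims[2] * dims[3] * get_datatype_size(dt), i.e. the element count of one tensor times the element size, which works for either layout because dims always holds the full shape. A self-contained sketch of that arithmetic follows; elem_size_bytes() is a hypothetical stand-in for get_datatype_size().

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for get_datatype_size(). */
    static size_t elem_size_bytes(int is_float) { return is_float ? 4 : 1; }

    int main(void)
    {
        int dims[4] = { 2, 3, 4, 5 };      /* N=2 batch items, each of shape 3x4x5 */
        uint8_t buf[2 * 3 * 4 * 5];        /* backing store, 1-byte elements */
        uint8_t *data = buf;
        size_t stride = (size_t)dims[1] * dims[2] * dims[3] * elem_size_bytes(0);

        for (int n = 0; n < dims[0]; n++) {
            printf("batch item %d starts at offset %zu\n", n, (size_t)(data - buf));
            data += stride;                /* same pointer advance as in the patch */
        }
        return 0;
    }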