Diffstat (limited to 'python/openvino/runtime/common/models/src')
21 files changed, 4612 insertions, 0 deletions
diff --git a/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp b/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp new file mode 100644 index 0000000..b1e8285 --- /dev/null +++ b/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp @@ -0,0 +1,201 @@ +/* +// Copyright (C) 2021-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/associative_embedding_decoder.h" + +#include <algorithm> +#include <iterator> +#include <limits> +#include <numeric> +#include <vector> + +#include <utils/kuhn_munkres.hpp> + +void findPeaks(const std::vector<cv::Mat>& nmsHeatMaps, + const std::vector<cv::Mat>& aembdsMaps, + std::vector<std::vector<Peak>>& allPeaks, + size_t jointId, + size_t maxNumPeople, + float detectionThreshold) { + const cv::Mat& nmsHeatMap = nmsHeatMaps[jointId]; + const float* heatMapData = nmsHeatMap.ptr<float>(); + cv::Size outputSize = nmsHeatMap.size(); + + std::vector<int> indices(outputSize.area()); + std::iota(std::begin(indices), std::end(indices), 0); + std::partial_sort(std::begin(indices), + std::begin(indices) + maxNumPeople, + std::end(indices), + [heatMapData](int l, int r) { + return heatMapData[l] > heatMapData[r]; + }); + + for (size_t personId = 0; personId < maxNumPeople; personId++) { + int index = indices[personId]; + int x = index / outputSize.width; + int y = index % outputSize.width; + float tag = aembdsMaps[jointId].at<float>(x, y); + float score = heatMapData[index]; + allPeaks[jointId].reserve(maxNumPeople); + if (score > detectionThreshold) { + allPeaks[jointId].emplace_back(Peak{cv::Point2f(static_cast<float>(x), static_cast<float>(y)), score, tag}); + } + } +} + +std::vector<Pose> matchByTag(std::vector<std::vector<Peak>>& allPeaks, + size_t maxNumPeople, + size_t numJoints, + float tagThreshold) { + size_t jointOrder[]{0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16}; + std::vector<Pose> allPoses; + for (size_t jointId : jointOrder) { + std::vector<Peak>& jointPeaks = allPeaks[jointId]; + std::vector<float> tags; + for (auto& peak : jointPeaks) { + tags.push_back(peak.tag); + } + if (allPoses.empty()) { + for (size_t personId = 0; personId < jointPeaks.size(); personId++) { + Peak peak = jointPeaks[personId]; + Pose pose = Pose(numJoints); + pose.add(jointId, peak); + allPoses.push_back(pose); + } + continue; + } + if (jointPeaks.empty() || (allPoses.size() == maxNumPeople)) { + continue; + } + std::vector<float> posesTags; + std::vector<cv::Point2f> posesCenters; + for (auto& pose : allPoses) { + posesTags.push_back(pose.getPoseTag()); + posesCenters.push_back(pose.getPoseCenter()); + } + size_t numAdded = tags.size(); + size_t numGrouped = posesTags.size(); + cv::Mat tagsDiff(numAdded, numGrouped, CV_32F); + cv::Mat matchingCost(numAdded, numGrouped, CV_32F); + std::vector<float> dists(numAdded); + for (size_t j = 0; j < numGrouped; j++) { + float minDist = std::numeric_limits<float>::max(); + // Compute euclidean distance (in 
spatial space) between the pose center and all joints. + const cv::Point2f center = posesCenters.at(j); + for (size_t i = 0; i < numAdded; i++) { + cv::Point2f v = jointPeaks.at(i).keypoint - center; + float dist = std::sqrt(v.x * v.x + v.y * v.y); + dists[i] = dist; + minDist = std::min(dist, minDist); + } + // Compute semantic distance (in embedding space) between the pose tag and all joints + // and corresponding matching costs. + auto poseTag = posesTags[j]; + for (size_t i = 0; i < numAdded; i++) { + float diff = static_cast<float>(cv::norm(tags[i] - poseTag)); + tagsDiff.at<float>(i, j) = diff; + if (diff < tagThreshold) { + diff *= dists[i] / (minDist + 1e-10f); + } + matchingCost.at<float>(i, j) = std::round(diff) * 100 - jointPeaks[i].score; + } + } + + if (numAdded > numGrouped) { + cv::copyMakeBorder(matchingCost, + matchingCost, + 0, + 0, + 0, + numAdded - numGrouped, + cv::BORDER_CONSTANT, + 10000000); + } + // Get pairs + auto res = KuhnMunkres().Solve(matchingCost); + for (size_t row = 0; row < res.size(); row++) { + size_t col = res[row]; + if (row < numAdded && col < numGrouped && tagsDiff.at<float>(row, col) < tagThreshold) { + allPoses[col].add(jointId, jointPeaks[row]); + } else { + Pose pose = Pose(numJoints); + pose.add(jointId, jointPeaks[row]); + allPoses.push_back(pose); + } + } + } + return allPoses; +} + +namespace { +cv::Point2f adjustLocation(const int x, const int y, const cv::Mat& heatMap) { + cv::Point2f delta(0.f, 0.f); + int width = heatMap.cols; + int height = heatMap.rows; + if ((1 < x) && (x < width - 1) && (1 < y) && (y < height - 1)) { + auto diffX = heatMap.at<float>(y, x + 1) - heatMap.at<float>(y, x - 1); + auto diffY = heatMap.at<float>(y + 1, x) - heatMap.at<float>(y - 1, x); + delta.x = diffX > 0 ? 0.25f : -0.25f; + delta.y = diffY > 0 ? 0.25f : -0.25f; + } + return delta; +} +} // namespace + +void adjustAndRefine(std::vector<Pose>& allPoses, + const std::vector<cv::Mat>& heatMaps, + const std::vector<cv::Mat>& aembdsMaps, + int poseId, + const float delta) { + Pose& pose = allPoses[poseId]; + float poseTag = pose.getPoseTag(); + for (size_t jointId = 0; jointId < pose.size(); jointId++) { + Peak& peak = pose.getPeak(jointId); + const cv::Mat& heatMap = heatMaps[jointId]; + const cv::Mat& aembds = aembdsMaps[jointId]; + + if (peak.score > 0) { + // Adjust + int x = static_cast<int>(peak.keypoint.x); + int y = static_cast<int>(peak.keypoint.y); + peak.keypoint += adjustLocation(x, y, heatMap); + if (delta) { + peak.keypoint.x += delta; + peak.keypoint.y += delta; + } + } else { + // Refine + // Get position with the closest tag value to the pose tag + cv::Mat diff = cv::abs(aembds - poseTag); + diff.convertTo(diff, CV_32S, 1.0, 0.0); + diff.convertTo(diff, CV_32F); + diff -= heatMap; + double min; + cv::Point2i minLoc; + cv::minMaxLoc(diff, &min, 0, &minLoc); + int x = minLoc.x; + int y = minLoc.y; + float val = heatMap.at<float>(y, x); + if (val > 0) { + peak.keypoint.x = static_cast<float>(x); + peak.keypoint.y = static_cast<float>(y); + peak.keypoint += adjustLocation(x, y, heatMap); + // Peak score is assigned directly, so it does not affect the pose score. 
+ peak.score = val; + } + } + } +} diff --git a/python/openvino/runtime/common/models/src/classification_model.cpp b/python/openvino/runtime/common/models/src/classification_model.cpp new file mode 100644 index 0000000..90bc0d5 --- /dev/null +++ b/python/openvino/runtime/common/models/src/classification_model.cpp @@ -0,0 +1,196 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/classification_model.h" + +#include <algorithm> +#include <fstream> +#include <iterator> +#include <map> +#include <stdexcept> +#include <string> +#include <utility> +#include <vector> + +#include <openvino/op/softmax.hpp> +#include <openvino/op/topk.hpp> +#include <openvino/openvino.hpp> + +#include <utils/slog.hpp> + +#include "models/results.h" + +ClassificationModel::ClassificationModel(const std::string& modelFileName, + size_t nTop, + bool useAutoResize, + const std::vector<std::string>& labels, + const std::string& layout) + : ImageModel(modelFileName, useAutoResize, layout), + nTop(nTop), + labels(labels) {} + +std::unique_ptr<ResultBase> ClassificationModel::postprocess(InferenceResult& infResult) { + const ov::Tensor& indicesTensor = infResult.outputsData.find(outputsNames[0])->second; + const int* indicesPtr = indicesTensor.data<int>(); + const ov::Tensor& scoresTensor = infResult.outputsData.find(outputsNames[1])->second; + const float* scoresPtr = scoresTensor.data<float>(); + + ClassificationResult* result = new ClassificationResult(infResult.frameId, infResult.metaData); + auto retVal = std::unique_ptr<ResultBase>(result); + + result->topLabels.reserve(scoresTensor.get_size()); + for (size_t i = 0; i < scoresTensor.get_size(); ++i) { + int ind = indicesPtr[i]; + if (ind < 0 || ind >= static_cast<int>(labels.size())) { + throw std::runtime_error("Invalid index for the class label is found during postprocessing"); + } + result->topLabels.emplace_back(ind, labels[ind], scoresPtr[i]); + } + + return retVal; +} + +std::vector<std::string> ClassificationModel::loadLabels(const std::string& labelFilename) { + std::vector<std::string> labels; + + /* Read labels */ + std::ifstream inputFile(labelFilename); + if (!inputFile.is_open()) + throw std::runtime_error("Can't open the labels file: " + labelFilename); + std::string labelsLine; + while (std::getline(inputFile, labelsLine)) { + size_t labelBeginIdx = labelsLine.find(' '); + size_t labelEndIdx = labelsLine.find(','); // can be npos when class has only one label + if (labelBeginIdx == std::string::npos) { + throw std::runtime_error("The labels file has incorrect format."); + } + labels.push_back(labelsLine.substr(labelBeginIdx + 1, labelEndIdx - (labelBeginIdx + 1))); + } + if (labels.empty()) + throw std::logic_error("File is empty: " + labelFilename); + + return labels; +} + +void ClassificationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // 
--------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("Classification model wrapper supports topologies with only 1 input"); + } + const auto& input = model->input(); + inputsNames.push_back(input.get_any_name()); + + const ov::Shape& inputShape = input.get_shape(); + const ov::Layout& inputLayout = getInputLayout(input); + + if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("3-channel 4-dimensional model's input is expected"); + } + + const auto width = inputShape[ov::layout::width_idx(inputLayout)]; + const auto height = inputShape[ov::layout::height_idx(inputLayout)]; + if (height != width) { + throw std::logic_error("Model input has incorrect image shape. Must be NxN square." + " Got " + + std::to_string(height) + "x" + std::to_string(width) + "."); + } + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); + + if (useAutoResize) { + ppp.input().tensor().set_spatial_dynamic_shape(); + + ppp.input() + .preprocess() + .convert_element_type(ov::element::f32) + .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); + } + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Prepare output ----------------------------------------------------- + if (model->outputs().size() != 1) { + throw std::logic_error("Classification model wrapper supports topologies with only 1 output"); + } + + const ov::Shape& outputShape = model->output().get_shape(); + if (outputShape.size() != 2 && outputShape.size() != 4) { + throw std::logic_error("Classification model wrapper supports topologies only with" + " 2-dimensional or 4-dimensional output"); + } + + const ov::Layout outputLayout("NCHW"); + if (outputShape.size() == 4 && (outputShape[ov::layout::height_idx(outputLayout)] != 1 || + outputShape[ov::layout::width_idx(outputLayout)] != 1)) { + throw std::logic_error("Classification model wrapper supports topologies only" + " with 4-dimensional output which has last two dimensions of size 1"); + } + + size_t classesNum = outputShape[ov::layout::channels_idx(outputLayout)]; + if (nTop > classesNum) { + throw std::logic_error("The model provides " + std::to_string(classesNum) + " classes, but " + + std::to_string(nTop) + " labels are requested to be predicted"); + } + if (classesNum == labels.size() + 1) { + labels.insert(labels.begin(), "other"); + slog::warn << "Inserted 'other' label as first." 
<< slog::endl; + } else if (classesNum != labels.size()) { + throw std::logic_error("Model's number of classes and parsed labels must match (" + + std::to_string(outputShape[1]) + " and " + std::to_string(labels.size()) + ')'); + } + + ppp.output().tensor().set_element_type(ov::element::f32); + model = ppp.build(); + + // --------------------------- Adding softmax and topK output --------------------------- + auto nodes = model->get_ops(); + auto softmaxNodeIt = std::find_if(std::begin(nodes), std::end(nodes), [](const std::shared_ptr<ov::Node>& op) { + return std::string(op->get_type_name()) == "Softmax"; + }); + + std::shared_ptr<ov::Node> softmaxNode; + if (softmaxNodeIt == nodes.end()) { + auto logitsNode = model->get_output_op(0)->input(0).get_source_output().get_node(); + softmaxNode = std::make_shared<ov::op::v1::Softmax>(logitsNode->output(0), 1); + } else { + softmaxNode = *softmaxNodeIt; + } + const auto k = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<size_t>{nTop}); + std::shared_ptr<ov::Node> topkNode = std::make_shared<ov::op::v3::TopK>(softmaxNode, + k, + 1, + ov::op::v3::TopK::Mode::MAX, + ov::op::v3::TopK::SortType::SORT_VALUES); + + auto indices = std::make_shared<ov::op::v0::Result>(topkNode->output(0)); + auto scores = std::make_shared<ov::op::v0::Result>(topkNode->output(1)); + ov::ResultVector res({scores, indices}); + model = std::make_shared<ov::Model>(res, model->get_parameters(), "classification"); + + // manually set output tensors name for created topK node + model->outputs()[0].set_names({"indices"}); + outputsNames.push_back("indices"); + model->outputs()[1].set_names({"scores"}); + outputsNames.push_back("scores"); + + // set output precisions + ppp = ov::preprocess::PrePostProcessor(model); + ppp.output("indices").tensor().set_element_type(ov::element::i32); + ppp.output("scores").tensor().set_element_type(ov::element::f32); + model = ppp.build(); +} diff --git a/python/openvino/runtime/common/models/src/deblurring_model.cpp b/python/openvino/runtime/common/models/src/deblurring_model.cpp new file mode 100644 index 0000000..261efb3 --- /dev/null +++ b/python/openvino/runtime/common/models/src/deblurring_model.cpp @@ -0,0 +1,158 @@ +/* +// Copyright (C) 2021-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/deblurring_model.h" + +#include <algorithm> +#include <stdexcept> +#include <string> +#include <vector> + +#include <opencv2/imgproc.hpp> +#include <openvino/openvino.hpp> + +#include <utils/ocv_common.hpp> +#include <utils/slog.hpp> + +#include "models/input_data.h" +#include "models/internal_model_data.h" +#include "models/results.h" + +DeblurringModel::DeblurringModel(const std::string& modelFileName, + const cv::Size& inputImgSize, + const std::string& layout) + : ImageModel(modelFileName, false, layout) { + netInputHeight = inputImgSize.height; + netInputWidth = inputImgSize.width; +} + +void DeblurringModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("Deblurring model wrapper supports topologies with only 1 input"); + } + + inputsNames.push_back(model->input().get_any_name()); + + const ov::Shape& inputShape = model->input().get_shape(); + const ov::Layout& inputLayout = getInputLayout(model->input()); + + if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || + inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("3-channel 4-dimensional model's input is expected"); + } + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC"); + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Prepare output ----------------------------------------------------- + if (model->outputs().size() != 1) { + throw std::logic_error("Deblurring model wrapper supports topologies with only 1 output"); + } + + outputsNames.push_back(model->output().get_any_name()); + + const ov::Shape& outputShape = model->output().get_shape(); + const ov::Layout outputLayout("NCHW"); + if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 || + outputShape[ov::layout::channels_idx(outputLayout)] != 3) { + throw std::logic_error("3-channel 4-dimensional model's output is expected"); + } + + ppp.output().tensor().set_element_type(ov::element::f32); + model = ppp.build(); + + changeInputSize(model); +} + +void DeblurringModel::changeInputSize(std::shared_ptr<ov::Model>& model) { + const ov::Layout& layout = ov::layout::get_layout(model->input()); + ov::Shape inputShape = model->input().get_shape(); + + const auto batchId = ov::layout::batch_idx(layout); + const auto heightId = ov::layout::height_idx(layout); + const auto widthId = ov::layout::width_idx(layout); + + if (inputShape[heightId] % stride || inputShape[widthId] % stride) { + throw std::logic_error("Model input shape HxW = " + std::to_string(inputShape[heightId]) + "x" + + std::to_string(inputShape[widthId]) + "must be divisible by stride " + + std::to_string(stride)); + } + + netInputHeight = static_cast<int>((netInputHeight + stride - 1) / stride) * stride; + netInputWidth = static_cast<int>((netInputWidth + stride - 1) / stride) * stride; + + inputShape[batchId] = 1; + inputShape[heightId] = netInputHeight; + inputShape[widthId] = netInputWidth; + + model->reshape(inputShape); +} + +std::shared_ptr<InternalModelData> DeblurringModel::preprocess(const InputData& inputData, ov::InferRequest& request) { + auto& image = inputData.asRef<ImageInputData>().inputImage; + size_t h = 
image.rows; + size_t w = image.cols; + cv::Mat resizedImage; + + if (netInputHeight - stride < h && h <= netInputHeight && netInputWidth - stride < w && w <= netInputWidth) { + int bottom = netInputHeight - h; + int right = netInputWidth - w; + cv::copyMakeBorder(image, resizedImage, 0, bottom, 0, right, cv::BORDER_CONSTANT, 0); + } else { + slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; + cv::resize(image, resizedImage, cv::Size(netInputWidth, netInputHeight)); + } + request.set_input_tensor(wrapMat2Tensor(resizedImage)); + + return std::make_shared<InternalImageModelData>(image.cols, image.rows); +} + +std::unique_ptr<ResultBase> DeblurringModel::postprocess(InferenceResult& infResult) { + ImageResult* result = new ImageResult; + *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult); + + const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>(); + const auto outputData = infResult.getFirstOutputTensor().data<float>(); + + std::vector<cv::Mat> imgPlanes; + const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape(); + const ov::Layout outputLayout("NCHW"); + size_t outHeight = static_cast<int>((outputShape[ov::layout::height_idx(outputLayout)])); + size_t outWidth = static_cast<int>((outputShape[ov::layout::width_idx(outputLayout)])); + size_t numOfPixels = outWidth * outHeight; + imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])), + cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])), + cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))}; + cv::Mat resultImg; + cv::merge(imgPlanes, resultImg); + + if (netInputHeight - stride < static_cast<size_t>(inputImgSize.inputImgHeight) && + static_cast<size_t>(inputImgSize.inputImgHeight) <= netInputHeight && + netInputWidth - stride < static_cast<size_t>(inputImgSize.inputImgWidth) && + static_cast<size_t>(inputImgSize.inputImgWidth) <= netInputWidth) { + result->resultImage = resultImg(cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); + } else { + cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); + } + + result->resultImage.convertTo(result->resultImage, CV_8UC3, 255); + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/detection_model.cpp b/python/openvino/runtime/common/models/src/detection_model.cpp new file mode 100644 index 0000000..83e2d22 --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model.cpp @@ -0,0 +1,52 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/detection_model.h" + +#include <fstream> +#include <stdexcept> +#include <string> +#include <vector> + +#include "models/image_model.h" + +DetectionModel::DetectionModel(const std::string& modelFileName, + float confidenceThreshold, + bool useAutoResize, + const std::vector<std::string>& labels, + const std::string& layout) + : ImageModel(modelFileName, useAutoResize, layout), + confidenceThreshold(confidenceThreshold), + labels(labels) {} + +std::vector<std::string> DetectionModel::loadLabels(const std::string& labelFilename) { + std::vector<std::string> labelsList; + + /* Read labels (if any) */ + if (!labelFilename.empty()) { + std::ifstream inputFile(labelFilename); + if (!inputFile.is_open()) + throw std::runtime_error("Can't open the labels file: " + labelFilename); + std::string label; + while (std::getline(inputFile, label)) { + labelsList.push_back(label); + } + if (labelsList.empty()) + throw std::logic_error("File is empty: " + labelFilename); + } + + return labelsList; +} diff --git a/python/openvino/runtime/common/models/src/detection_model_centernet.cpp b/python/openvino/runtime/common/models/src/detection_model_centernet.cpp new file mode 100644 index 0000000..eac42a7 --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model_centernet.cpp @@ -0,0 +1,302 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/detection_model_centernet.h" + +#include <stddef.h> + +#include <algorithm> +#include <cmath> +#include <map> +#include <stdexcept> +#include <utility> + +#include <opencv2/core.hpp> +#include <opencv2/imgproc.hpp> +#include <openvino/openvino.hpp> + +#include <utils/common.hpp> +#include <utils/image_utils.h> +#include <utils/ocv_common.hpp> + +#include "models/input_data.h" +#include "models/internal_model_data.h" +#include "models/results.h" + +ModelCenterNet::ModelCenterNet(const std::string& modelFileName, + float confidenceThreshold, + const std::vector<std::string>& labels, + const std::string& layout) + : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout) {} + +void ModelCenterNet::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("CenterNet model wrapper expects models that have only 1 input"); + } + + const ov::Shape& inputShape = model->input().get_shape(); + const ov::Layout& inputLayout = getInputLayout(model->input()); + + if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("Expected 3-channel input"); + } + + ov::preprocess::PrePostProcessor ppp(model); + inputTransform.setPrecision(ppp, model->input().get_any_name()); + ppp.input().tensor().set_layout("NHWC"); + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Reading image input parameters ------------------------------------------- + inputsNames.push_back(model->input().get_any_name()); + netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; + netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; + + // --------------------------- Prepare output ----------------------------------------------------- + if (model->outputs().size() != 3) { + throw std::logic_error("CenterNet model wrapper expects models that have 3 outputs"); + } + + const ov::Layout outLayout{"NCHW"}; + for (const auto& output : model->outputs()) { + auto outTensorName = output.get_any_name(); + outputsNames.push_back(outTensorName); + ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outLayout); + } + std::sort(outputsNames.begin(), outputsNames.end()); + model = ppp.build(); +} + +cv::Point2f getDir(const cv::Point2f& srcPoint, float rotRadius) { + float sn = sinf(rotRadius); + float cs = cosf(rotRadius); + + cv::Point2f srcResult(0.0f, 0.0f); + srcResult.x = srcPoint.x * cs - srcPoint.y * sn; + srcResult.y = srcPoint.x * sn + srcPoint.y * cs; + + return srcResult; +} + +cv::Point2f get3rdPoint(const cv::Point2f& a, const cv::Point2f& b) { + cv::Point2f direct = a - b; + return b + cv::Point2f(-direct.y, direct.x); +} + +cv::Mat getAffineTransform(float centerX, + float centerY, + int srcW, + float rot, + size_t outputWidth, + size_t outputHeight, + bool inv = false) { + float rotRad = static_cast<float>(CV_PI) * rot / 180.0f; + auto srcDir = getDir({0.0f, -0.5f * srcW}, rotRad); + cv::Point2f dstDir(0.0f, -0.5f * outputWidth); + std::vector<cv::Point2f> src(3, {0.0f, 0.0f}); + std::vector<cv::Point2f> dst(3, {0.0f, 0.0f}); + + src[0] = {centerX, centerY}; + src[1] = srcDir + src[0]; + src[2] = get3rdPoint(src[0], src[1]); + + dst[0] = {outputWidth * 0.5f, outputHeight * 0.5f}; + dst[1] = dst[0] + dstDir; + dst[2] = 
get3rdPoint(dst[0], dst[1]); + + cv::Mat trans; + if (inv) { + trans = cv::getAffineTransform(dst, src); + } else { + trans = cv::getAffineTransform(src, dst); + } + + return trans; +} + +std::shared_ptr<InternalModelData> ModelCenterNet::preprocess(const InputData& inputData, ov::InferRequest& request) { + auto& img = inputData.asRef<ImageInputData>().inputImage; + const auto& resizedImg = resizeImageExt(img, netInputWidth, netInputHeight, RESIZE_KEEP_ASPECT_LETTERBOX); + + request.set_input_tensor(wrapMat2Tensor(inputTransform(resizedImg))); + return std::make_shared<InternalImageModelData>(img.cols, img.rows); +} + +std::vector<std::pair<size_t, float>> nms(float* scoresPtr, const ov::Shape& shape, float threshold, int kernel = 3) { + std::vector<std::pair<size_t, float>> scores; + scores.reserve(ModelCenterNet::INIT_VECTOR_SIZE); + auto chSize = shape[2] * shape[3]; + + for (size_t i = 0; i < shape[1] * shape[2] * shape[3]; ++i) { + scoresPtr[i] = expf(scoresPtr[i]) / (1 + expf(scoresPtr[i])); + } + + for (size_t ch = 0; ch < shape[1]; ++ch) { + for (size_t w = 0; w < shape[2]; ++w) { + for (size_t h = 0; h < shape[3]; ++h) { + float max = scoresPtr[chSize * ch + shape[2] * w + h]; + + // --------------------- filter on threshold-------------------------------------- + if (max < threshold) { + continue; + } + + // --------------------- store index and score------------------------------------ + scores.push_back({chSize * ch + shape[2] * w + h, max}); + + bool next = true; + // ---------------------- maxpool2d ----------------------------------------------- + for (int i = -kernel / 2; i < kernel / 2 + 1 && next; ++i) { + for (int j = -kernel / 2; j < kernel / 2 + 1; ++j) { + if (w + i >= 0 && w + i < shape[2] && h + j >= 0 && h + j < shape[3]) { + if (scoresPtr[chSize * ch + shape[2] * (w + i) + h + j] > max) { + scores.pop_back(); + next = false; + break; + } + } else { + if (max < 0) { + scores.pop_back(); + next = false; + break; + } + } + } + } + } + } + } + + return scores; +} + +static std::vector<std::pair<size_t, float>> filterScores(const ov::Tensor& scoresTensor, float threshold) { + auto shape = scoresTensor.get_shape(); + float* scoresPtr = scoresTensor.data<float>(); + + return nms(scoresPtr, shape, threshold); +} + +std::vector<std::pair<float, float>> filterReg(const ov::Tensor& regressionTensor, + const std::vector<std::pair<size_t, float>>& scores, + size_t chSize) { + const float* regPtr = regressionTensor.data<float>(); + std::vector<std::pair<float, float>> reg; + + for (auto s : scores) { + reg.push_back({regPtr[s.first % chSize], regPtr[chSize + s.first % chSize]}); + } + + return reg; +} + +std::vector<std::pair<float, float>> filterWH(const ov::Tensor& whTensor, + const std::vector<std::pair<size_t, float>>& scores, + size_t chSize) { + const float* whPtr = whTensor.data<float>(); + std::vector<std::pair<float, float>> wh; + + for (auto s : scores) { + wh.push_back({whPtr[s.first % chSize], whPtr[chSize + s.first % chSize]}); + } + + return wh; +} + +std::vector<ModelCenterNet::BBox> calcBoxes(const std::vector<std::pair<size_t, float>>& scores, + const std::vector<std::pair<float, float>>& reg, + const std::vector<std::pair<float, float>>& wh, + const ov::Shape& shape) { + std::vector<ModelCenterNet::BBox> boxes(scores.size()); + + for (size_t i = 0; i < boxes.size(); ++i) { + size_t chIdx = scores[i].first % (shape[2] * shape[3]); + auto xCenter = chIdx % shape[3]; + auto yCenter = chIdx / shape[3]; + + boxes[i].left = xCenter + reg[i].first - wh[i].first / 
2.0f; + boxes[i].top = yCenter + reg[i].second - wh[i].second / 2.0f; + boxes[i].right = xCenter + reg[i].first + wh[i].first / 2.0f; + boxes[i].bottom = yCenter + reg[i].second + wh[i].second / 2.0f; + } + + return boxes; +} + +void transform(std::vector<ModelCenterNet::BBox>& boxes, + const ov::Shape& shape, + int scale, + float centerX, + float centerY) { + cv::Mat1f trans = getAffineTransform(centerX, centerY, scale, 0, shape[2], shape[3], true); + + for (auto& b : boxes) { + ModelCenterNet::BBox newbb; + + newbb.left = trans.at<float>(0, 0) * b.left + trans.at<float>(0, 1) * b.top + trans.at<float>(0, 2); + newbb.top = trans.at<float>(1, 0) * b.left + trans.at<float>(1, 1) * b.top + trans.at<float>(1, 2); + newbb.right = trans.at<float>(0, 0) * b.right + trans.at<float>(0, 1) * b.bottom + trans.at<float>(0, 2); + newbb.bottom = trans.at<float>(1, 0) * b.right + trans.at<float>(1, 1) * b.bottom + trans.at<float>(1, 2); + + b = newbb; + } +} + +std::unique_ptr<ResultBase> ModelCenterNet::postprocess(InferenceResult& infResult) { + // --------------------------- Filter data and get valid indices --------------------------------- + const auto& heatmapTensor = infResult.outputsData[outputsNames[0]]; + const auto& heatmapTensorShape = heatmapTensor.get_shape(); + const auto chSize = heatmapTensorShape[2] * heatmapTensorShape[3]; + const auto scores = filterScores(heatmapTensor, confidenceThreshold); + + const auto& regressionTensor = infResult.outputsData[outputsNames[1]]; + const auto reg = filterReg(regressionTensor, scores, chSize); + + const auto& whTensor = infResult.outputsData[outputsNames[2]]; + const auto wh = filterWH(whTensor, scores, chSize); + + // --------------------------- Calculate bounding boxes & apply inverse affine transform ---------- + auto boxes = calcBoxes(scores, reg, wh, heatmapTensorShape); + + const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth; + const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight; + const auto scale = std::max(imgWidth, imgHeight); + const float centerX = imgWidth / 2.0f; + const float centerY = imgHeight / 2.0f; + + transform(boxes, heatmapTensorShape, scale, centerX, centerY); + + // --------------------------- Create detection result objects ------------------------------------ + DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); + + result->objects.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + DetectedObject desc; + desc.confidence = scores[i].second; + desc.labelID = scores[i].first / chSize; + desc.label = getLabelName(desc.labelID); + desc.x = clamp(boxes[i].left, 0.f, static_cast<float>(imgWidth)); + desc.y = clamp(boxes[i].top, 0.f, static_cast<float>(imgHeight)); + desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast<float>(imgWidth)); + desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast<float>(imgHeight)); + + result->objects.push_back(desc); + } + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp b/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp new file mode 100644 index 0000000..bb349a6 --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp @@ -0,0 +1,261 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/detection_model_faceboxes.h" + +#include <algorithm> +#include <cmath> +#include <map> +#include <stdexcept> + +#include <openvino/openvino.hpp> + +#include <utils/common.hpp> +#include <utils/nms.hpp> +#include <utils/ocv_common.hpp> + +#include "models/internal_model_data.h" +#include "models/results.h" + +ModelFaceBoxes::ModelFaceBoxes(const std::string& modelFileName, + float confidenceThreshold, + bool useAutoResize, + float boxIOUThreshold, + const std::string& layout) + : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout), + maxProposalsCount(0), + boxIOUThreshold(boxIOUThreshold), + variance({0.1f, 0.2f}), + steps({32, 64, 128}), + minSizes({{32, 64, 128}, {256}, {512}}) {} + +void ModelFaceBoxes::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("FaceBoxes model wrapper expects models that have only 1 input"); + } + + const ov::Shape& inputShape = model->input().get_shape(); + const ov::Layout& inputLayout = getInputLayout(model->input()); + + if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("Expected 3-channel input"); + } + + ov::preprocess::PrePostProcessor ppp(model); + inputTransform.setPrecision(ppp, model->input().get_any_name()); + ppp.input().tensor().set_layout({"NHWC"}); + + if (useAutoResize) { + ppp.input().tensor().set_spatial_dynamic_shape(); + + ppp.input() + .preprocess() + .convert_element_type(ov::element::f32) + .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); + } + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Reading image input parameters ------------------------------------------- + inputsNames.push_back(model->input().get_any_name()); + netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; + netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; + + // --------------------------- Prepare output ----------------------------------------------------- + if (model->outputs().size() != 2) { + throw std::logic_error("FaceBoxes model wrapper expects models that have 2 outputs"); + } + + const ov::Layout outputLayout{"CHW"}; + maxProposalsCount = model->outputs().front().get_shape()[ov::layout::height_idx(outputLayout)]; + for (const auto& output : model->outputs()) { + const auto outTensorName = output.get_any_name(); + outputsNames.push_back(outTensorName); + ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout); + } + std::sort(outputsNames.begin(), outputsNames.end()); + model = ppp.build(); + + // --------------------------- Calculating anchors ---------------------------------------------------- + std::vector<std::pair<size_t, size_t>> featureMaps; + for (auto s : steps) { + featureMaps.push_back({netInputHeight / s, netInputWidth / s}); + } 
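+    // Precompute prior anchors for every feature-map level (one per stride) before inference starts.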
+ + priorBoxes(featureMaps); +} + +void calculateAnchors(std::vector<Anchor>& anchors, + const std::vector<float>& vx, + const std::vector<float>& vy, + const int minSize, + const int step) { + float skx = static_cast<float>(minSize); + float sky = static_cast<float>(minSize); + + std::vector<float> dense_cx, dense_cy; + + for (auto x : vx) { + dense_cx.push_back(x * step); + } + + for (auto y : vy) { + dense_cy.push_back(y * step); + } + + for (auto cy : dense_cy) { + for (auto cx : dense_cx) { + anchors.push_back( + {cx - 0.5f * skx, cy - 0.5f * sky, cx + 0.5f * skx, cy + 0.5f * sky}); // left top right bottom + } + } +} + +void calculateAnchorsZeroLevel(std::vector<Anchor>& anchors, + const int fx, + const int fy, + const std::vector<int>& minSizes, + const int step) { + for (auto s : minSizes) { + std::vector<float> vx, vy; + if (s == 32) { + vx.push_back(static_cast<float>(fx)); + vx.push_back(fx + 0.25f); + vx.push_back(fx + 0.5f); + vx.push_back(fx + 0.75f); + + vy.push_back(static_cast<float>(fy)); + vy.push_back(fy + 0.25f); + vy.push_back(fy + 0.5f); + vy.push_back(fy + 0.75f); + } else if (s == 64) { + vx.push_back(static_cast<float>(fx)); + vx.push_back(fx + 0.5f); + + vy.push_back(static_cast<float>(fy)); + vy.push_back(fy + 0.5f); + } else { + vx.push_back(fx + 0.5f); + vy.push_back(fy + 0.5f); + } + calculateAnchors(anchors, vx, vy, s, step); + } +} + +void ModelFaceBoxes::priorBoxes(const std::vector<std::pair<size_t, size_t>>& featureMaps) { + anchors.reserve(maxProposalsCount); + + for (size_t k = 0; k < featureMaps.size(); ++k) { + std::vector<float> a; + for (size_t i = 0; i < featureMaps[k].first; ++i) { + for (size_t j = 0; j < featureMaps[k].second; ++j) { + if (k == 0) { + calculateAnchorsZeroLevel(anchors, j, i, minSizes[k], steps[k]); + } else { + calculateAnchors(anchors, {j + 0.5f}, {i + 0.5f}, minSizes[k][0], steps[k]); + } + } + } + } +} + +std::pair<std::vector<size_t>, std::vector<float>> filterScores(const ov::Tensor& scoresTensor, + const float confidenceThreshold) { + auto shape = scoresTensor.get_shape(); + const float* scoresPtr = scoresTensor.data<float>(); + + std::vector<size_t> indices; + std::vector<float> scores; + scores.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE); + indices.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE); + for (size_t i = 1; i < shape[1] * shape[2]; i = i + 2) { + if (scoresPtr[i] > confidenceThreshold) { + indices.push_back(i / 2); + scores.push_back(scoresPtr[i]); + } + } + + return {indices, scores}; +} + +std::vector<Anchor> filterBoxes(const ov::Tensor& boxesTensor, + const std::vector<Anchor>& anchors, + const std::vector<size_t>& validIndices, + const std::vector<float>& variance) { + auto shape = boxesTensor.get_shape(); + const float* boxesPtr = boxesTensor.data<float>(); + + std::vector<Anchor> boxes; + boxes.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE); + for (auto i : validIndices) { + auto objStart = shape[2] * i; + + auto dx = boxesPtr[objStart]; + auto dy = boxesPtr[objStart + 1]; + auto dw = boxesPtr[objStart + 2]; + auto dh = boxesPtr[objStart + 3]; + + auto predCtrX = dx * variance[0] * anchors[i].getWidth() + anchors[i].getXCenter(); + auto predCtrY = dy * variance[0] * anchors[i].getHeight() + anchors[i].getYCenter(); + auto predW = exp(dw * variance[1]) * anchors[i].getWidth(); + auto predH = exp(dh * variance[1]) * anchors[i].getHeight(); + + boxes.push_back({static_cast<float>(predCtrX - 0.5f * predW), + static_cast<float>(predCtrY - 0.5f * predH), + static_cast<float>(predCtrX + 0.5f * predW), + 
static_cast<float>(predCtrY + 0.5f * predH)}); + } + + return boxes; +} + +std::unique_ptr<ResultBase> ModelFaceBoxes::postprocess(InferenceResult& infResult) { + // Filter scores and get valid indices for bounding boxes + const auto scoresTensor = infResult.outputsData[outputsNames[1]]; + const auto scores = filterScores(scoresTensor, confidenceThreshold); + + // Filter bounding boxes on indices + auto boxesTensor = infResult.outputsData[outputsNames[0]]; + std::vector<Anchor> boxes = filterBoxes(boxesTensor, anchors, scores.first, variance); + + // Apply Non-maximum Suppression + const std::vector<int> keep = nms(boxes, scores.second, boxIOUThreshold); + + // Create detection result objects + DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); + const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth; + const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight; + const float scaleX = static_cast<float>(netInputWidth) / imgWidth; + const float scaleY = static_cast<float>(netInputHeight) / imgHeight; + + result->objects.reserve(keep.size()); + for (auto i : keep) { + DetectedObject desc; + desc.confidence = scores.second[i]; + desc.x = clamp(boxes[i].left / scaleX, 0.f, static_cast<float>(imgWidth)); + desc.y = clamp(boxes[i].top / scaleY, 0.f, static_cast<float>(imgHeight)); + desc.width = clamp(boxes[i].getWidth() / scaleX, 0.f, static_cast<float>(imgWidth)); + desc.height = clamp(boxes[i].getHeight() / scaleY, 0.f, static_cast<float>(imgHeight)); + desc.labelID = 0; + desc.label = labels[0]; + + result->objects.push_back(desc); + } + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp b/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp new file mode 100644 index 0000000..8835725 --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp @@ -0,0 +1,394 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/detection_model_retinaface.h" + +#include <stddef.h> + +#include <algorithm> +#include <cmath> +#include <stdexcept> + +#include <opencv2/core.hpp> +#include <openvino/openvino.hpp> + +#include <utils/common.hpp> +#include <utils/nms.hpp> + +#include "models/internal_model_data.h" +#include "models/results.h" + +ModelRetinaFace::ModelRetinaFace(const std::string& modelFileName, + float confidenceThreshold, + bool useAutoResize, + float boxIOUThreshold, + const std::string& layout) + : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout), // Default label is "Face" + shouldDetectMasks(false), + shouldDetectLandmarks(false), + boxIOUThreshold(boxIOUThreshold), + maskThreshold(0.8f), + landmarkStd(1.0f), + anchorCfg({{32, {32, 16}, 16, {1}}, {16, {8, 4}, 16, {1}}, {8, {2, 1}, 16, {1}}}) { + generateAnchorsFpn(); +} + +void ModelRetinaFace::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("RetinaFace model wrapper expects models that have only 1 input"); + } + const ov::Shape& inputShape = model->input().get_shape(); + const ov::Layout& inputLayout = getInputLayout(model->input()); + + if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("Expected 3-channel input"); + } + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); + + if (useAutoResize) { + ppp.input().tensor().set_spatial_dynamic_shape(); + + ppp.input() + .preprocess() + .convert_element_type(ov::element::f32) + .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); + } + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Reading image input parameters ------------------------------------------- + inputsNames.push_back(model->input().get_any_name()); + netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; + netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; + + // --------------------------- Prepare output ----------------------------------------------------- + + const ov::OutputVector& outputs = model->outputs(); + if (outputs.size() != 6 && outputs.size() != 9 && outputs.size() != 12) { + throw std::logic_error("RetinaFace model wrapper expects models that have 6, 9 or 12 outputs"); + } + + const ov::Layout outputLayout{"NCHW"}; + std::vector<size_t> outputsSizes[OUT_MAX]; + for (const auto& output : model->outputs()) { + auto outTensorName = output.get_any_name(); + outputsNames.push_back(outTensorName); + ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout); + + OutputType type = OUT_MAX; + if (outTensorName.find("box") != std::string::npos) { + type = OUT_BOXES; + } else if (outTensorName.find("cls") != std::string::npos) { + type = OUT_SCORES; + } else if (outTensorName.find("landmark") != std::string::npos) { + type = OUT_LANDMARKS; + shouldDetectLandmarks = true; + } else if (outTensorName.find("type") != std::string::npos) { + type = OUT_MASKSCORES; + labels.clear(); + labels.push_back("No Mask"); + labels.push_back("Mask"); + shouldDetectMasks = true; + landmarkStd = 0.2f; + } else { + continue; + } + + size_t num = output.get_shape()[ov::layout::height_idx(outputLayout)]; + size_t i = 0; 
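+            // Insertion sort: keep each output type's tensor names ordered by ascending spatial size.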
+ for (; i < outputsSizes[type].size(); ++i) { + if (num < outputsSizes[type][i]) { + break; + } + } + separateOutputsNames[type].insert(separateOutputsNames[type].begin() + i, outTensorName); + outputsSizes[type].insert(outputsSizes[type].begin() + i, num); + } + model = ppp.build(); + + for (size_t idx = 0; idx < outputsSizes[OUT_BOXES].size(); ++idx) { + size_t width = outputsSizes[OUT_BOXES][idx]; + size_t height = outputsSizes[OUT_BOXES][idx]; + auto s = anchorCfg[idx].stride; + auto anchorNum = anchorsFpn[s].size(); + + anchors.push_back(std::vector<Anchor>(height * width * anchorNum)); + for (size_t iw = 0; iw < width; ++iw) { + size_t sw = iw * s; + for (size_t ih = 0; ih < height; ++ih) { + size_t sh = ih * s; + for (size_t k = 0; k < anchorNum; ++k) { + Anchor& anc = anchors[idx][(ih * width + iw) * anchorNum + k]; + anc.left = anchorsFpn[s][k].left + sw; + anc.top = anchorsFpn[s][k].top + sh; + anc.right = anchorsFpn[s][k].right + sw; + anc.bottom = anchorsFpn[s][k].bottom + sh; + } + } + } + } +} + +std::vector<Anchor> ratioEnum(const Anchor& anchor, const std::vector<int>& ratios) { + std::vector<Anchor> retVal; + const auto w = anchor.getWidth(); + const auto h = anchor.getHeight(); + const auto xCtr = anchor.getXCenter(); + const auto yCtr = anchor.getYCenter(); + + for (const auto ratio : ratios) { + const auto size = w * h; + const auto sizeRatio = static_cast<float>(size) / ratio; + const auto ws = sqrt(sizeRatio); + const auto hs = ws * ratio; + retVal.push_back({static_cast<float>(xCtr - 0.5f * (ws - 1.0f)), + static_cast<float>(yCtr - 0.5f * (hs - 1.0f)), + static_cast<float>(xCtr + 0.5f * (ws - 1.0f)), + static_cast<float>(yCtr + 0.5f * (hs - 1.0f))}); + } + return retVal; +} + +std::vector<Anchor> scaleEnum(const Anchor& anchor, const std::vector<int>& scales) { + std::vector<Anchor> retVal; + const auto w = anchor.getWidth(); + const auto h = anchor.getHeight(); + const auto xCtr = anchor.getXCenter(); + const auto yCtr = anchor.getYCenter(); + + for (auto scale : scales) { + const auto ws = w * scale; + const auto hs = h * scale; + retVal.push_back({static_cast<float>(xCtr - 0.5f * (ws - 1.0f)), + static_cast<float>(yCtr - 0.5f * (hs - 1.0f)), + static_cast<float>(xCtr + 0.5f * (ws - 1.0f)), + static_cast<float>(yCtr + 0.5f * (hs - 1.0f))}); + } + return retVal; +} + +std::vector<Anchor> generateAnchors(const int baseSize, + const std::vector<int>& ratios, + const std::vector<int>& scales) { + Anchor baseAnchor{0.0f, 0.0f, baseSize - 1.0f, baseSize - 1.0f}; + auto ratioAnchors = ratioEnum(baseAnchor, ratios); + std::vector<Anchor> retVal; + + for (const auto& ra : ratioAnchors) { + auto addon = scaleEnum(ra, scales); + retVal.insert(retVal.end(), addon.begin(), addon.end()); + } + return retVal; +} + +void ModelRetinaFace::generateAnchorsFpn() { + auto cfg = anchorCfg; + std::sort(cfg.begin(), cfg.end(), [](const AnchorCfgLine& x, const AnchorCfgLine& y) { + return x.stride > y.stride; + }); + + for (const auto& cfgLine : cfg) { + anchorsFpn.emplace(cfgLine.stride, generateAnchors(cfgLine.baseSize, cfgLine.ratios, cfgLine.scales)); + } +} + +std::vector<size_t> thresholding(const ov::Tensor& scoresTensor, const int anchorNum, const float confidenceThreshold) { + std::vector<size_t> indices; + indices.reserve(ModelRetinaFace::INIT_VECTOR_SIZE); + auto shape = scoresTensor.get_shape(); + size_t restAnchors = shape[1] - anchorNum; + const float* scoresPtr = scoresTensor.data<float>(); + + for (size_t x = anchorNum; x < shape[1]; ++x) { + for (size_t y = 0; y < 
shape[2]; ++y) { + for (size_t z = 0; z < shape[3]; ++z) { + auto idx = (x * shape[2] + y) * shape[3] + z; + auto score = scoresPtr[idx]; + if (score >= confidenceThreshold) { + indices.push_back((y * shape[3] + z) * restAnchors + (x - anchorNum)); + } + } + } + } + + return indices; +} + +void filterScores(std::vector<float>& scores, + const std::vector<size_t>& indices, + const ov::Tensor& scoresTensor, + const int anchorNum) { + const auto& shape = scoresTensor.get_shape(); + const float* scoresPtr = scoresTensor.data<float>(); + const auto start = shape[2] * shape[3] * anchorNum; + + for (auto i : indices) { + auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum; + scores.push_back(scoresPtr[start + offset]); + } +} + +void filterBoxes(std::vector<Anchor>& boxes, + const std::vector<size_t>& indices, + const ov::Tensor& boxesTensor, + int anchorNum, + const std::vector<Anchor>& anchors) { + const auto& shape = boxesTensor.get_shape(); + const float* boxesPtr = boxesTensor.data<float>(); + const auto boxPredLen = shape[1] / anchorNum; + const auto blockWidth = shape[2] * shape[3]; + + for (auto i : indices) { + auto offset = blockWidth * boxPredLen * (i % anchorNum) + (i / anchorNum); + + const auto dx = boxesPtr[offset]; + const auto dy = boxesPtr[offset + blockWidth]; + const auto dw = boxesPtr[offset + blockWidth * 2]; + const auto dh = boxesPtr[offset + blockWidth * 3]; + + const auto predCtrX = dx * anchors[i].getWidth() + anchors[i].getXCenter(); + const auto predCtrY = dy * anchors[i].getHeight() + anchors[i].getYCenter(); + const auto predW = exp(dw) * anchors[i].getWidth(); + const auto predH = exp(dh) * anchors[i].getHeight(); + + boxes.push_back({static_cast<float>(predCtrX - 0.5f * (predW - 1.0f)), + static_cast<float>(predCtrY - 0.5f * (predH - 1.0f)), + static_cast<float>(predCtrX + 0.5f * (predW - 1.0f)), + static_cast<float>(predCtrY + 0.5f * (predH - 1.0f))}); + } +} + +void filterLandmarks(std::vector<cv::Point2f>& landmarks, + const std::vector<size_t>& indices, + const ov::Tensor& landmarksTensor, + int anchorNum, + const std::vector<Anchor>& anchors, + const float landmarkStd) { + const auto& shape = landmarksTensor.get_shape(); + const float* landmarksPtr = landmarksTensor.data<float>(); + const auto landmarkPredLen = shape[1] / anchorNum; + const auto blockWidth = shape[2] * shape[3]; + + for (auto i : indices) { + for (int j = 0; j < ModelRetinaFace::LANDMARKS_NUM; ++j) { + auto offset = (i % anchorNum) * landmarkPredLen * shape[2] * shape[3] + i / anchorNum; + auto deltaX = landmarksPtr[offset + j * 2 * blockWidth] * landmarkStd; + auto deltaY = landmarksPtr[offset + (j * 2 + 1) * blockWidth] * landmarkStd; + landmarks.push_back({deltaX * anchors[i].getWidth() + anchors[i].getXCenter(), + deltaY * anchors[i].getHeight() + anchors[i].getYCenter()}); + } + } +} + +void filterMasksScores(std::vector<float>& masks, + const std::vector<size_t>& indices, + const ov::Tensor& maskScoresTensor, + const int anchorNum) { + auto shape = maskScoresTensor.get_shape(); + const float* maskScoresPtr = maskScoresTensor.data<float>(); + auto start = shape[2] * shape[3] * anchorNum * 2; + + for (auto i : indices) { + auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum; + masks.push_back(maskScoresPtr[start + offset]); + } +} + +std::unique_ptr<ResultBase> ModelRetinaFace::postprocess(InferenceResult& infResult) { + std::vector<float> scores; + scores.reserve(INIT_VECTOR_SIZE); + std::vector<Anchor> boxes; + boxes.reserve(INIT_VECTOR_SIZE); + 
std::vector<cv::Point2f> landmarks; + std::vector<float> masks; + + if (shouldDetectLandmarks) { + landmarks.reserve(INIT_VECTOR_SIZE); + } + if (shouldDetectMasks) { + masks.reserve(INIT_VECTOR_SIZE); + } + + // --------------------------- Gather & Filter output from all levels + // ---------------------------------------------------------- + for (size_t idx = 0; idx < anchorCfg.size(); ++idx) { + const auto boxRaw = infResult.outputsData[separateOutputsNames[OUT_BOXES][idx]]; + const auto scoresRaw = infResult.outputsData[separateOutputsNames[OUT_SCORES][idx]]; + auto s = anchorCfg[idx].stride; + auto anchorNum = anchorsFpn[s].size(); + + auto validIndices = thresholding(scoresRaw, anchorNum, confidenceThreshold); + filterScores(scores, validIndices, scoresRaw, anchorNum); + filterBoxes(boxes, validIndices, boxRaw, anchorNum, anchors[idx]); + if (shouldDetectLandmarks) { + const auto landmarksRaw = infResult.outputsData[separateOutputsNames[OUT_LANDMARKS][idx]]; + filterLandmarks(landmarks, validIndices, landmarksRaw, anchorNum, anchors[idx], landmarkStd); + } + if (shouldDetectMasks) { + const auto masksRaw = infResult.outputsData[separateOutputsNames[OUT_MASKSCORES][idx]]; + filterMasksScores(masks, validIndices, masksRaw, anchorNum); + } + } + // --------------------------- Apply Non-maximum Suppression + // ---------------------------------------------------------- !shouldDetectLandmarks determines nms behavior, if + // true - boundaries are included in areas calculation + const auto keep = nms(boxes, scores, boxIOUThreshold, !shouldDetectLandmarks); + + // --------------------------- Create detection result objects + // -------------------------------------------------------- + RetinaFaceDetectionResult* result = new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData); + + const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth; + const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight; + const auto scaleX = static_cast<float>(netInputWidth) / imgWidth; + const auto scaleY = static_cast<float>(netInputHeight) / imgHeight; + + result->objects.reserve(keep.size()); + result->landmarks.reserve(keep.size() * ModelRetinaFace::LANDMARKS_NUM); + for (auto i : keep) { + DetectedObject desc; + desc.confidence = scores[i]; + //--- Scaling coordinates + boxes[i].left /= scaleX; + boxes[i].top /= scaleY; + boxes[i].right /= scaleX; + boxes[i].bottom /= scaleY; + + desc.x = clamp(boxes[i].left, 0.f, static_cast<float>(imgWidth)); + desc.y = clamp(boxes[i].top, 0.f, static_cast<float>(imgHeight)); + desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast<float>(imgWidth)); + desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast<float>(imgHeight)); + //--- Default label 0 - Face. If detecting masks then labels would be 0 - No Mask, 1 - Mask + desc.labelID = shouldDetectMasks ? 
(masks[i] > maskThreshold) : 0; + desc.label = labels[desc.labelID]; + result->objects.push_back(desc); + + //--- Scaling landmarks coordinates + for (size_t l = 0; l < ModelRetinaFace::LANDMARKS_NUM && shouldDetectLandmarks; ++l) { + landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].x = + clamp(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].x / scaleX, 0.f, static_cast<float>(imgWidth)); + landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].y = + clamp(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].y / scaleY, 0.f, static_cast<float>(imgHeight)); + result->landmarks.push_back(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l]); + } + } + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp b/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp new file mode 100644 index 0000000..8322c3c --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp @@ -0,0 +1,277 @@ +/* +// Copyright (C) 2021-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/detection_model_retinaface_pt.h" + +#include <stdint.h> + +#include <algorithm> +#include <cmath> +#include <map> +#include <stdexcept> +#include <string> +#include <vector> + +#include <openvino/openvino.hpp> + +#include <utils/common.hpp> +#include <utils/nms.hpp> +#include <utils/ocv_common.hpp> + +#include "models/internal_model_data.h" +#include "models/results.h" + +ModelRetinaFacePT::ModelRetinaFacePT(const std::string& modelFileName, + float confidenceThreshold, + bool useAutoResize, + float boxIOUThreshold, + const std::string& layout) + : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout), // Default label is "Face" + landmarksNum(0), + boxIOUThreshold(boxIOUThreshold) {} + +void ModelRetinaFacePT::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("RetinaFacePT model wrapper expects models that have only 1 input"); + } + + const ov::Shape& inputShape = model->input().get_shape(); + const ov::Layout& inputLayout = getInputLayout(model->input()); + + if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("Expected 3-channel input"); + } + + ov::preprocess::PrePostProcessor ppp(model); + inputTransform.setPrecision(ppp, model->input().get_any_name()); + ppp.input().tensor().set_layout({"NHWC"}); + + if (useAutoResize) { + ppp.input().tensor().set_spatial_dynamic_shape(); + + ppp.input() + .preprocess() + .convert_element_type(ov::element::f32) + .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); + } + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Reading image input 
parameters ------------------------------------------- + inputsNames.push_back(model->input().get_any_name()); + netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; + netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; + + // --------------------------- Prepare output ----------------------------------------------------- + if (model->outputs().size() != 3) { + throw std::logic_error("RetinaFace model wrapper expects models that have 3 outputs"); + } + + landmarksNum = 0; + + outputsNames.resize(2); + std::vector<uint32_t> outputsSizes[OUT_MAX]; + const ov::Layout chw("CHW"); + const ov::Layout nchw("NCHW"); + for (auto& output : model->outputs()) { + auto outTensorName = output.get_any_name(); + outputsNames.push_back(outTensorName); + ppp.output(outTensorName) + .tensor() + .set_element_type(ov::element::f32) + .set_layout(output.get_shape().size() == 4 ? nchw : chw); + + if (outTensorName.find("bbox") != std::string::npos) { + outputsNames[OUT_BOXES] = outTensorName; + } else if (outTensorName.find("cls") != std::string::npos) { + outputsNames[OUT_SCORES] = outTensorName; + } else if (outTensorName.find("landmark") != std::string::npos) { + // Landmarks might be optional, if it is present, resize names array to fit landmarks output name to the + // last item of array Considering that other outputs names are already filled in or will be filled later + outputsNames.resize(std::max(outputsNames.size(), (size_t)OUT_LANDMARKS + 1)); + outputsNames[OUT_LANDMARKS] = outTensorName; + landmarksNum = + output.get_shape()[ov::layout::width_idx(chw)] / 2; // Each landmark consist of 2 variables (x and y) + } else { + continue; + } + } + + if (outputsNames[OUT_BOXES] == "" || outputsNames[OUT_SCORES] == "") { + throw std::logic_error("Bbox or cls layers are not found"); + } + + model = ppp.build(); + priors = generatePriorData(); +} + +std::vector<size_t> ModelRetinaFacePT::filterByScore(const ov::Tensor& scoresTensor, const float confidenceThreshold) { + std::vector<size_t> indicies; + const auto& shape = scoresTensor.get_shape(); + const float* scoresPtr = scoresTensor.data<float>(); + + for (size_t x = 0; x < shape[1]; ++x) { + const auto idx = (x * shape[2] + 1); + const auto score = scoresPtr[idx]; + if (score >= confidenceThreshold) { + indicies.push_back(x); + } + } + + return indicies; +} + +std::vector<float> ModelRetinaFacePT::getFilteredScores(const ov::Tensor& scoresTensor, + const std::vector<size_t>& indicies) { + const auto& shape = scoresTensor.get_shape(); + const float* scoresPtr = scoresTensor.data<float>(); + + std::vector<float> scores; + scores.reserve(indicies.size()); + + for (auto i : indicies) { + scores.push_back(scoresPtr[i * shape[2] + 1]); + } + return scores; +} + +std::vector<cv::Point2f> ModelRetinaFacePT::getFilteredLandmarks(const ov::Tensor& landmarksTensor, + const std::vector<size_t>& indicies, + int imgWidth, + int imgHeight) { + const auto& shape = landmarksTensor.get_shape(); + const float* landmarksPtr = landmarksTensor.data<float>(); + + std::vector<cv::Point2f> landmarks(landmarksNum * indicies.size()); + + for (size_t i = 0; i < indicies.size(); i++) { + const size_t idx = indicies[i]; + const auto& prior = priors[idx]; + for (size_t j = 0; j < landmarksNum; j++) { + landmarks[i * landmarksNum + j].x = + clamp(prior.cX + landmarksPtr[idx * shape[2] + j * 2] * variance[0] * prior.width, 0.f, 1.f) * imgWidth; + landmarks[i * landmarksNum + j].y = + clamp(prior.cY + landmarksPtr[idx * shape[2] + j * 2 + 1] * variance[0] * 
prior.height, 0.f, 1.f) * + imgHeight; + } + } + return landmarks; +} + +std::vector<ModelRetinaFacePT::Box> ModelRetinaFacePT::generatePriorData() { + const float globalMinSizes[][2] = {{16, 32}, {64, 128}, {256, 512}}; + const float steps[] = {8., 16., 32.}; + std::vector<ModelRetinaFacePT::Box> anchors; + for (size_t stepNum = 0; stepNum < arraySize(steps); stepNum++) { + const int featureW = static_cast<int>(std::round(netInputWidth / steps[stepNum])); + const int featureH = static_cast<int>(std::round(netInputHeight / steps[stepNum])); + + const auto& minSizes = globalMinSizes[stepNum]; + for (int i = 0; i < featureH; i++) { + for (int j = 0; j < featureW; j++) { + for (auto minSize : minSizes) { + const float sKX = minSize / netInputWidth; + const float sKY = minSize / netInputHeight; + const float denseCY = (i + 0.5f) * steps[stepNum] / netInputHeight; + const float denseCX = (j + 0.5f) * steps[stepNum] / netInputWidth; + anchors.push_back(ModelRetinaFacePT::Box{denseCX, denseCY, sKX, sKY}); + } + } + } + } + return anchors; +} + +std::vector<Anchor> ModelRetinaFacePT::getFilteredProposals(const ov::Tensor& boxesTensor, + const std::vector<size_t>& indicies, + int imgWidth, + int imgHeight) { + std::vector<Anchor> rects; + rects.reserve(indicies.size()); + + const auto& shape = boxesTensor.get_shape(); + const float* boxesPtr = boxesTensor.data<float>(); + + if (shape[1] != priors.size()) { + throw std::logic_error("rawBoxes size is not equal to priors size"); + } + + for (auto i : indicies) { + const auto pRawBox = reinterpret_cast<const Box*>(boxesPtr + i * shape[2]); + const auto& prior = priors[i]; + const float cX = priors[i].cX + pRawBox->cX * variance[0] * prior.width; + const float cY = priors[i].cY + pRawBox->cY * variance[0] * prior.height; + const float width = prior.width * exp(pRawBox->width * variance[1]); + const float height = prior.height * exp(pRawBox->height * variance[1]); + rects.push_back(Anchor{clamp(cX - width / 2, 0.f, 1.f) * imgWidth, + clamp(cY - height / 2, 0.f, 1.f) * imgHeight, + clamp(cX + width / 2, 0.f, 1.f) * imgWidth, + clamp(cY + height / 2, 0.f, 1.f) * imgHeight}); + } + + return rects; +} + +std::unique_ptr<ResultBase> ModelRetinaFacePT::postprocess(InferenceResult& infResult) { + // (raw_output, scale_x, scale_y, face_prob_threshold, image_size): + const auto boxesTensor = infResult.outputsData[outputsNames[OUT_BOXES]]; + const auto scoresTensor = infResult.outputsData[outputsNames[OUT_SCORES]]; + + const auto& validIndicies = filterByScore(scoresTensor, confidenceThreshold); + const auto& scores = getFilteredScores(scoresTensor, validIndicies); + + const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>(); + const auto& landmarks = landmarksNum ? 
getFilteredLandmarks(infResult.outputsData[outputsNames[OUT_LANDMARKS]], + validIndicies, + internalData.inputImgWidth, + internalData.inputImgHeight) + : std::vector<cv::Point2f>(); + + const auto& proposals = + getFilteredProposals(boxesTensor, validIndicies, internalData.inputImgWidth, internalData.inputImgHeight); + + const auto& keptIndicies = nms(proposals, scores, boxIOUThreshold, !landmarksNum); + + // --------------------------- Create detection result objects + // -------------------------------------------------------- + RetinaFaceDetectionResult* result = new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData); + + result->objects.reserve(keptIndicies.size()); + result->landmarks.reserve(keptIndicies.size() * landmarksNum); + for (auto i : keptIndicies) { + DetectedObject desc; + desc.confidence = scores[i]; + + //--- Scaling coordinates + desc.x = proposals[i].left; + desc.y = proposals[i].top; + desc.width = proposals[i].getWidth(); + desc.height = proposals[i].getHeight(); + + desc.labelID = 0; + desc.label = labels[desc.labelID]; + result->objects.push_back(desc); + + //--- Filtering landmarks coordinates + for (uint32_t l = 0; l < landmarksNum; ++l) { + result->landmarks.emplace_back(landmarks[i * landmarksNum + l].x, landmarks[i * landmarksNum + l].y); + } + } + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/detection_model_ssd.cpp b/python/openvino/runtime/common/models/src/detection_model_ssd.cpp new file mode 100644 index 0000000..ef741ee --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model_ssd.cpp @@ -0,0 +1,281 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/detection_model_ssd.h" + +#include <algorithm> +#include <map> +#include <stdexcept> +#include <string> +#include <unordered_set> +#include <vector> + +#include <openvino/openvino.hpp> + +#include <utils/common.hpp> +#include <utils/ocv_common.hpp> + +#include "models/internal_model_data.h" +#include "models/results.h" + +struct InputData; + +ModelSSD::ModelSSD(const std::string& modelFileName, + float confidenceThreshold, + bool useAutoResize, + const std::vector<std::string>& labels, + const std::string& layout) + : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout) {} + +std::shared_ptr<InternalModelData> ModelSSD::preprocess(const InputData& inputData, ov::InferRequest& request) { + if (inputsNames.size() > 1) { + const auto& imageInfoTensor = request.get_tensor(inputsNames[1]); + const auto info = imageInfoTensor.data<float>(); + info[0] = static_cast<float>(netInputHeight); + info[1] = static_cast<float>(netInputWidth); + info[2] = 1; + request.set_tensor(inputsNames[1], imageInfoTensor); + } + + return DetectionModel::preprocess(inputData, request); +} + +std::unique_ptr<ResultBase> ModelSSD::postprocess(InferenceResult& infResult) { + return outputsNames.size() > 1 ? 
postprocessMultipleOutputs(infResult) : postprocessSingleOutput(infResult); +} + +std::unique_ptr<ResultBase> ModelSSD::postprocessSingleOutput(InferenceResult& infResult) { + const ov::Tensor& detectionsTensor = infResult.getFirstOutputTensor(); + size_t detectionsNum = detectionsTensor.get_shape()[detectionsNumId]; + const float* detections = detectionsTensor.data<float>(); + + DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); + auto retVal = std::unique_ptr<ResultBase>(result); + + const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>(); + + for (size_t i = 0; i < detectionsNum; i++) { + float image_id = detections[i * objectSize + 0]; + if (image_id < 0) { + break; + } + + float confidence = detections[i * objectSize + 2]; + + /** Filtering out objects with confidence < confidence_threshold probability **/ + if (confidence > confidenceThreshold) { + DetectedObject desc; + + desc.confidence = confidence; + desc.labelID = static_cast<int>(detections[i * objectSize + 1]); + desc.label = getLabelName(desc.labelID); + + desc.x = clamp(detections[i * objectSize + 3] * internalData.inputImgWidth, + 0.f, + static_cast<float>(internalData.inputImgWidth)); + desc.y = clamp(detections[i * objectSize + 4] * internalData.inputImgHeight, + 0.f, + static_cast<float>(internalData.inputImgHeight)); + desc.width = clamp(detections[i * objectSize + 5] * internalData.inputImgWidth, + 0.f, + static_cast<float>(internalData.inputImgWidth)) - + desc.x; + desc.height = clamp(detections[i * objectSize + 6] * internalData.inputImgHeight, + 0.f, + static_cast<float>(internalData.inputImgHeight)) - + desc.y; + + result->objects.push_back(desc); + } + } + + return retVal; +} + +std::unique_ptr<ResultBase> ModelSSD::postprocessMultipleOutputs(InferenceResult& infResult) { + const float* boxes = infResult.outputsData[outputsNames[0]].data<float>(); + size_t detectionsNum = infResult.outputsData[outputsNames[0]].get_shape()[detectionsNumId]; + const float* labels = infResult.outputsData[outputsNames[1]].data<float>(); + const float* scores = outputsNames.size() > 2 ? infResult.outputsData[outputsNames[2]].data<float>() : nullptr; + + DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); + auto retVal = std::unique_ptr<ResultBase>(result); + + const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>(); + + // In models with scores are stored in separate output, coordinates are normalized to [0,1] + // In other multiple-outputs models coordinates are normalized to [0,netInputWidth] and [0,netInputHeight] + float widthScale = static_cast<float>(internalData.inputImgWidth) / (scores ? 1 : netInputWidth); + float heightScale = static_cast<float>(internalData.inputImgHeight) / (scores ? 1 : netInputHeight); + + for (size_t i = 0; i < detectionsNum; i++) { + float confidence = scores ? 
scores[i] : boxes[i * objectSize + 4]; + + /** Filtering out objects with confidence < confidence_threshold probability **/ + if (confidence > confidenceThreshold) { + DetectedObject desc; + + desc.confidence = confidence; + desc.labelID = static_cast<int>(labels[i]); + desc.label = getLabelName(desc.labelID); + + desc.x = clamp(boxes[i * objectSize] * widthScale, 0.f, static_cast<float>(internalData.inputImgWidth)); + desc.y = + clamp(boxes[i * objectSize + 1] * heightScale, 0.f, static_cast<float>(internalData.inputImgHeight)); + desc.width = + clamp(boxes[i * objectSize + 2] * widthScale, 0.f, static_cast<float>(internalData.inputImgWidth)) - + desc.x; + desc.height = + clamp(boxes[i * objectSize + 3] * heightScale, 0.f, static_cast<float>(internalData.inputImgHeight)) - + desc.y; + + result->objects.push_back(desc); + } + } + + return retVal; +} + +void ModelSSD::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + ov::preprocess::PrePostProcessor ppp(model); + for (const auto& input : model->inputs()) { + auto inputTensorName = input.get_any_name(); + const ov::Shape& shape = input.get_shape(); + ov::Layout inputLayout = getInputLayout(input); + + if (shape.size() == 4) { // 1st input contains images + if (inputsNames.empty()) { + inputsNames.push_back(inputTensorName); + } else { + inputsNames[0] = inputTensorName; + } + + inputTransform.setPrecision(ppp, inputTensorName); + ppp.input(inputTensorName).tensor().set_layout({"NHWC"}); + + if (useAutoResize) { + ppp.input(inputTensorName).tensor().set_spatial_dynamic_shape(); + + ppp.input(inputTensorName) + .preprocess() + .convert_element_type(ov::element::f32) + .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); + } + + ppp.input(inputTensorName).model().set_layout(inputLayout); + + netInputWidth = shape[ov::layout::width_idx(inputLayout)]; + netInputHeight = shape[ov::layout::height_idx(inputLayout)]; + } else if (shape.size() == 2) { // 2nd input contains image info + inputsNames.resize(2); + inputsNames[1] = inputTensorName; + ppp.input(inputTensorName).tensor().set_element_type(ov::element::f32); + } else { + throw std::logic_error("Unsupported " + std::to_string(input.get_shape().size()) + + "D " + "input layer '" + + input.get_any_name() + + "'. 
" + "Only 2D and 4D input layers are supported"); + } + } + model = ppp.build(); + + // --------------------------- Prepare output ----------------------------------------------------- + if (model->outputs().size() == 1) { + prepareSingleOutput(model); + } else { + prepareMultipleOutputs(model); + } +} + +void ModelSSD::prepareSingleOutput(std::shared_ptr<ov::Model>& model) { + const auto& output = model->output(); + outputsNames.push_back(output.get_any_name()); + + const ov::Shape& shape = output.get_shape(); + const ov::Layout& layout("NCHW"); + if (shape.size() != 4) { + throw std::logic_error("SSD single output must have 4 dimensions, but had " + std::to_string(shape.size())); + } + detectionsNumId = ov::layout::height_idx(layout); + objectSize = shape[ov::layout::width_idx(layout)]; + if (objectSize != 7) { + throw std::logic_error("SSD single output must have 7 as a last dimension, but had " + + std::to_string(objectSize)); + } + ov::preprocess::PrePostProcessor ppp(model); + ppp.output().tensor().set_element_type(ov::element::f32).set_layout(layout); + model = ppp.build(); +} + +void ModelSSD::prepareMultipleOutputs(std::shared_ptr<ov::Model>& model) { + const ov::OutputVector& outputs = model->outputs(); + for (auto& output : outputs) { + const auto& tensorNames = output.get_names(); + for (const auto& name : tensorNames) { + if (name.find("boxes") != std::string::npos) { + outputsNames.push_back(name); + break; + } else if (name.find("labels") != std::string::npos) { + outputsNames.push_back(name); + break; + } else if (name.find("scores") != std::string::npos) { + outputsNames.push_back(name); + break; + } + } + } + if (outputsNames.size() != 2 && outputsNames.size() != 3) { + throw std::logic_error("SSD model wrapper must have 2 or 3 outputs, but had " + + std::to_string(outputsNames.size())); + } + std::sort(outputsNames.begin(), outputsNames.end()); + + ov::preprocess::PrePostProcessor ppp(model); + const auto& boxesShape = model->output(outputsNames[0]).get_partial_shape().get_max_shape(); + + ov::Layout boxesLayout; + if (boxesShape.size() == 2) { + boxesLayout = "NC"; + detectionsNumId = ov::layout::batch_idx(boxesLayout); + objectSize = boxesShape[ov::layout::channels_idx(boxesLayout)]; + + if (objectSize != 5) { + throw std::logic_error("Incorrect 'boxes' output shape, [n][5] shape is required"); + } + } else if (boxesShape.size() == 3) { + boxesLayout = "CHW"; + detectionsNumId = ov::layout::height_idx(boxesLayout); + objectSize = boxesShape[ov::layout::width_idx(boxesLayout)]; + + if (objectSize != 4) { + throw std::logic_error("Incorrect 'boxes' output shape, [b][n][4] shape is required"); + } + } else { + throw std::logic_error("Incorrect number of 'boxes' output dimensions, expected 2 or 3, but had " + + std::to_string(boxesShape.size())); + } + + ppp.output(outputsNames[0]).tensor().set_layout(boxesLayout); + + for (const auto& outName : outputsNames) { + ppp.output(outName).tensor().set_element_type(ov::element::f32); + } + model = ppp.build(); +} diff --git a/python/openvino/runtime/common/models/src/detection_model_yolo.cpp b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp new file mode 100644 index 0000000..2c4fb1d --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp @@ -0,0 +1,481 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/detection_model_yolo.h" + +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <stdexcept> +#include <string> +#include <utility> +#include <vector> + +#include <openvino/openvino.hpp> + +#include <utils/common.hpp> +#include <utils/slog.hpp> + +#include "models/internal_model_data.h" +#include "models/results.h" + +std::vector<float> defaultAnchors[] = { + // YOLOv1v2 + {0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f}, + // YOLOv3 + {10.0f, + 13.0f, + 16.0f, + 30.0f, + 33.0f, + 23.0f, + 30.0f, + 61.0f, + 62.0f, + 45.0f, + 59.0f, + 119.0f, + 116.0f, + 90.0f, + 156.0f, + 198.0f, + 373.0f, + 326.0f}, + // YOLOv4 + {12.0f, + 16.0f, + 19.0f, + 36.0f, + 40.0f, + 28.0f, + 36.0f, + 75.0f, + 76.0f, + 55.0f, + 72.0f, + 146.0f, + 142.0f, + 110.0f, + 192.0f, + 243.0f, + 459.0f, + 401.0f}, + // YOLOv4_Tiny + {10.0f, 14.0f, 23.0f, 27.0f, 37.0f, 58.0f, 81.0f, 82.0f, 135.0f, 169.0f, 344.0f, 319.0f}, + // YOLOF + {16.0f, 16.0f, 32.0f, 32.0f, 64.0f, 64.0f, 128.0f, 128.0f, 256.0f, 256.0f, 512.0f, 512.0f}}; + +const std::vector<int64_t> defaultMasks[] = { + // YOLOv1v2 + {}, + // YOLOv3 + {}, + // YOLOv4 + {0, 1, 2, 3, 4, 5, 6, 7, 8}, + // YOLOv4_Tiny + {1, 2, 3, 3, 4, 5}, + // YOLOF + {0, 1, 2, 3, 4, 5}}; + +static inline float sigmoid(float x) { + return 1.f / (1.f + exp(-x)); +} + +static inline float linear(float x) { + return x; +} + +ModelYolo::ModelYolo(const std::string& modelFileName, + float confidenceThreshold, + bool useAutoResize, + bool useAdvancedPostprocessing, + float boxIOUThreshold, + const std::vector<std::string>& labels, + const std::vector<float>& anchors, + const std::vector<int64_t>& masks, + const std::string& layout) + : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout), + boxIOUThreshold(boxIOUThreshold), + useAdvancedPostprocessing(useAdvancedPostprocessing), + yoloVersion(YOLO_V3), + presetAnchors(anchors), + presetMasks(masks) {} + +void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("YOLO model wrapper accepts models that have only 1 input"); + } + + const auto& input = model->input(); + const ov::Shape& inputShape = model->input().get_shape(); + ov::Layout inputLayout = getInputLayout(input); + + if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("Expected 3-channel input"); + } + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); + + if (useAutoResize) { + ppp.input().tensor().set_spatial_dynamic_shape(); + + ppp.input() + .preprocess() + .convert_element_type(ov::element::f32) + .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); + } + + ppp.input().model().set_layout(inputLayout); + + //--- Reading image input parameters + 
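+    //    (the input tensor name and the network input width/height; postprocess() later passes
+    //    these sizes to parseYOLOOutput() when rescaling region predictions to the original frame)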
inputsNames.push_back(model->input().get_any_name()); + netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; + netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; + + // --------------------------- Prepare output ----------------------------------------------------- + const ov::OutputVector& outputs = model->outputs(); + std::map<std::string, ov::Shape> outShapes; + for (auto& out : outputs) { + ppp.output(out.get_any_name()).tensor().set_element_type(ov::element::f32); + if (out.get_shape().size() == 4) { + if (out.get_shape()[ov::layout::height_idx("NCHW")] != out.get_shape()[ov::layout::width_idx("NCHW")] && + out.get_shape()[ov::layout::height_idx("NHWC")] == out.get_shape()[ov::layout::width_idx("NHWC")]) { + ppp.output(out.get_any_name()).model().set_layout("NHWC"); + // outShapes are saved before ppp.build() thus set yoloRegionLayout as it is in model before ppp.build() + yoloRegionLayout = "NHWC"; + } + // yolo-v1-tiny-tf out shape is [1, 21125] thus set layout only for 4 dim tensors + ppp.output(out.get_any_name()).tensor().set_layout("NCHW"); + } + outputsNames.push_back(out.get_any_name()); + outShapes[out.get_any_name()] = out.get_shape(); + } + model = ppp.build(); + + yoloVersion = YOLO_V3; + bool isRegionFound = false; + for (const auto& op : model->get_ordered_ops()) { + if (std::string("RegionYolo") == op->get_type_name()) { + auto regionYolo = std::dynamic_pointer_cast<ov::op::v0::RegionYolo>(op); + + if (regionYolo) { + if (!regionYolo->get_mask().size()) { + yoloVersion = YOLO_V1V2; + } + + const auto& opName = op->get_friendly_name(); + for (const auto& out : outputs) { + if (out.get_node()->get_friendly_name() == opName || + out.get_node()->get_input_node_ptr(0)->get_friendly_name() == opName) { + isRegionFound = true; + regions.emplace(out.get_any_name(), Region(regionYolo)); + } + } + } + } + } + + if (!isRegionFound) { + switch (outputsNames.size()) { + case 1: + yoloVersion = YOLOF; + break; + case 2: + yoloVersion = YOLO_V4_TINY; + break; + case 3: + yoloVersion = YOLO_V4; + break; + } + + int num = yoloVersion == YOLOF ? 6 : 3; + isObjConf = yoloVersion == YOLOF ? 0 : 1; + int i = 0; + + auto chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion]; + if (chosenMasks.size() != num * outputs.size()) { + throw std::runtime_error(std::string("Invalid size of masks array, got ") + + std::to_string(presetMasks.size()) + ", should be " + + std::to_string(num * outputs.size())); + } + + std::sort(outputsNames.begin(), + outputsNames.end(), + [&outShapes, this](const std::string& x, const std::string& y) { + return outShapes[x][ov::layout::height_idx(yoloRegionLayout)] > + outShapes[y][ov::layout::height_idx(yoloRegionLayout)]; + }); + + for (const auto& name : outputsNames) { + const auto& shape = outShapes[name]; + if (shape[ov::layout::channels_idx(yoloRegionLayout)] % num != 0) { + throw std::logic_error(std::string("Output tensor ") + name + " has wrong channel dimension"); + } + regions.emplace( + name, + Region(shape[ov::layout::channels_idx(yoloRegionLayout)] / num - 4 - (isObjConf ? 1 : 0), + 4, + presetAnchors.size() ? 
presetAnchors : defaultAnchors[yoloVersion], + std::vector<int64_t>(chosenMasks.begin() + i * num, chosenMasks.begin() + (i + 1) * num), + shape[ov::layout::width_idx(yoloRegionLayout)], + shape[ov::layout::height_idx(yoloRegionLayout)])); + i++; + } + } else { + // Currently externally set anchors and masks are supported only for YoloV4 + if (presetAnchors.size() || presetMasks.size()) { + slog::warn << "Preset anchors and mask can be set for YoloV4 model only. " + "This model is not YoloV4, so these options will be ignored." + << slog::endl; + } + } +} + +std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) { + DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); + std::vector<DetectedObject> objects; + + // Parsing outputs + const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>(); + + for (auto& output : infResult.outputsData) { + this->parseYOLOOutput(output.first, + output.second, + netInputHeight, + netInputWidth, + internalData.inputImgHeight, + internalData.inputImgWidth, + objects); + } + + if (useAdvancedPostprocessing) { + // Advanced postprocessing + // Checking IOU threshold conformance + // For every i-th object we're finding all objects it intersects with, and comparing confidence + // If i-th object has greater confidence than all others, we include it into result + for (const auto& obj1 : objects) { + bool isGoodResult = true; + for (const auto& obj2 : objects) { + if (obj1.labelID == obj2.labelID && obj1.confidence < obj2.confidence && + intersectionOverUnion(obj1, obj2) >= boxIOUThreshold) { // if obj1 is the same as obj2, condition + // expression will evaluate to false anyway + isGoodResult = false; + break; + } + } + if (isGoodResult) { + result->objects.push_back(obj1); + } + } + } else { + // Classic postprocessing + std::sort(objects.begin(), objects.end(), [](const DetectedObject& x, const DetectedObject& y) { + return x.confidence > y.confidence; + }); + for (size_t i = 0; i < objects.size(); ++i) { + if (objects[i].confidence == 0) + continue; + for (size_t j = i + 1; j < objects.size(); ++j) + if (intersectionOverUnion(objects[i], objects[j]) >= boxIOUThreshold) + objects[j].confidence = 0; + result->objects.push_back(objects[i]); + } + } + + return std::unique_ptr<ResultBase>(result); +} + +void ModelYolo::parseYOLOOutput(const std::string& output_name, + const ov::Tensor& tensor, + const unsigned long resized_im_h, + const unsigned long resized_im_w, + const unsigned long original_im_h, + const unsigned long original_im_w, + std::vector<DetectedObject>& objects) { + // --------------------------- Extracting layer parameters ------------------------------------- + auto it = regions.find(output_name); + if (it == regions.end()) { + throw std::runtime_error(std::string("Can't find output layer with name ") + output_name); + } + auto& region = it->second; + + int sideW = 0; + int sideH = 0; + unsigned long scaleH; + unsigned long scaleW; + switch (yoloVersion) { + case YOLO_V1V2: + sideH = region.outputHeight; + sideW = region.outputWidth; + scaleW = region.outputWidth; + scaleH = region.outputHeight; + break; + case YOLO_V3: + case YOLO_V4: + case YOLO_V4_TINY: + case YOLOF: + sideH = static_cast<int>(tensor.get_shape()[ov::layout::height_idx("NCHW")]); + sideW = static_cast<int>(tensor.get_shape()[ov::layout::width_idx("NCHW")]); + scaleW = resized_im_w; + scaleH = resized_im_h; + break; + } + + auto entriesNum = sideW * sideH; + const float* outData = 
tensor.data<float>(); + + auto postprocessRawData = + (yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY || yoloVersion == YOLOF) ? sigmoid : linear; + + // --------------------------- Parsing YOLO Region output ------------------------------------- + for (int i = 0; i < entriesNum; ++i) { + int row = i / sideW; + int col = i % sideW; + for (int n = 0; n < region.num; ++n) { + //--- Getting region data + int obj_index = calculateEntryIndex(entriesNum, + region.coords, + region.classes + isObjConf, + n * entriesNum + i, + region.coords); + int box_index = + calculateEntryIndex(entriesNum, region.coords, region.classes + isObjConf, n * entriesNum + i, 0); + float scale = isObjConf ? postprocessRawData(outData[obj_index]) : 1; + + //--- Preliminary check for confidence threshold conformance + if (scale >= confidenceThreshold) { + //--- Calculating scaled region's coordinates + float x, y; + if (yoloVersion == YOLOF) { + x = (static_cast<float>(col) / sideW + + outData[box_index + 0 * entriesNum] * region.anchors[2 * n] / scaleW) * + original_im_w; + y = (static_cast<float>(row) / sideH + + outData[box_index + 1 * entriesNum] * region.anchors[2 * n + 1] / scaleH) * + original_im_h; + } else { + x = static_cast<float>((col + postprocessRawData(outData[box_index + 0 * entriesNum])) / sideW * + original_im_w); + y = static_cast<float>((row + postprocessRawData(outData[box_index + 1 * entriesNum])) / sideH * + original_im_h); + } + float height = static_cast<float>(std::exp(outData[box_index + 3 * entriesNum]) * + region.anchors[2 * n + 1] * original_im_h / scaleH); + float width = static_cast<float>(std::exp(outData[box_index + 2 * entriesNum]) * region.anchors[2 * n] * + original_im_w / scaleW); + + DetectedObject obj; + obj.x = clamp(x - width / 2, 0.f, static_cast<float>(original_im_w)); + obj.y = clamp(y - height / 2, 0.f, static_cast<float>(original_im_h)); + obj.width = clamp(width, 0.f, static_cast<float>(original_im_w - obj.x)); + obj.height = clamp(height, 0.f, static_cast<float>(original_im_h - obj.y)); + + for (size_t j = 0; j < region.classes; ++j) { + int class_index = calculateEntryIndex(entriesNum, + region.coords, + region.classes + isObjConf, + n * entriesNum + i, + region.coords + isObjConf + j); + float prob = scale * postprocessRawData(outData[class_index]); + + //--- Checking confidence threshold conformance and adding region to the list + if (prob >= confidenceThreshold) { + obj.confidence = prob; + obj.labelID = j; + obj.label = getLabelName(obj.labelID); + objects.push_back(obj); + } + } + } + } + } +} + +int ModelYolo::calculateEntryIndex(int totalCells, int lcoords, size_t lclasses, int location, int entry) { + int n = location / totalCells; + int loc = location % totalCells; + return (n * (lcoords + lclasses) + entry) * totalCells + loc; +} + +double ModelYolo::intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2) { + double overlappingWidth = fmin(o1.x + o1.width, o2.x + o2.width) - fmax(o1.x, o2.x); + double overlappingHeight = fmin(o1.y + o1.height, o2.y + o2.height) - fmax(o1.y, o2.y); + double intersectionArea = + (overlappingWidth < 0 || overlappingHeight < 0) ? 
0 : overlappingHeight * overlappingWidth; + double unionArea = o1.width * o1.height + o2.width * o2.height - intersectionArea; + return intersectionArea / unionArea; +} + +ModelYolo::Region::Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo) { + coords = regionYolo->get_num_coords(); + classes = regionYolo->get_num_classes(); + auto mask = regionYolo->get_mask(); + num = mask.size(); + + auto shape = regionYolo->get_input_shape(0); + outputWidth = shape[3]; + outputHeight = shape[2]; + + if (num) { + // Parsing YoloV3 parameters + anchors.resize(num * 2); + + for (int i = 0; i < num; ++i) { + anchors[i * 2] = regionYolo->get_anchors()[mask[i] * 2]; + anchors[i * 2 + 1] = regionYolo->get_anchors()[mask[i] * 2 + 1]; + } + } else { + // Parsing YoloV2 parameters + num = regionYolo->get_num_regions(); + anchors = regionYolo->get_anchors(); + if (anchors.empty()) { + anchors = defaultAnchors[YOLO_V1V2]; + num = 5; + } + } +} + +ModelYolo::Region::Region(size_t classes, + int coords, + const std::vector<float>& anchors, + const std::vector<int64_t>& masks, + size_t outputWidth, + size_t outputHeight) + : classes(classes), + coords(coords), + outputWidth(outputWidth), + outputHeight(outputHeight) { + num = masks.size(); + + if (anchors.size() == 0 || anchors.size() % 2 != 0) { + throw std::runtime_error("Explicitly initialized region should have non-empty even-sized regions vector"); + } + + if (num) { + this->anchors.resize(num * 2); + + for (int i = 0; i < num; ++i) { + this->anchors[i * 2] = anchors[masks[i] * 2]; + this->anchors[i * 2 + 1] = anchors[masks[i] * 2 + 1]; + } + } else { + this->anchors = anchors; + num = anchors.size() / 2; + } +} diff --git a/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp b/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp new file mode 100644 index 0000000..132eb9e --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp @@ -0,0 +1,188 @@ +/* +// Copyright (C) 2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/
+
+#include "models/detection_model_yolov3_onnx.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+#include "utils/image_utils.h"
+
+ModelYoloV3ONNX::ModelYoloV3ONNX(const std::string& modelFileName,
+                                 float confidenceThreshold,
+                                 const std::vector<std::string>& labels,
+                                 const std::string& layout)
+    : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout) {
+    interpolationMode = cv::INTER_CUBIC;
+    resizeMode = RESIZE_KEEP_ASPECT_LETTERBOX;
+}
+
+void ModelYoloV3ONNX::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output -------------------------------------------------
+    // --------------------------- Prepare inputs ------------------------------------------------------
+    const ov::OutputVector& inputs = model->inputs();
+    if (inputs.size() != 2) {
+        throw std::logic_error("YoloV3ONNX model wrapper expects models that have 2 inputs");
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    // Size the vector (not just reserve) so that the indexed assignments below are valid
+    inputsNames.resize(inputs.size());
+    for (auto& input : inputs) {
+        const ov::Shape& currentShape = input.get_shape();
+        std::string currentName = input.get_any_name();
+        const ov::Layout& currentLayout = getInputLayout(input);
+
+        if (currentShape.size() == 4) {
+            if (currentShape[ov::layout::channels_idx(currentLayout)] != 3) {
+                throw std::logic_error("Expected 4D image input with 3 channels");
+            }
+            inputsNames[0] = currentName;
+            netInputWidth = currentShape[ov::layout::width_idx(currentLayout)];
+            netInputHeight = currentShape[ov::layout::height_idx(currentLayout)];
+            ppp.input(currentName).tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+        } else if (currentShape.size() == 2) {
+            if (currentShape[ov::layout::channels_idx(currentLayout)] != 2) {
+                throw std::logic_error("Expected 2D image info input with 2 channels");
+            }
+            inputsNames[1] = currentName;
+            ppp.input(currentName).tensor().set_element_type(ov::element::i32);
+        }
+        ppp.input(currentName).model().set_layout(currentLayout);
+    }
+
+    // --------------------------- Prepare outputs -----------------------------------------------------
+    const ov::OutputVector& outputs = model->outputs();
+    if (outputs.size() != 3) {
+        throw std::logic_error("YoloV3ONNX model wrapper expects models that have 3 outputs");
+    }
+
+    for (auto& output : outputs) {
+        const ov::Shape& currentShape = output.get_partial_shape().get_max_shape();
+        std::string currentName = output.get_any_name();
+        if (currentShape.back() == 3) {
+            indicesOutputName = currentName;
+            ppp.output(currentName).tensor().set_element_type(ov::element::i32);
+        } else if (currentShape[2] == 4) {
+            boxesOutputName = currentName;
+            ppp.output(currentName).tensor().set_element_type(ov::element::f32);
+        } else if (currentShape[1] == numberOfClasses) {
+            scoresOutputName = currentName;
+            ppp.output(currentName).tensor().set_element_type(ov::element::f32);
+        } else {
+            throw std::logic_error("Expected shapes [:,:,4], [:," +
+                                   std::to_string(numberOfClasses) + ",:] and [:,3] for outputs");
+        }
+        outputsNames.push_back(currentName);
+    }
+    model = ppp.build();
+}
+
+std::shared_ptr<InternalModelData> ModelYoloV3ONNX::preprocess(const InputData& inputData,
+                                                               ov::InferRequest& request) {
+    const auto& origImg =
inputData.asRef<ImageInputData>().inputImage; + + cv::Mat info(cv::Size(1, 2), CV_32SC1); + info.at<int>(0, 0) = origImg.rows; + info.at<int>(0, 1) = origImg.cols; + auto allocator = std::make_shared<SharedTensorAllocator>(info); + ov::Tensor infoInput = ov::Tensor(ov::element::i32, ov::Shape({1, 2}), ov::Allocator(allocator)); + + request.set_tensor(inputsNames[1], infoInput); + + return ImageModel::preprocess(inputData, request); +} + +namespace { +float getScore(const ov::Tensor& scoresTensor, size_t classInd, size_t boxInd) { + const float* scoresPtr = scoresTensor.data<float>(); + const auto shape = scoresTensor.get_shape(); + int N = shape[2]; + + return scoresPtr[classInd * N + boxInd]; +} +} + +std::unique_ptr<ResultBase> ModelYoloV3ONNX::postprocess(InferenceResult& infResult) { + // Get info about input image + const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth; + const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight; + + // Get outputs tensors + const ov::Tensor& boxes = infResult.outputsData[boxesOutputName]; + const float* boxesPtr = boxes.data<float>(); + + const ov::Tensor& scores = infResult.outputsData[scoresOutputName]; + const ov::Tensor& indices = infResult.outputsData[indicesOutputName]; + + const int* indicesData = indices.data<int>(); + const auto indicesShape = indices.get_shape(); + const auto boxShape = boxes.get_shape(); + + // Generate detection results + DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); + size_t numberOfBoxes = indicesShape.size() == 3 ? indicesShape[1] : indicesShape[0]; + int indicesStride = indicesShape.size() == 3 ? indicesShape[2] : indicesShape[1]; + + for (size_t i = 0; i < numberOfBoxes; ++i) { + int batchInd = indicesData[i * indicesStride]; + int classInd = indicesData[i * indicesStride + 1]; + int boxInd = indicesData[i * indicesStride + 2]; + + if (batchInd == -1) { + break; + } + + float score = getScore(scores, classInd, boxInd); + + if (score > confidenceThreshold) { + DetectedObject obj; + size_t startPos = boxShape[2] * boxInd; + + auto x = boxesPtr[startPos + 1]; + auto y = boxesPtr[startPos]; + auto width = boxesPtr[startPos + 3] - x; + auto height = boxesPtr[startPos + 2] - y; + + // Create new detected box + obj.x = clamp(x, 0.f, static_cast<float>(imgWidth)); + obj.y = clamp(y, 0.f, static_cast<float>(imgHeight)); + obj.height = clamp(height, 0.f, static_cast<float>(imgHeight)); + obj.width = clamp(width, 0.f, static_cast<float>(imgWidth)); + obj.confidence = score; + obj.labelID = classInd; + obj.label = getLabelName(classInd); + + result->objects.push_back(obj); + + } + } + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/detection_model_yolox.cpp b/python/openvino/runtime/common/models/src/detection_model_yolox.cpp new file mode 100644 index 0000000..1e434ff --- /dev/null +++ b/python/openvino/runtime/common/models/src/detection_model_yolox.cpp @@ -0,0 +1,194 @@ +/* +// Copyright (C) 2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/detection_model_yolox.h" + +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <stdexcept> +#include <string> +#include <utility> +#include <vector> + +#include <openvino/openvino.hpp> + +#include <utils/common.hpp> +#include <utils/slog.hpp> + +#include "models/input_data.h" +#include "models/internal_model_data.h" +#include "models/results.h" +#include "utils/image_utils.h" +#include "utils/nms.hpp" + +ModelYoloX::ModelYoloX(const std::string& modelFileName, + float confidenceThreshold, + float boxIOUThreshold, + const std::vector<std::string>& labels, + const std::string& layout) + : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout), + boxIOUThreshold(boxIOUThreshold) { + resizeMode = RESIZE_KEEP_ASPECT; +} + +void ModelYoloX::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + const ov::OutputVector& inputs = model->inputs(); + if (inputs.size() != 1) { + throw std::logic_error("YOLOX model wrapper accepts models that have only 1 input"); + } + + //--- Check image input + const auto& input = model->input(); + const ov::Shape& inputShape = model->input().get_shape(); + ov::Layout inputLayout = getInputLayout(input); + + if (inputShape.size() != 4 && inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("Expected 4D image input with 3 channels"); + } + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); + + ppp.input().model().set_layout(inputLayout); + + //--- Reading image input parameters + inputsNames.push_back(input.get_any_name()); + netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; + netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; + setStridesGrids(); + + // --------------------------- Prepare output ----------------------------------------------------- + if (model->outputs().size() != 1) { + throw std::logic_error("YoloX model wrapper expects models that have only 1 output"); + } + const auto& output = model->output(); + outputsNames.push_back(output.get_any_name()); + const ov::Shape& shape = output.get_shape(); + + if (shape.size() != 3) { + throw std::logic_error("YOLOX single output must have 3 dimensions, but had " + std::to_string(shape.size())); + } + ppp.output().tensor().set_element_type(ov::element::f32); + + model = ppp.build(); +} + +void ModelYoloX::setStridesGrids() { + std::vector<size_t> strides = {8, 16, 32}; + std::vector<size_t> hsizes(3); + std::vector<size_t> wsizes(3); + + for (size_t i = 0; i < strides.size(); ++i) { + hsizes[i] = netInputHeight / strides[i]; + wsizes[i] = netInputWidth / strides[i]; + } + + for (size_t size_index = 0; size_index < hsizes.size(); ++size_index) { + for (size_t h_index = 0; h_index < hsizes[size_index]; ++h_index) { + for (size_t w_index = 0; w_index < wsizes[size_index]; ++w_index) { + 
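+                // One grid cell is generated per spatial position of every stride level;
+                // postprocess() shifts the raw predictions by these grid offsets and scales
+                // them by the matching stride to obtain absolute box centers and sizes.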
grids.emplace_back(w_index, h_index); + expandedStrides.push_back(strides[size_index]); + } + } + } +} + +std::shared_ptr<InternalModelData> ModelYoloX::preprocess(const InputData& inputData, + ov::InferRequest& request) { + const auto& origImg = inputData.asRef<ImageInputData>().inputImage; + double scale = std::min(static_cast<double>(netInputWidth) / origImg.cols, + static_cast<double>(netInputHeight) / origImg.rows); + + cv::Mat resizedImage = resizeImageExt(origImg, netInputWidth, netInputHeight, resizeMode, + interpolationMode, nullptr, cv::Scalar(114, 114, 114)); + + request.set_input_tensor(wrapMat2Tensor(resizedImage)); + return std::make_shared<InternalScaleData>(origImg.cols, origImg.rows, scale, scale); +} + +std::unique_ptr<ResultBase> ModelYoloX::postprocess(InferenceResult& infResult) { + // Get metadata about input image shape and scale + const auto& scale = infResult.internalModelData->asRef<InternalScaleData>(); + + // Get output tensor + const ov::Tensor& output = infResult.outputsData[outputsNames[0]]; + const auto& outputShape = output.get_shape(); + float* outputPtr = output.data<float>(); + + // Generate detection results + DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); + + // Update coordinates according to strides + for (size_t box_index = 0; box_index < expandedStrides.size(); ++box_index) { + size_t startPos = outputShape[2] * box_index; + outputPtr[startPos] = (outputPtr[startPos] + grids[box_index].first) * expandedStrides[box_index]; + outputPtr[startPos + 1] = (outputPtr[startPos + 1] + grids[box_index].second) * expandedStrides[box_index]; + outputPtr[startPos + 2] = std::exp(outputPtr[startPos + 2]) * expandedStrides[box_index]; + outputPtr[startPos + 3] = std::exp(outputPtr[startPos + 3]) * expandedStrides[box_index]; + } + + // Filter predictions + std::vector<Anchor> validBoxes; + std::vector<float> scores; + std::vector<size_t> classes; + for (size_t box_index = 0; box_index < expandedStrides.size(); ++box_index) { + size_t startPos = outputShape[2] * box_index; + float score = outputPtr[startPos + 4]; + if (score < confidenceThreshold) + continue; + float maxClassScore = -1; + size_t mainClass = 0; + for (size_t class_index = 0; class_index < numberOfClasses; ++class_index) { + if (outputPtr[startPos + 5 + class_index] > maxClassScore) { + maxClassScore = outputPtr[startPos + 5 + class_index]; + mainClass = class_index; + } + } + + // Filter by score + score *= maxClassScore; + if (score < confidenceThreshold) + continue; + + // Add successful boxes + scores.push_back(score); + classes.push_back(mainClass); + Anchor trueBox = {outputPtr[startPos + 0] - outputPtr[startPos + 2] / 2, outputPtr[startPos + 1] - outputPtr[startPos + 3] / 2, + outputPtr[startPos + 0] + outputPtr[startPos + 2] / 2, outputPtr[startPos + 1] + outputPtr[startPos + 3] / 2}; + validBoxes.push_back(Anchor({trueBox.left / scale.scaleX, trueBox.top / scale.scaleY, + trueBox.right / scale.scaleX, trueBox.bottom / scale.scaleY})); + } + + // NMS for valid boxes + std::vector<int> keep = nms(validBoxes, scores, boxIOUThreshold, true); + for (auto& index: keep) { + // Create new detected box + DetectedObject obj; + obj.x = clamp(validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth)); + obj.y = clamp(validBoxes[index].top, 0.f, static_cast<float>(scale.inputImgHeight)); + obj.height = clamp(validBoxes[index].bottom - validBoxes[index].top, 0.f, static_cast<float>(scale.inputImgHeight)); + obj.width = clamp(validBoxes[index].right - 
validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
+        obj.confidence = scores[index];
+        obj.labelID = classes[index];
+        obj.label = getLabelName(classes[index]);
+        result->objects.push_back(obj);
+    }
+
+    return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp b/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp
new file mode 100644
index 0000000..33a3604
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp
@@ -0,0 +1,264 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/hpe_model_associative_embedding.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/associative_embedding_decoder.h"
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+const cv::Vec3f HpeAssociativeEmbedding::meanPixel = cv::Vec3f::all(128);
+const float HpeAssociativeEmbedding::detectionThreshold = 0.1f;
+const float HpeAssociativeEmbedding::tagThreshold = 1.0f;
+
+HpeAssociativeEmbedding::HpeAssociativeEmbedding(const std::string& modelFileName,
+                                                 double aspectRatio,
+                                                 int targetSize,
+                                                 float confidenceThreshold,
+                                                 const std::string& layout,
+                                                 float delta,
+                                                 RESIZE_MODE resizeMode)
+    : ImageModel(modelFileName, false, layout),
+      aspectRatio(aspectRatio),
+      targetSize(targetSize),
+      confidenceThreshold(confidenceThreshold),
+      delta(delta) {
+    this->resizeMode = resizeMode;  // assign to the member, not back to the constructor parameter
+    interpolationMode = cv::INTER_CUBIC;
+}
+
+void HpeAssociativeEmbedding::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output -------------------------------------------------
+    // --------------------------- Prepare input Tensors ------------------------------------------------------
+    if (model->inputs().size() != 1) {
+        throw std::logic_error("HPE AE model wrapper supports topologies with only 1 input.");
+    }
+    inputsNames.push_back(model->input().get_any_name());
+
+    const ov::Shape& inputShape = model->input().get_shape();
+    const ov::Layout& inputLayout = getInputLayout(model->input());
+
+    if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+        inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+        throw std::logic_error("3-channel 4-dimensional model's input is expected");
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+    ppp.input().model().set_layout(inputLayout);
+
+    // --------------------------- Prepare output Tensors -----------------------------------------------------
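+    // Output tensors are matched by name prefix: "embeddings", "heatmaps" and, optionally,
+    // "nms_heatmaps". When no nms_heatmaps output is present, the raw heatmaps are reused
+    // for peak search (see the try/catch around findTensorByName() below).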
+    const ov::OutputVector& outputs = model->outputs();
+    if (outputs.size() != 2 && outputs.size() != 3) {
+        throw std::logic_error("HPE AE model wrapper supports topologies with only 2 or 3 outputs");
+    }
+
+    for (const auto& output : model->outputs()) {
+        const auto& outTensorName = output.get_any_name();
+        ppp.output(outTensorName).tensor().set_element_type(ov::element::f32);
+
+        for (const auto& name : output.get_names()) {
+            outputsNames.push_back(name);
+        }
+
+        const ov::Shape& outputShape = output.get_shape();
+        if (outputShape.size() != 4 && outputShape.size() != 5) {
+            throw std::logic_error("output tensors are expected to be 4-dimensional or 5-dimensional");
+        }
+        if (outputShape[ov::layout::batch_idx("NC...")] != 1 || outputShape[ov::layout::channels_idx("NC...")] != 17) {
+            throw std::logic_error("output tensors are expected to have 1 batch size and 17 channels");
+        }
+    }
+    model = ppp.build();
+
+    embeddingsTensorName = findTensorByName("embeddings", outputsNames);
+    heatmapsTensorName = findTensorByName("heatmaps", outputsNames);
+    try {
+        nmsHeatmapsTensorName = findTensorByName("nms_heatmaps", outputsNames);
+    } catch (const std::runtime_error&) {
+        nmsHeatmapsTensorName = heatmapsTensorName;
+    }
+
+    changeInputSize(model);
+}
+
+void HpeAssociativeEmbedding::changeInputSize(std::shared_ptr<ov::Model>& model) {
+    ov::Shape inputShape = model->input().get_shape();
+    const ov::Layout& layout = ov::layout::get_layout(model->input());
+    const auto batchId = ov::layout::batch_idx(layout);
+    const auto heightId = ov::layout::height_idx(layout);
+    const auto widthId = ov::layout::width_idx(layout);
+
+    if (!targetSize) {
+        targetSize = static_cast<int>(std::min(inputShape[heightId], inputShape[widthId]));
+    }
+    int inputHeight = aspectRatio >= 1.0 ? targetSize : static_cast<int>(std::round(targetSize / aspectRatio));
+    int inputWidth = aspectRatio >= 1.0 ?
static_cast<int>(std::round(targetSize * aspectRatio)) : targetSize; + int height = static_cast<int>((inputHeight + stride - 1) / stride) * stride; + int width = static_cast<int>((inputWidth + stride - 1) / stride) * stride; + inputShape[batchId] = 1; + inputShape[heightId] = height; + inputShape[widthId] = width; + inputLayerSize = cv::Size(width, height); + + model->reshape(inputShape); +} + +std::shared_ptr<InternalModelData> HpeAssociativeEmbedding::preprocess(const InputData& inputData, + ov::InferRequest& request) { + auto& image = inputData.asRef<ImageInputData>().inputImage; + cv::Rect roi; + auto paddedImage = resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi); + if (inputLayerSize.height - stride >= roi.height || inputLayerSize.width - stride >= roi.width) { + slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; + } + request.set_input_tensor(wrapMat2Tensor(paddedImage)); + + return std::make_shared<InternalScaleData>(paddedImage.cols, + paddedImage.rows, + image.size().width / static_cast<float>(roi.width), + image.size().height / static_cast<float>(roi.height)); +} + +std::unique_ptr<ResultBase> HpeAssociativeEmbedding::postprocess(InferenceResult& infResult) { + HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData); + + const auto& aembds = infResult.outputsData[embeddingsTensorName]; + const ov::Shape& aembdsShape = aembds.get_shape(); + float* const aembdsMapped = aembds.data<float>(); + std::vector<cv::Mat> aembdsMaps = split(aembdsMapped, aembdsShape); + + const auto& heats = infResult.outputsData[heatmapsTensorName]; + const ov::Shape& heatMapsShape = heats.get_shape(); + float* const heatMapsMapped = heats.data<float>(); + std::vector<cv::Mat> heatMaps = split(heatMapsMapped, heatMapsShape); + + std::vector<cv::Mat> nmsHeatMaps = heatMaps; + if (nmsHeatmapsTensorName != heatmapsTensorName) { + const auto& nmsHeats = infResult.outputsData[nmsHeatmapsTensorName]; + const ov::Shape& nmsHeatMapsShape = nmsHeats.get_shape(); + float* const nmsHeatMapsMapped = nmsHeats.data<float>(); + nmsHeatMaps = split(nmsHeatMapsMapped, nmsHeatMapsShape); + } + std::vector<HumanPose> poses = extractPoses(heatMaps, aembdsMaps, nmsHeatMaps); + + // Rescale poses to the original image + const auto& scale = infResult.internalModelData->asRef<InternalScaleData>(); + const float outputScale = inputLayerSize.width / static_cast<float>(heatMapsShape[3]); + float shiftX = 0.0, shiftY = 0.0; + float scaleX = 1.0, scaleY = 1.0; + + if (resizeMode == RESIZE_KEEP_ASPECT_LETTERBOX) { + scaleX = scaleY = std::min(scale.scaleX, scale.scaleY); + if (aspectRatio >= 1.0) + shiftX = static_cast<float>((targetSize * scaleX * aspectRatio - scale.inputImgWidth * scaleX) / 2); + else + shiftY = static_cast<float>((targetSize * scaleY / aspectRatio - scale.inputImgHeight * scaleY) / 2); + scaleX = scaleY *= outputScale; + } else { + scaleX = scale.scaleX * outputScale; + scaleY = scale.scaleY * outputScale; + } + + for (auto& pose : poses) { + for (auto& keypoint : pose.keypoints) { + if (keypoint != cv::Point2f(-1, -1)) { + keypoint.x = keypoint.x * scaleX + shiftX; + keypoint.y = keypoint.y * scaleY + shiftY; + } + } + result->poses.push_back(pose); + } + + return std::unique_ptr<ResultBase>(result); +} + +std::string HpeAssociativeEmbedding::findTensorByName(const std::string& tensorName, + const std::vector<std::string>& outputsNames) { + std::vector<std::string> suitableLayers; + for 
(auto& outputName : outputsNames) { + if (outputName.rfind(tensorName, 0) == 0) { + suitableLayers.push_back(outputName); + } + } + if (suitableLayers.empty()) { + throw std::runtime_error("Suitable tensor for " + tensorName + " output is not found"); + } else if (suitableLayers.size() > 1) { + throw std::runtime_error("More than 1 tensor matched to " + tensorName + " output"); + } + return suitableLayers[0]; +} + +std::vector<cv::Mat> HpeAssociativeEmbedding::split(float* data, const ov::Shape& shape) { + std::vector<cv::Mat> flattenData(shape[1]); + for (size_t i = 0; i < flattenData.size(); i++) { + flattenData[i] = cv::Mat(shape[2], shape[3], CV_32FC1, data + i * shape[2] * shape[3]); + } + return flattenData; +} + +std::vector<HumanPose> HpeAssociativeEmbedding::extractPoses(std::vector<cv::Mat>& heatMaps, + const std::vector<cv::Mat>& aembdsMaps, + const std::vector<cv::Mat>& nmsHeatMaps) const { + std::vector<std::vector<Peak>> allPeaks(numJoints); + for (int i = 0; i < numJoints; i++) { + findPeaks(nmsHeatMaps, aembdsMaps, allPeaks, i, maxNumPeople, detectionThreshold); + } + std::vector<Pose> allPoses = matchByTag(allPeaks, maxNumPeople, numJoints, tagThreshold); + // swap for all poses + for (auto& pose : allPoses) { + for (size_t j = 0; j < numJoints; j++) { + Peak& peak = pose.getPeak(j); + std::swap(peak.keypoint.x, peak.keypoint.y); + } + } + std::vector<HumanPose> poses; + for (size_t i = 0; i < allPoses.size(); i++) { + Pose& pose = allPoses[i]; + // Filtering poses with low mean scores + if (pose.getMeanScore() <= confidenceThreshold) { + continue; + } + for (size_t j = 0; j < heatMaps.size(); j++) { + heatMaps[j] = cv::abs(heatMaps[j]); + } + adjustAndRefine(allPoses, heatMaps, aembdsMaps, i, delta); + std::vector<cv::Point2f> keypoints; + for (size_t j = 0; j < numJoints; j++) { + Peak& peak = pose.getPeak(j); + keypoints.push_back(peak.keypoint); + } + poses.push_back({keypoints, pose.getMeanScore()}); + } + return poses; +} diff --git a/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp b/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp new file mode 100644 index 0000000..d8b4cb6 --- /dev/null +++ b/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp @@ -0,0 +1,256 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/hpe_model_openpose.h" + +#include <algorithm> +#include <cmath> +#include <map> +#include <stdexcept> +#include <string> +#include <utility> +#include <vector> + +#include <opencv2/imgproc.hpp> +#include <openvino/openvino.hpp> + +#include <utils/image_utils.h> +#include <utils/ocv_common.hpp> +#include <utils/slog.hpp> + +#include "models/input_data.h" +#include "models/internal_model_data.h" +#include "models/openpose_decoder.h" +#include "models/results.h" + +const cv::Vec3f HPEOpenPose::meanPixel = cv::Vec3f::all(128); +const float HPEOpenPose::minPeaksDistance = 3.0f; +const float HPEOpenPose::midPointsScoreThreshold = 0.05f; +const float HPEOpenPose::foundMidPointsRatioThreshold = 0.8f; +const float HPEOpenPose::minSubsetScore = 0.2f; + +HPEOpenPose::HPEOpenPose(const std::string& modelFileName, + double aspectRatio, + int targetSize, + float confidenceThreshold, + const std::string& layout) + : ImageModel(modelFileName, false, layout), + aspectRatio(aspectRatio), + targetSize(targetSize), + confidenceThreshold(confidenceThreshold) { + resizeMode = RESIZE_KEEP_ASPECT; + interpolationMode = cv::INTER_CUBIC; + } + +void HPEOpenPose::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("HPE OpenPose model wrapper supports topologies with only 1 input"); + } + inputsNames.push_back(model->input().get_any_name()); + const ov::Shape& inputShape = model->input().get_shape(); + const ov::Layout& inputLayout = getInputLayout(model->input()); + + if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || + inputShape[ov::layout::channels_idx(inputLayout)] != 3) + throw std::logic_error("3-channel 4-dimensional model's input is expected"); + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Prepare output ----------------------------------------------------- + const ov::OutputVector& outputs = model->outputs(); + if (outputs.size() != 2) { + throw std::runtime_error("HPE OpenPose supports topologies with only 2 outputs"); + } + + const ov::Layout outputLayout("NCHW"); + for (const auto& output : model->outputs()) { + const auto& outTensorName = output.get_any_name(); + ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout); + outputsNames.push_back(outTensorName); + } + model = ppp.build(); + + const size_t batchId = ov::layout::batch_idx(outputLayout); + const size_t channelsId = ov::layout::channels_idx(outputLayout); + const size_t widthId = ov::layout::width_idx(outputLayout); + const size_t heightId = ov::layout::height_idx(outputLayout); + + ov::Shape heatmapsOutputShape = model->outputs().front().get_shape(); + ov::Shape pafsOutputShape = model->outputs().back().get_shape(); + if (heatmapsOutputShape[channelsId] > pafsOutputShape[channelsId]) { + std::swap(heatmapsOutputShape, pafsOutputShape); + std::swap(outputsNames[0], outputsNames[1]); + } + + if (heatmapsOutputShape.size() != 4 || heatmapsOutputShape[batchId] != 1 || + heatmapsOutputShape[ov::layout::channels_idx(outputLayout)] != keypointsNumber + 1) { + throw std::logic_error("1x" + std::to_string(keypointsNumber 
+ 1) + + "xHFMxWFM dimension of model's heatmap is expected"); + } + if (pafsOutputShape.size() != 4 || pafsOutputShape[batchId] != 1 || + pafsOutputShape[channelsId] != 2 * (keypointsNumber + 1)) { + throw std::logic_error("1x" + std::to_string(2 * (keypointsNumber + 1)) + + "xHFMxWFM dimension of model's output is expected"); + } + if (pafsOutputShape[heightId] != heatmapsOutputShape[heightId] || + pafsOutputShape[widthId] != heatmapsOutputShape[widthId]) { + throw std::logic_error("output and heatmap are expected to have matching last two dimensions"); + } + + changeInputSize(model); +} + +void HPEOpenPose::changeInputSize(std::shared_ptr<ov::Model>& model) { + ov::Shape inputShape = model->input().get_shape(); + const ov::Layout& layout = ov::layout::get_layout(model->inputs().front()); + const auto batchId = ov::layout::batch_idx(layout); + const auto heightId = ov::layout::height_idx(layout); + const auto widthId = ov::layout::width_idx(layout); + + if (!targetSize) { + targetSize = inputShape[heightId]; + } + int height = static_cast<int>((targetSize + stride - 1) / stride) * stride; + int inputWidth = static_cast<int>(std::round(targetSize * aspectRatio)); + int width = static_cast<int>((inputWidth + stride - 1) / stride) * stride; + inputShape[batchId] = 1; + inputShape[heightId] = height; + inputShape[widthId] = width; + inputLayerSize = cv::Size(width, height); + model->reshape(inputShape); +} + +std::shared_ptr<InternalModelData> HPEOpenPose::preprocess(const InputData& inputData, ov::InferRequest& request) { + auto& image = inputData.asRef<ImageInputData>().inputImage; + cv::Rect roi; + auto paddedImage = + resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi); + if (inputLayerSize.width < roi.width) + throw std::runtime_error("The image aspect ratio doesn't fit current model shape"); + + if (inputLayerSize.width - stride >= roi.width) { + slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; + } + + request.set_input_tensor(wrapMat2Tensor(paddedImage)); + return std::make_shared<InternalScaleData>(paddedImage.cols, + paddedImage.rows, + image.cols / static_cast<float>(roi.width), + image.rows / static_cast<float>(roi.height)); +} + +std::unique_ptr<ResultBase> HPEOpenPose::postprocess(InferenceResult& infResult) { + HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData); + + const auto& heatMapsMapped = infResult.outputsData[outputsNames[0]]; + const auto& outputMapped = infResult.outputsData[outputsNames[1]]; + + const ov::Shape& outputShape = outputMapped.get_shape(); + const ov::Shape& heatMapShape = heatMapsMapped.get_shape(); + + float* const predictions = outputMapped.data<float>(); + float* const heats = heatMapsMapped.data<float>(); + + std::vector<cv::Mat> heatMaps(keypointsNumber); + for (size_t i = 0; i < heatMaps.size(); i++) { + heatMaps[i] = + cv::Mat(heatMapShape[2], heatMapShape[3], CV_32FC1, heats + i * heatMapShape[2] * heatMapShape[3]); + } + resizeFeatureMaps(heatMaps); + + std::vector<cv::Mat> pafs(outputShape[1]); + for (size_t i = 0; i < pafs.size(); i++) { + pafs[i] = + cv::Mat(heatMapShape[2], heatMapShape[3], CV_32FC1, predictions + i * heatMapShape[2] * heatMapShape[3]); + } + resizeFeatureMaps(pafs); + + std::vector<HumanPose> poses = extractPoses(heatMaps, pafs); + + const auto& scale = infResult.internalModelData->asRef<InternalScaleData>(); + float scaleX = stride / upsampleRatio * scale.scaleX; + float scaleY = stride / 
upsampleRatio * scale.scaleY; + for (auto& pose : poses) { + for (auto& keypoint : pose.keypoints) { + if (keypoint != cv::Point2f(-1, -1)) { + keypoint.x *= scaleX; + keypoint.y *= scaleY; + } + } + } + for (size_t i = 0; i < poses.size(); ++i) { + result->poses.push_back(poses[i]); + } + + return std::unique_ptr<ResultBase>(result); +} + +void HPEOpenPose::resizeFeatureMaps(std::vector<cv::Mat>& featureMaps) const { + for (auto& featureMap : featureMaps) { + cv::resize(featureMap, featureMap, cv::Size(), upsampleRatio, upsampleRatio, cv::INTER_CUBIC); + } +} + +class FindPeaksBody : public cv::ParallelLoopBody { +public: + FindPeaksBody(const std::vector<cv::Mat>& heatMaps, + float minPeaksDistance, + std::vector<std::vector<Peak>>& peaksFromHeatMap, + float confidenceThreshold) + : heatMaps(heatMaps), + minPeaksDistance(minPeaksDistance), + peaksFromHeatMap(peaksFromHeatMap), + confidenceThreshold(confidenceThreshold) {} + + void operator()(const cv::Range& range) const override { + for (int i = range.start; i < range.end; i++) { + findPeaks(heatMaps, minPeaksDistance, peaksFromHeatMap, i, confidenceThreshold); + } + } + +private: + const std::vector<cv::Mat>& heatMaps; + float minPeaksDistance; + std::vector<std::vector<Peak>>& peaksFromHeatMap; + float confidenceThreshold; +}; + +std::vector<HumanPose> HPEOpenPose::extractPoses(const std::vector<cv::Mat>& heatMaps, + const std::vector<cv::Mat>& pafs) const { + std::vector<std::vector<Peak>> peaksFromHeatMap(heatMaps.size()); + FindPeaksBody findPeaksBody(heatMaps, minPeaksDistance, peaksFromHeatMap, confidenceThreshold); + cv::parallel_for_(cv::Range(0, static_cast<int>(heatMaps.size())), findPeaksBody); + int peaksBefore = 0; + for (size_t heatmapId = 1; heatmapId < heatMaps.size(); heatmapId++) { + peaksBefore += static_cast<int>(peaksFromHeatMap[heatmapId - 1].size()); + for (auto& peak : peaksFromHeatMap[heatmapId]) { + peak.id += peaksBefore; + } + } + std::vector<HumanPose> poses = groupPeaksToPoses(peaksFromHeatMap, + pafs, + keypointsNumber, + midPointsScoreThreshold, + foundMidPointsRatioThreshold, + minJointsNumber, + minSubsetScore); + return poses; +} diff --git a/python/openvino/runtime/common/models/src/image_model.cpp b/python/openvino/runtime/common/models/src/image_model.cpp new file mode 100644 index 0000000..511faf3 --- /dev/null +++ b/python/openvino/runtime/common/models/src/image_model.cpp @@ -0,0 +1,57 @@ +/* +// Copyright (C) 2021-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/image_model.h" + +#include <stdexcept> +#include <vector> + +#include <opencv2/core.hpp> +#include <openvino/openvino.hpp> + +#include <utils/image_utils.h> +#include <utils/ocv_common.hpp> + +#include "models/input_data.h" +#include "models/internal_model_data.h" + +ImageModel::ImageModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout) + : ModelBase(modelFileName, layout), + useAutoResize(useAutoResize) {} + +std::shared_ptr<InternalModelData> ImageModel::preprocess(const InputData& inputData, ov::InferRequest& request) { + const auto& origImg = inputData.asRef<ImageInputData>().inputImage; + auto img = inputTransform(origImg); + + if (!useAutoResize) { + // /* Resize and copy data from the image to the input tensor */ + const ov::Tensor& frameTensor = request.get_tensor(inputsNames[0]); // first input should be image + const ov::Shape& tensorShape = frameTensor.get_shape(); + const ov::Layout layout("NHWC"); + const size_t width = tensorShape[ov::layout::width_idx(layout)]; + const size_t height = tensorShape[ov::layout::height_idx(layout)]; + const size_t channels = tensorShape[ov::layout::channels_idx(layout)]; + if (static_cast<size_t>(img.channels()) != channels) { + throw std::runtime_error("The number of channels for model input and image must match"); + } + if (channels != 1 && channels != 3) { + throw std::runtime_error("Unsupported number of channels"); + } + img = resizeImageExt(img, width, height, resizeMode, interpolationMode); + } + request.set_tensor(inputsNames[0], wrapMat2Tensor(img)); + return std::make_shared<InternalImageModelData>(origImg.cols, origImg.rows); +} diff --git a/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp b/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp new file mode 100644 index 0000000..8eb3ae1 --- /dev/null +++ b/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp @@ -0,0 +1,167 @@ +/* +// Copyright (C) 2021-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include <stddef.h> + +#include <algorithm> +#include <memory> +#include <stdexcept> +#include <string> +#include <vector> + +#include <opencv2/core.hpp> +#include <opencv2/imgcodecs.hpp> +#include <opencv2/imgproc.hpp> +#include <openvino/openvino.hpp> + +#include <utils/ocv_common.hpp> +#include <utils/slog.hpp> + +#include "models/image_model.h" +#include "models/input_data.h" +#include "models/internal_model_data.h" +#include "models/jpeg_restoration_model.h" +#include "models/results.h" + +JPEGRestorationModel::JPEGRestorationModel(const std::string& modelFileName, + const cv::Size& inputImgSize, + bool _jpegCompression, + const std::string& layout) + : ImageModel(modelFileName, false, layout) { + netInputHeight = inputImgSize.height; + netInputWidth = inputImgSize.width; + jpegCompression = _jpegCompression; +} + +void JPEGRestorationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output ------------------------------------------------- + // --------------------------- Prepare input ------------------------------------------------------ + if (model->inputs().size() != 1) { + throw std::logic_error("The JPEG Restoration model wrapper supports topologies with only 1 input"); + } + inputsNames.push_back(model->input().get_any_name()); + + const ov::Shape& inputShape = model->input().get_shape(); + const ov::Layout& inputLayout = getInputLayout(model->input()); + + if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || + inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("3-channel 4-dimensional model's input is expected"); + } + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC"); + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Prepare output ----------------------------------------------------- + const ov::OutputVector& outputs = model->outputs(); + if (outputs.size() != 1) { + throw std::logic_error("The JPEG Restoration model wrapper supports topologies with only 1 output"); + } + const ov::Shape& outputShape = model->output().get_shape(); + const ov::Layout outputLayout{"NCHW"}; + if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 || + outputShape[ov::layout::channels_idx(outputLayout)] != 3) { + throw std::logic_error("3-channel 4-dimensional model's output is expected"); + } + + outputsNames.push_back(model->output().get_any_name()); + ppp.output().tensor().set_element_type(ov::element::f32); + model = ppp.build(); + + changeInputSize(model); +} + +void JPEGRestorationModel::changeInputSize(std::shared_ptr<ov::Model>& model) { + ov::Shape inputShape = model->input().get_shape(); + const ov::Layout& layout = ov::layout::get_layout(model->input()); + + const auto batchId = ov::layout::batch_idx(layout); + const auto heightId = ov::layout::height_idx(layout); + const auto widthId = ov::layout::width_idx(layout); + + if (inputShape[heightId] % stride || inputShape[widthId] % stride) { + throw std::logic_error("The shape of the model input must be divisible by stride"); + } + + netInputHeight = static_cast<int>((netInputHeight + stride - 1) / stride) * stride; + netInputWidth = static_cast<int>((netInputWidth + stride - 1) / stride) * stride; + + inputShape[batchId] = 1; + inputShape[heightId] = netInputHeight; + inputShape[widthId] = netInputWidth; + + model->reshape(inputShape); +} + 
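+// preprocess() below optionally re-encodes the frame as a quality-40 JPEG to
+// synthesize compression artifacts, then zero-pads it up to the network input size
+// when the frame already fits within one stride of it, or falls back to a plain
+// resize with a warning about the mismatched aspect ratio.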
+std::shared_ptr<InternalModelData> JPEGRestorationModel::preprocess(const InputData& inputData, + ov::InferRequest& request) { + cv::Mat image = inputData.asRef<ImageInputData>().inputImage; + const size_t h = image.rows; + const size_t w = image.cols; + cv::Mat resizedImage; + if (jpegCompression) { + std::vector<uchar> encimg; + std::vector<int> params{cv::IMWRITE_JPEG_QUALITY, 40}; + cv::imencode(".jpg", image, encimg, params); + image = cv::imdecode(cv::Mat(encimg), 3); + } + + if (netInputHeight - stride < h && h <= netInputHeight && netInputWidth - stride < w && w <= netInputWidth) { + int bottom = netInputHeight - h; + int right = netInputWidth - w; + cv::copyMakeBorder(image, resizedImage, 0, bottom, 0, right, cv::BORDER_CONSTANT, 0); + } else { + slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; + cv::resize(image, resizedImage, cv::Size(netInputWidth, netInputHeight)); + } + request.set_input_tensor(wrapMat2Tensor(resizedImage)); + + return std::make_shared<InternalImageModelData>(image.cols, image.rows); +} + +std::unique_ptr<ResultBase> JPEGRestorationModel::postprocess(InferenceResult& infResult) { + ImageResult* result = new ImageResult; + *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult); + + const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>(); + const auto outputData = infResult.getFirstOutputTensor().data<float>(); + + std::vector<cv::Mat> imgPlanes; + const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape(); + const size_t outHeight = static_cast<int>(outputShape[2]); + const size_t outWidth = static_cast<int>(outputShape[3]); + const size_t numOfPixels = outWidth * outHeight; + imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])), + cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])), + cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))}; + cv::Mat resultImg; + cv::merge(imgPlanes, resultImg); + + if (netInputHeight - stride < static_cast<size_t>(inputImgSize.inputImgHeight) && + static_cast<size_t>(inputImgSize.inputImgHeight) <= netInputHeight && + netInputWidth - stride < static_cast<size_t>(inputImgSize.inputImgWidth) && + static_cast<size_t>(inputImgSize.inputImgWidth) <= netInputWidth) { + result->resultImage = resultImg(cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); + } else { + cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); + } + + result->resultImage.convertTo(result->resultImage, CV_8UC3, 255); + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/model_base.cpp b/python/openvino/runtime/common/models/src/model_base.cpp new file mode 100644 index 0000000..c2ebd1b --- /dev/null +++ b/python/openvino/runtime/common/models/src/model_base.cpp @@ -0,0 +1,67 @@ +/* +// Copyright (C) 2021-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/model_base.h" + +#include <utility> + +#include <openvino/openvino.hpp> + +#include <utils/common.hpp> +#include <utils/config_factory.h> +#include <utils/ocv_common.hpp> +#include <utils/slog.hpp> + +std::shared_ptr<ov::Model> ModelBase::prepareModel(ov::Core& core) { + // --------------------------- Read IR Generated by ModelOptimizer (.xml and .bin files) ------------ + /** Read model **/ + slog::info << "Reading model " << modelFileName << slog::endl; + std::shared_ptr<ov::Model> model = core.read_model(modelFileName); + logBasicModelInfo(model); + // -------------------------- Reading all outputs names and customizing I/O tensors (in inherited classes) + prepareInputsOutputs(model); + + /** Set batch size to 1 **/ + ov::set_batch(model, 1); + + return model; +} + +ov::CompiledModel ModelBase::compileModel(const ModelConfig& config, ov::Core& core) { + this->config = config; + auto model = prepareModel(core); + compiledModel = core.compile_model(model, config.deviceName, config.compiledModelConfig); + logCompiledModelInfo(compiledModel, modelFileName, config.deviceName); + return compiledModel; +} + +ov::Layout ModelBase::getInputLayout(const ov::Output<ov::Node>& input) { + const ov::Shape& inputShape = input.get_shape(); + ov::Layout layout = ov::layout::get_layout(input); + if (layout.empty()) { + if (inputsLayouts.empty()) { + layout = getLayoutFromShape(inputShape); + slog::warn << "Automatically detected layout '" << layout.to_string() << "' for input '" + << input.get_any_name() << "' will be used." << slog::endl; + } else if (inputsLayouts.size() == 1) { + layout = inputsLayouts.begin()->second; + } else { + layout = inputsLayouts[input.get_any_name()]; + } + } + + return layout; +} diff --git a/python/openvino/runtime/common/models/src/openpose_decoder.cpp b/python/openvino/runtime/common/models/src/openpose_decoder.cpp new file mode 100644 index 0000000..6d51607 --- /dev/null +++ b/python/openvino/runtime/common/models/src/openpose_decoder.cpp @@ -0,0 +1,345 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/openpose_decoder.h" + +#include <algorithm> +#include <cmath> +#include <memory> +#include <utility> +#include <vector> + +#include <utils/common.hpp> + +#include "models/results.h" + +Peak::Peak(const int id, const cv::Point2f& pos, const float score) : id(id), pos(pos), score(score) {} + +HumanPoseByPeaksIndices::HumanPoseByPeaksIndices(const int keypointsNumber) + : peaksIndices(std::vector<int>(keypointsNumber, -1)), + nJoints(0), + score(0.0f) {} + +TwoJointsConnection::TwoJointsConnection(const int firstJointIdx, const int secondJointIdx, const float score) + : firstJointIdx(firstJointIdx), + secondJointIdx(secondJointIdx), + score(score) {} + +void findPeaks(const std::vector<cv::Mat>& heatMaps, + const float minPeaksDistance, + std::vector<std::vector<Peak>>& allPeaks, + int heatMapId, + float confidenceThreshold) { + std::vector<cv::Point> peaks; + const cv::Mat& heatMap = heatMaps[heatMapId]; + const float* heatMapData = heatMap.ptr<float>(); + size_t heatMapStep = heatMap.step1(); + for (int y = -1; y < heatMap.rows + 1; y++) { + for (int x = -1; x < heatMap.cols + 1; x++) { + float val = 0; + if (x >= 0 && y >= 0 && x < heatMap.cols && y < heatMap.rows) { + val = heatMapData[y * heatMapStep + x]; + val = val >= confidenceThreshold ? val : 0; + } + + float left_val = 0; + if (y >= 0 && x < (heatMap.cols - 1) && y < heatMap.rows) { + left_val = heatMapData[y * heatMapStep + x + 1]; + left_val = left_val >= confidenceThreshold ? left_val : 0; + } + + float right_val = 0; + if (x > 0 && y >= 0 && y < heatMap.rows) { + right_val = heatMapData[y * heatMapStep + x - 1]; + right_val = right_val >= confidenceThreshold ? right_val : 0; + } + + float top_val = 0; + if (x >= 0 && x < heatMap.cols && y < (heatMap.rows - 1)) { + top_val = heatMapData[(y + 1) * heatMapStep + x]; + top_val = top_val >= confidenceThreshold ? top_val : 0; + } + + float bottom_val = 0; + if (x >= 0 && y > 0 && x < heatMap.cols) { + bottom_val = heatMapData[(y - 1) * heatMapStep + x]; + bottom_val = bottom_val >= confidenceThreshold ? 
bottom_val : 0; + } + + if ((val > left_val) && (val > right_val) && (val > top_val) && (val > bottom_val)) { + peaks.push_back(cv::Point(x, y)); + } + } + } + std::sort(peaks.begin(), peaks.end(), [](const cv::Point& a, const cv::Point& b) { + return a.x < b.x; + }); + std::vector<bool> isActualPeak(peaks.size(), true); + int peakCounter = 0; + std::vector<Peak>& peaksWithScoreAndID = allPeaks[heatMapId]; + for (size_t i = 0; i < peaks.size(); i++) { + if (isActualPeak[i]) { + for (size_t j = i + 1; j < peaks.size(); j++) { + if (sqrt((peaks[i].x - peaks[j].x) * (peaks[i].x - peaks[j].x) + + (peaks[i].y - peaks[j].y) * (peaks[i].y - peaks[j].y)) < minPeaksDistance) { + isActualPeak[j] = false; + } + } + peaksWithScoreAndID.push_back(Peak(peakCounter++, peaks[i], heatMap.at<float>(peaks[i]))); + } + } +} + +std::vector<HumanPose> groupPeaksToPoses(const std::vector<std::vector<Peak>>& allPeaks, + const std::vector<cv::Mat>& pafs, + const size_t keypointsNumber, + const float midPointsScoreThreshold, + const float foundMidPointsRatioThreshold, + const int minJointsNumber, + const float minSubsetScore) { + static const std::pair<int, int> limbIdsHeatmap[] = {{2, 3}, + {2, 6}, + {3, 4}, + {4, 5}, + {6, 7}, + {7, 8}, + {2, 9}, + {9, 10}, + {10, 11}, + {2, 12}, + {12, 13}, + {13, 14}, + {2, 1}, + {1, 15}, + {15, 17}, + {1, 16}, + {16, 18}, + {3, 17}, + {6, 18}}; + static const std::pair<int, int> limbIdsPaf[] = {{31, 32}, + {39, 40}, + {33, 34}, + {35, 36}, + {41, 42}, + {43, 44}, + {19, 20}, + {21, 22}, + {23, 24}, + {25, 26}, + {27, 28}, + {29, 30}, + {47, 48}, + {49, 50}, + {53, 54}, + {51, 52}, + {55, 56}, + {37, 38}, + {45, 46}}; + + std::vector<Peak> candidates; + for (const auto& peaks : allPeaks) { + candidates.insert(candidates.end(), peaks.begin(), peaks.end()); + } + std::vector<HumanPoseByPeaksIndices> subset(0, HumanPoseByPeaksIndices(keypointsNumber)); + for (size_t k = 0; k < arraySize(limbIdsPaf); k++) { + std::vector<TwoJointsConnection> connections; + const int mapIdxOffset = keypointsNumber + 1; + std::pair<cv::Mat, cv::Mat> scoreMid = {pafs[limbIdsPaf[k].first - mapIdxOffset], + pafs[limbIdsPaf[k].second - mapIdxOffset]}; + const int idxJointA = limbIdsHeatmap[k].first - 1; + const int idxJointB = limbIdsHeatmap[k].second - 1; + const std::vector<Peak>& candA = allPeaks[idxJointA]; + const std::vector<Peak>& candB = allPeaks[idxJointB]; + const size_t nJointsA = candA.size(); + const size_t nJointsB = candB.size(); + if (nJointsA == 0 && nJointsB == 0) { + continue; + } else if (nJointsA == 0) { + for (size_t i = 0; i < nJointsB; i++) { + int num = 0; + for (size_t j = 0; j < subset.size(); j++) { + if (subset[j].peaksIndices[idxJointB] == candB[i].id) { + num++; + continue; + } + } + if (num == 0) { + HumanPoseByPeaksIndices personKeypoints(keypointsNumber); + personKeypoints.peaksIndices[idxJointB] = candB[i].id; + personKeypoints.nJoints = 1; + personKeypoints.score = candB[i].score; + subset.push_back(personKeypoints); + } + } + continue; + } else if (nJointsB == 0) { + for (size_t i = 0; i < nJointsA; i++) { + int num = 0; + for (size_t j = 0; j < subset.size(); j++) { + if (subset[j].peaksIndices[idxJointA] == candA[i].id) { + num++; + continue; + } + } + if (num == 0) { + HumanPoseByPeaksIndices personKeypoints(keypointsNumber); + personKeypoints.peaksIndices[idxJointA] = candA[i].id; + personKeypoints.nJoints = 1; + personKeypoints.score = candA[i].score; + subset.push_back(personKeypoints); + } + } + continue; + } + + std::vector<TwoJointsConnection> 
tempJointConnections; + for (size_t i = 0; i < nJointsA; i++) { + for (size_t j = 0; j < nJointsB; j++) { + cv::Point2f pt = candA[i].pos * 0.5 + candB[j].pos * 0.5; + cv::Point mid = cv::Point(cvRound(pt.x), cvRound(pt.y)); + cv::Point2f vec = candB[j].pos - candA[i].pos; + double norm_vec = cv::norm(vec); + if (norm_vec == 0) { + continue; + } + vec /= norm_vec; + float score = vec.x * scoreMid.first.at<float>(mid) + vec.y * scoreMid.second.at<float>(mid); + int height_n = pafs[0].rows / 2; + float suc_ratio = 0.0f; + float mid_score = 0.0f; + const int mid_num = 10; + const float scoreThreshold = -100.0f; + if (score > scoreThreshold) { + float p_sum = 0; + int p_count = 0; + cv::Size2f step((candB[j].pos.x - candA[i].pos.x) / (mid_num - 1), + (candB[j].pos.y - candA[i].pos.y) / (mid_num - 1)); + for (int n = 0; n < mid_num; n++) { + cv::Point midPoint(cvRound(candA[i].pos.x + n * step.width), + cvRound(candA[i].pos.y + n * step.height)); + cv::Point2f pred(scoreMid.first.at<float>(midPoint), scoreMid.second.at<float>(midPoint)); + score = vec.x * pred.x + vec.y * pred.y; + if (score > midPointsScoreThreshold) { + p_sum += score; + p_count++; + } + } + suc_ratio = static_cast<float>(p_count / mid_num); + float ratio = p_count > 0 ? p_sum / p_count : 0.0f; + mid_score = ratio + static_cast<float>(std::min(height_n / norm_vec - 1, 0.0)); + } + if (mid_score > 0 && suc_ratio > foundMidPointsRatioThreshold) { + tempJointConnections.push_back(TwoJointsConnection(i, j, mid_score)); + } + } + } + if (!tempJointConnections.empty()) { + std::sort(tempJointConnections.begin(), + tempJointConnections.end(), + [](const TwoJointsConnection& a, const TwoJointsConnection& b) { + return (a.score > b.score); + }); + } + size_t num_limbs = std::min(nJointsA, nJointsB); + size_t cnt = 0; + std::vector<int> occurA(nJointsA, 0); + std::vector<int> occurB(nJointsB, 0); + for (size_t row = 0; row < tempJointConnections.size(); row++) { + if (cnt == num_limbs) { + break; + } + const int& indexA = tempJointConnections[row].firstJointIdx; + const int& indexB = tempJointConnections[row].secondJointIdx; + const float& score = tempJointConnections[row].score; + if (occurA[indexA] == 0 && occurB[indexB] == 0) { + connections.push_back(TwoJointsConnection(candA[indexA].id, candB[indexB].id, score)); + cnt++; + occurA[indexA] = 1; + occurB[indexB] = 1; + } + } + if (connections.empty()) { + continue; + } + + bool extraJointConnections = (k == 17 || k == 18); + if (k == 0) { + subset = std::vector<HumanPoseByPeaksIndices>(connections.size(), HumanPoseByPeaksIndices(keypointsNumber)); + for (size_t i = 0; i < connections.size(); i++) { + const int& indexA = connections[i].firstJointIdx; + const int& indexB = connections[i].secondJointIdx; + subset[i].peaksIndices[idxJointA] = indexA; + subset[i].peaksIndices[idxJointB] = indexB; + subset[i].nJoints = 2; + subset[i].score = candidates[indexA].score + candidates[indexB].score + connections[i].score; + } + } else if (extraJointConnections) { + for (size_t i = 0; i < connections.size(); i++) { + const int& indexA = connections[i].firstJointIdx; + const int& indexB = connections[i].secondJointIdx; + for (size_t j = 0; j < subset.size(); j++) { + if (subset[j].peaksIndices[idxJointA] == indexA && subset[j].peaksIndices[idxJointB] == -1) { + subset[j].peaksIndices[idxJointB] = indexB; + } else if (subset[j].peaksIndices[idxJointB] == indexB && subset[j].peaksIndices[idxJointA] == -1) { + subset[j].peaksIndices[idxJointA] = indexA; + } + } + } + continue; + } else { + for 
(size_t i = 0; i < connections.size(); i++) { + const int& indexA = connections[i].firstJointIdx; + const int& indexB = connections[i].secondJointIdx; + bool num = false; + for (size_t j = 0; j < subset.size(); j++) { + if (subset[j].peaksIndices[idxJointA] == indexA) { + subset[j].peaksIndices[idxJointB] = indexB; + subset[j].nJoints++; + subset[j].score += candidates[indexB].score + connections[i].score; + num = true; + } + } + if (!num) { + HumanPoseByPeaksIndices hpWithScore(keypointsNumber); + hpWithScore.peaksIndices[idxJointA] = indexA; + hpWithScore.peaksIndices[idxJointB] = indexB; + hpWithScore.nJoints = 2; + hpWithScore.score = candidates[indexA].score + candidates[indexB].score + connections[i].score; + subset.push_back(hpWithScore); + } + } + } + } + std::vector<HumanPose> poses; + for (const auto& subsetI : subset) { + if (subsetI.nJoints < minJointsNumber || subsetI.score / subsetI.nJoints < minSubsetScore) { + continue; + } + int position = -1; + HumanPose pose{std::vector<cv::Point2f>(keypointsNumber, cv::Point2f(-1.0f, -1.0f)), + subsetI.score * std::max(0, subsetI.nJoints - 1)}; + for (const auto& peakIdx : subsetI.peaksIndices) { + position++; + if (peakIdx >= 0) { + pose.keypoints[position] = candidates[peakIdx].pos; + pose.keypoints[position].x += 0.5; + pose.keypoints[position].y += 0.5; + } + } + poses.push_back(pose); + } + return poses; +} diff --git a/python/openvino/runtime/common/models/src/segmentation_model.cpp b/python/openvino/runtime/common/models/src/segmentation_model.cpp new file mode 100644 index 0000000..82a153b --- /dev/null +++ b/python/openvino/runtime/common/models/src/segmentation_model.cpp @@ -0,0 +1,157 @@ +/* +// Copyright (C) 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/segmentation_model.h" + +#include <stddef.h> +#include <stdint.h> + +#include <fstream> +#include <stdexcept> +#include <string> +#include <vector> + +#include <opencv2/core.hpp> +#include <opencv2/imgproc.hpp> +#include <openvino/openvino.hpp> + +#include "models/internal_model_data.h" +#include "models/results.h" + +SegmentationModel::SegmentationModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout) + : ImageModel(modelFileName, useAutoResize, layout) {} + +std::vector<std::string> SegmentationModel::loadLabels(const std::string& labelFilename) { + std::vector<std::string> labelsList; + + /* Read labels (if any) */ + if (!labelFilename.empty()) { + std::ifstream inputFile(labelFilename); + if (!inputFile.is_open()) + throw std::runtime_error("Can't open the labels file: " + labelFilename); + std::string label; + while (std::getline(inputFile, label)) { + labelsList.push_back(label); + } + if (labelsList.empty()) + throw std::logic_error("File is empty: " + labelFilename); + } + + return labelsList; +} + +void SegmentationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output --------------------------------------------- + // --------------------------- Prepare input ----------------------------------------------------- + if (model->inputs().size() != 1) { + throw std::logic_error("Segmentation model wrapper supports topologies with only 1 input"); + } + const auto& input = model->input(); + inputsNames.push_back(input.get_any_name()); + + const ov::Layout& inputLayout = getInputLayout(input); + const ov::Shape& inputShape = input.get_shape(); + if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("3-channel 4-dimensional model's input is expected"); + } + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); + + if (useAutoResize) { + ppp.input().tensor().set_spatial_dynamic_shape(); + + ppp.input() + .preprocess() + .convert_element_type(ov::element::f32) + .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); + } + + ppp.input().model().set_layout(inputLayout); + model = ppp.build(); + // --------------------------- Prepare output ----------------------------------------------------- + if (model->outputs().size() != 1) { + throw std::logic_error("Segmentation model wrapper supports topologies with only 1 output"); + } + + const auto& output = model->output(); + outputsNames.push_back(output.get_any_name()); + + const ov::Shape& outputShape = output.get_shape(); + ov::Layout outputLayout(""); + switch (outputShape.size()) { + case 3: + outputLayout = "CHW"; + outChannels = 1; + outHeight = static_cast<int>(outputShape[ov::layout::height_idx(outputLayout)]); + outWidth = static_cast<int>(outputShape[ov::layout::width_idx(outputLayout)]); + break; + case 4: + outputLayout = "NCHW"; + outChannels = static_cast<int>(outputShape[ov::layout::channels_idx(outputLayout)]); + outHeight = static_cast<int>(outputShape[ov::layout::height_idx(outputLayout)]); + outWidth = static_cast<int>(outputShape[ov::layout::width_idx(outputLayout)]); + break; + default: + throw std::logic_error("Unexpected output tensor shape. 
Only 4D and 3D outputs are supported."); + } +} + +std::unique_ptr<ResultBase> SegmentationModel::postprocess(InferenceResult& infResult) { + ImageResult* result = new ImageResult(infResult.frameId, infResult.metaData); + const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>(); + const auto& outTensor = infResult.getFirstOutputTensor(); + + result->resultImage = cv::Mat(outHeight, outWidth, CV_8UC1); + + if (outChannels == 1 && outTensor.get_element_type() == ov::element::i32) { + cv::Mat predictions(outHeight, outWidth, CV_32SC1, outTensor.data<int32_t>()); + predictions.convertTo(result->resultImage, CV_8UC1); + } else if (outChannels == 1 && outTensor.get_element_type() == ov::element::i64) { + cv::Mat predictions(outHeight, outWidth, CV_32SC1); + const auto data = outTensor.data<int64_t>(); + for (size_t i = 0; i < predictions.total(); ++i) { + reinterpret_cast<int32_t*>(predictions.data)[i] = int32_t(data[i]); + } + predictions.convertTo(result->resultImage, CV_8UC1); + } else if (outTensor.get_element_type() == ov::element::f32) { + const float* data = outTensor.data<float>(); + for (int rowId = 0; rowId < outHeight; ++rowId) { + for (int colId = 0; colId < outWidth; ++colId) { + int classId = 0; + float maxProb = -1.0f; + for (int chId = 0; chId < outChannels; ++chId) { + float prob = data[chId * outHeight * outWidth + rowId * outWidth + colId]; + if (prob > maxProb) { + classId = chId; + maxProb = prob; + } + } // nChannels + + result->resultImage.at<uint8_t>(rowId, colId) = classId; + } // width + } // height + } + + cv::resize(result->resultImage, + result->resultImage, + cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight), + 0, + 0, + cv::INTER_NEAREST); + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/style_transfer_model.cpp b/python/openvino/runtime/common/models/src/style_transfer_model.cpp new file mode 100644 index 0000000..53e8561 --- /dev/null +++ b/python/openvino/runtime/common/models/src/style_transfer_model.cpp @@ -0,0 +1,107 @@ +/* +// Copyright (C) 2021-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "models/style_transfer_model.h" + +#include <stddef.h> + +#include <memory> +#include <stdexcept> +#include <string> +#include <vector> + +#include <opencv2/core.hpp> +#include <opencv2/imgproc.hpp> +#include <openvino/openvino.hpp> + +#include <utils/image_utils.h> +#include <utils/ocv_common.hpp> + +#include "models/input_data.h" +#include "models/internal_model_data.h" +#include "models/results.h" + +StyleTransferModel::StyleTransferModel(const std::string& modelFileName, const std::string& layout) + : ImageModel(modelFileName, false, layout) {} + +void StyleTransferModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output --------------------------------------------- + // --------------------------- Prepare input -------------------------------------------------- + if (model->inputs().size() != 1) { + throw std::logic_error("Style transfer model wrapper supports topologies with only 1 input"); + } + + inputsNames.push_back(model->input().get_any_name()); + + const ov::Shape& inputShape = model->input().get_shape(); + ov::Layout inputLayout = getInputLayout(model->input()); + + if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || + inputShape[ov::layout::channels_idx(inputLayout)] != 3) { + throw std::logic_error("3-channel 4-dimensional model's input is expected"); + } + + netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; + netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; + + ov::preprocess::PrePostProcessor ppp(model); + ppp.input().preprocess().convert_element_type(ov::element::f32); + ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC"); + + ppp.input().model().set_layout(inputLayout); + + // --------------------------- Prepare output ----------------------------------------------------- + const ov::OutputVector& outputs = model->outputs(); + if (outputs.size() != 1) { + throw std::logic_error("Style transfer model wrapper supports topologies with only 1 output"); + } + outputsNames.push_back(model->output().get_any_name()); + + const ov::Shape& outputShape = model->output().get_shape(); + ov::Layout outputLayout{"NCHW"}; + if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 || + outputShape[ov::layout::channels_idx(outputLayout)] != 3) { + throw std::logic_error("3-channel 4-dimensional model's output is expected"); + } + + ppp.output().tensor().set_element_type(ov::element::f32); + model = ppp.build(); +} + +std::unique_ptr<ResultBase> StyleTransferModel::postprocess(InferenceResult& infResult) { + ImageResult* result = new ImageResult; + *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult); + + const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>(); + const auto outputData = infResult.getFirstOutputTensor().data<float>(); + + const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape(); + size_t outHeight = static_cast<int>(outputShape[2]); + size_t outWidth = static_cast<int>(outputShape[3]); + size_t numOfPixels = outWidth * outHeight; + + std::vector<cv::Mat> imgPlanes; + imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2])), + cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])), + cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))}; + cv::Mat resultImg; + cv::merge(imgPlanes, resultImg); + cv::resize(resultImg, result->resultImage, 
cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); + + result->resultImage.convertTo(result->resultImage, CV_8UC3); + + return std::unique_ptr<ResultBase>(result); +} diff --git a/python/openvino/runtime/common/models/src/super_resolution_model.cpp b/python/openvino/runtime/common/models/src/super_resolution_model.cpp new file mode 100644 index 0000000..164991a --- /dev/null +++ b/python/openvino/runtime/common/models/src/super_resolution_model.cpp @@ -0,0 +1,207 @@ +/* +// Copyright (C) 2021-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include "models/super_resolution_model.h" + +#include <stddef.h> + +#include <map> +#include <stdexcept> +#include <string> +#include <utility> +#include <vector> + +#include <opencv2/imgproc.hpp> +#include <openvino/openvino.hpp> + +#include <utils/image_utils.h> +#include <utils/ocv_common.hpp> +#include <utils/slog.hpp> + +#include "models/input_data.h" +#include "models/internal_model_data.h" +#include "models/results.h" + +SuperResolutionModel::SuperResolutionModel(const std::string& modelFileName, + const cv::Size& inputImgSize, + const std::string& layout) + : ImageModel(modelFileName, false, layout) { + netInputHeight = inputImgSize.height; + netInputWidth = inputImgSize.width; +} + +void SuperResolutionModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) { + // --------------------------- Configure input & output --------------------------------------------- + // --------------------------- Prepare input -------------------------------------------------- + const ov::OutputVector& inputs = model->inputs(); + if (inputs.size() != 1 && inputs.size() != 2) { + throw std::logic_error("Super resolution model wrapper supports topologies with 1 or 2 inputs only"); + } + std::string lrInputTensorName = inputs.begin()->get_any_name(); + inputsNames.push_back(lrInputTensorName); + ov::Shape lrShape = inputs.begin()->get_shape(); + if (lrShape.size() != 4) { + throw std::logic_error("Number of dimensions for an input must be 4"); + } + // in case of 2 inputs they have the same layouts + ov::Layout inputLayout = getInputLayout(model->inputs().front()); + + auto channelsId = ov::layout::channels_idx(inputLayout); + auto heightId = ov::layout::height_idx(inputLayout); + auto widthId = ov::layout::width_idx(inputLayout); + + if (lrShape[channelsId] != 1 && lrShape[channelsId] != 3) { + throw std::logic_error("Input layer is expected to have 1 or 3 channels"); + } + + // A model like single-image-super-resolution-???? 
may take bicubic interpolation of the input image as the + // second input + std::string bicInputTensorName; + if (inputs.size() == 2) { + bicInputTensorName = (++inputs.begin())->get_any_name(); + inputsNames.push_back(bicInputTensorName); + ov::Shape bicShape = (++inputs.begin())->get_shape(); + if (bicShape.size() != 4) { + throw std::logic_error("Number of dimensions for both inputs must be 4"); + } + if (lrShape[widthId] >= bicShape[widthId] && lrShape[heightId] >= bicShape[heightId]) { + std::swap(bicShape, lrShape); + inputsNames[0].swap(inputsNames[1]); + } else if (!(lrShape[widthId] <= bicShape[widthId] && lrShape[heightId] <= bicShape[heightId])) { + throw std::logic_error("Each spatial dimension of one input must surpass or be equal to a spatial" + "dimension of another input"); + } + } + + ov::preprocess::PrePostProcessor ppp(model); + for (const auto& input : inputs) { + ppp.input(input.get_any_name()).tensor().set_element_type(ov::element::u8).set_layout("NHWC"); + + ppp.input(input.get_any_name()).model().set_layout(inputLayout); + } + + // --------------------------- Prepare output ----------------------------------------------------- + const ov::OutputVector& outputs = model->outputs(); + if (outputs.size() != 1) { + throw std::logic_error("Super resolution model wrapper supports topologies with only 1 output"); + } + + outputsNames.push_back(outputs.begin()->get_any_name()); + ppp.output().tensor().set_element_type(ov::element::f32); + model = ppp.build(); + + const ov::Shape& outShape = model->output().get_shape(); + + const ov::Layout outputLayout("NCHW"); + const auto outWidth = outShape[ov::layout::width_idx(outputLayout)]; + const auto inWidth = lrShape[ov::layout::width_idx(outputLayout)]; + changeInputSize(model, static_cast<int>(outWidth / inWidth)); +} + +void SuperResolutionModel::changeInputSize(std::shared_ptr<ov::Model>& model, int coeff) { + std::map<std::string, ov::PartialShape> shapes; + const ov::Layout& layout = ov::layout::get_layout(model->inputs().front()); + const auto batchId = ov::layout::batch_idx(layout); + const auto heightId = ov::layout::height_idx(layout); + const auto widthId = ov::layout::width_idx(layout); + + const ov::OutputVector& inputs = model->inputs(); + std::string lrInputTensorName = inputs.begin()->get_any_name(); + ov::Shape lrShape = inputs.begin()->get_shape(); + + if (inputs.size() == 2) { + std::string bicInputTensorName = (++inputs.begin())->get_any_name(); + ov::Shape bicShape = (++inputs.begin())->get_shape(); + if (lrShape[heightId] >= bicShape[heightId] && lrShape[widthId] >= bicShape[widthId]) { + std::swap(bicShape, lrShape); + std::swap(bicInputTensorName, lrInputTensorName); + } + bicShape[batchId] = 1; + bicShape[heightId] = coeff * netInputHeight; + bicShape[widthId] = coeff * netInputWidth; + shapes[bicInputTensorName] = ov::PartialShape(bicShape); + } + + lrShape[batchId] = 1; + lrShape[heightId] = netInputHeight; + lrShape[widthId] = netInputWidth; + shapes[lrInputTensorName] = ov::PartialShape(lrShape); + + model->reshape(shapes); +} + +std::shared_ptr<InternalModelData> SuperResolutionModel::preprocess(const InputData& inputData, + ov::InferRequest& request) { + auto imgData = inputData.asRef<ImageInputData>(); + auto& img = imgData.inputImage; + + const ov::Tensor lrInputTensor = request.get_tensor(inputsNames[0]); + const ov::Layout layout("NHWC"); + + if (img.channels() != static_cast<int>(lrInputTensor.get_shape()[ov::layout::channels_idx(layout)])) { + cv::cvtColor(img, img, cv::COLOR_BGR2GRAY); + } + 
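+    // The frame is resized to the low-resolution input shape; for two-input
+    // topologies a bicubically upscaled copy of the same frame feeds the second
+    // (bicubic) input tensor.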
+ if (static_cast<size_t>(img.cols) != netInputWidth || static_cast<size_t>(img.rows) != netInputHeight) { + slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; + } + const size_t height = lrInputTensor.get_shape()[ov::layout::height_idx(layout)]; + const size_t width = lrInputTensor.get_shape()[ov::layout::width_idx(layout)]; + img = resizeImageExt(img, width, height); + request.set_tensor(inputsNames[0], wrapMat2Tensor(img)); + + if (inputsNames.size() == 2) { + const ov::Tensor bicInputTensor = request.get_tensor(inputsNames[1]); + const int h = static_cast<int>(bicInputTensor.get_shape()[ov::layout::height_idx(layout)]); + const int w = static_cast<int>(bicInputTensor.get_shape()[ov::layout::width_idx(layout)]); + cv::Mat resized; + cv::resize(img, resized, cv::Size(w, h), 0, 0, cv::INTER_CUBIC); + request.set_tensor(inputsNames[1], wrapMat2Tensor(resized)); + } + + return std::make_shared<InternalImageModelData>(img.cols, img.rows); +} + +std::unique_ptr<ResultBase> SuperResolutionModel::postprocess(InferenceResult& infResult) { + ImageResult* result = new ImageResult; + *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult); + const auto outputData = infResult.getFirstOutputTensor().data<float>(); + + std::vector<cv::Mat> imgPlanes; + const ov::Shape& outShape = infResult.getFirstOutputTensor().get_shape(); + const size_t outChannels = static_cast<int>(outShape[1]); + const size_t outHeight = static_cast<int>(outShape[2]); + const size_t outWidth = static_cast<int>(outShape[3]); + const size_t numOfPixels = outWidth * outHeight; + if (outChannels == 3) { + imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])), + cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])), + cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))}; + } else { + imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))}; + // Post-processing for text-image-super-resolution models + cv::threshold(imgPlanes[0], imgPlanes[0], 0.5f, 1.0f, cv::THRESH_BINARY); + } + + for (auto& img : imgPlanes) { + img.convertTo(img, CV_8UC1, 255); + } + cv::Mat resultImg; + cv::merge(imgPlanes, resultImg); + result->resultImage = resultImg; + + return std::unique_ptr<ResultBase>(result); +} |
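For orientation, a minimal synchronous driver for one of the wrappers above could look roughly like this. It is a sketch against the interfaces visible in this patch only: the ImageInputData constructor, the default-constructed InferenceResult, and the file names are assumptions, and the demos normally feed the wrappers through their shared pipeline code instead.

// Sketch only: drive SuperResolutionModel synchronously via the interfaces above.
#include <memory>
#include <opencv2/imgcodecs.hpp>
#include <openvino/openvino.hpp>
#include <utils/config_factory.h>
#include "models/input_data.h"
#include "models/results.h"
#include "models/super_resolution_model.h"

int main() {
    ov::Core core;
    SuperResolutionModel model("model.xml", cv::Size(480, 270), "");  // placeholder path and LR shape

    ModelConfig config;          // deviceName/compiledModelConfig as consumed by compileModel()
    config.deviceName = "CPU";
    ov::CompiledModel compiled = model.compileModel(config, core);
    ov::InferRequest request = compiled.create_infer_request();

    cv::Mat frame = cv::imread("input.png");
    InferenceResult infResult;   // assumed default-constructible
    infResult.internalModelData = model.preprocess(ImageInputData(frame), request);  // ctor from cv::Mat assumed
    request.infer();
    for (const auto& output : compiled.outputs()) {
        // postprocess() looks tensors up by name in outputsData
        infResult.outputsData[output.get_any_name()] = request.get_tensor(output);
    }
    std::unique_ptr<ResultBase> result = model.postprocess(infResult);
    cv::imwrite("output.png", static_cast<ImageResult*>(result.get())->resultImage);
    return 0;
}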
