summaryrefslogtreecommitdiff
path: root/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp')
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp277
1 files changed, 277 insertions, 0 deletions
diff --git a/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp b/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp
new file mode 100644
index 0000000..8322c3c
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp
@@ -0,0 +1,277 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_retinaface_pt.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/nms.hpp>
+#include <utils/ocv_common.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+ModelRetinaFacePT::ModelRetinaFacePT(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout), // Default label is "Face"
+ landmarksNum(0),
+ boxIOUThreshold(boxIOUThreshold) {}
+
+// Validates the model's inputs/outputs and attaches pre/post-processing steps to it.
+//
+// Input:  exactly one input with 3 channels is required. The input tensor is declared as NHWC; when useAutoResize
+//         is set, its spatial dimensions are made dynamic and a convert-to-f32 + linear resize step is added.
+// Output: exactly three outputs are required. They are identified by a substring of the tensor name:
+//         "bbox" -> OUT_BOXES, "cls" -> OUT_SCORES, "landmark" -> OUT_LANDMARKS (optional).
+//         Every output is requested as f32 with layout NCHW (rank 4) or CHW (otherwise).
+//
+// Side effects: fills inputsNames / outputsNames, netInputWidth / netInputHeight, landmarksNum and priors, and
+// replaces `model` with the result of PrePostProcessor::build().
+//
+// Throws std::logic_error when the number of inputs/outputs is unexpected, the input is not 3-channel, or the
+// bbox / cls outputs cannot be found.
+void ModelRetinaFacePT::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output -------------------------------------------------
+    // --------------------------- Prepare input ------------------------------------------------------
+    if (model->inputs().size() != 1) {
+        throw std::logic_error("RetinaFacePT model wrapper expects models that have only 1 input");
+    }
+
+    const ov::Shape& inputShape = model->input().get_shape();
+    const ov::Layout& inputLayout = getInputLayout(model->input());
+
+    if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+        throw std::logic_error("Expected 3-channel input");
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    inputTransform.setPrecision(ppp, model->input().get_any_name());
+    ppp.input().tensor().set_layout({"NHWC"});
+
+    if (useAutoResize) {
+        ppp.input().tensor().set_spatial_dynamic_shape();
+
+        ppp.input()
+            .preprocess()
+            .convert_element_type(ov::element::f32)
+            .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+    }
+
+    ppp.input().model().set_layout(inputLayout);
+
+    // --------------------------- Reading image input parameters -------------------------------------------
+    inputsNames.push_back(model->input().get_any_name());
+    netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+    netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+    // --------------------------- Prepare output -----------------------------------------------------
+    if (model->outputs().size() != 3) {
+        throw std::logic_error("RetinaFace model wrapper expects models that have 3 outputs");
+    }
+
+    landmarksNum = 0;
+
+    // Two mandatory slots (boxes, scores), reset to empty strings so the "not found" check below cannot be
+    // satisfied by stale names. A third slot is added only when a landmarks output is present.
+    // Names are written only into their dedicated slot; they are not additionally appended to the vector.
+    outputsNames.assign(2, std::string());
+    const ov::Layout chw("CHW");
+    const ov::Layout nchw("NCHW");
+    for (const auto& output : model->outputs()) {
+        const std::string outTensorName = output.get_any_name();
+        ppp.output(outTensorName)
+            .tensor()
+            .set_element_type(ov::element::f32)
+            .set_layout(output.get_shape().size() == 4 ? nchw : chw);
+
+        if (outTensorName.find("bbox") != std::string::npos) {
+            outputsNames[OUT_BOXES] = outTensorName;
+        } else if (outTensorName.find("cls") != std::string::npos) {
+            outputsNames[OUT_SCORES] = outTensorName;
+        } else if (outTensorName.find("landmark") != std::string::npos) {
+            // Landmarks are optional: grow the names array just enough to hold the landmarks slot.
+            outputsNames.resize(std::max(outputsNames.size(), static_cast<size_t>(OUT_LANDMARKS) + 1));
+            outputsNames[OUT_LANDMARKS] = outTensorName;
+            // Each landmark consists of 2 values (x and y) stored along the last (W of CHW) dimension.
+            landmarksNum = output.get_shape()[ov::layout::width_idx(chw)] / 2;
+        }
+    }
+
+    if (outputsNames[OUT_BOXES].empty() || outputsNames[OUT_SCORES].empty()) {
+        throw std::logic_error("Bbox or cls layers are not found");
+    }
+
+    model = ppp.build();
+    priors = generatePriorData();
+}
+
+// Collects the indices of all candidates whose score reaches `confidenceThreshold`.
+//
+// The tensor is walked along dimension 1 (one entry per candidate); each candidate owns `shape[2]` consecutive
+// floats and the value at offset 1 inside that group is the one compared (>=) against the threshold
+// (presumably the "face" class probability - confirm against the model description).
+// The returned indices are in ascending order.
+std::vector<size_t> ModelRetinaFacePT::filterByScore(const ov::Tensor& scoresTensor, const float confidenceThreshold) {
+    const auto& dims = scoresTensor.get_shape();
+    const float* const rawScores = scoresTensor.data<float>();
+
+    std::vector<size_t> accepted;
+    for (size_t candidate = 0; candidate < dims[1]; ++candidate) {
+        const float candidateScore = rawScores[candidate * dims[2] + 1];
+        if (candidateScore < confidenceThreshold) {
+            continue;
+        }
+        accepted.push_back(candidate);
+    }
+
+    return accepted;
+}
+
+// Gathers the scores of the selected candidates.
+//
+// For every entry of `indicies` (candidate index along dimension 1 of the tensor) the float at offset 1 of that
+// candidate's `shape[2]`-sized group is copied. The output has the same length and order as `indicies`.
+std::vector<float> ModelRetinaFacePT::getFilteredScores(const ov::Tensor& scoresTensor,
+                                                        const std::vector<size_t>& indicies) {
+    const auto& dims = scoresTensor.get_shape();
+    const float* const rawScores = scoresTensor.data<float>();
+
+    std::vector<float> picked(indicies.size());
+    for (size_t k = 0; k < indicies.size(); ++k) {
+        picked[k] = rawScores[indicies[k] * dims[2] + 1];
+    }
+    return picked;
+}
+
+// Decodes the landmarks of the selected candidates into image coordinates.
+//
+// For candidate `idx` the raw values are read from `landmarksTensor` at offset `idx * shape[2]` as (x, y) pairs.
+// Each pair is an offset relative to the matching prior box: it is scaled by variance[0] and the prior's
+// width/height, added to the prior centre, clamped to [0, 1] and multiplied by the image width/height.
+//
+// Returns `landmarksNum` points per entry of `indicies`, stored contiguously in the order of `indicies`.
+// Throws std::logic_error if dimension 1 of the tensor does not match the number of priors (same guard as in
+// getFilteredProposals(); without it priors[idx] could be read out of bounds).
+std::vector<cv::Point2f> ModelRetinaFacePT::getFilteredLandmarks(const ov::Tensor& landmarksTensor,
+                                                                 const std::vector<size_t>& indicies,
+                                                                 int imgWidth,
+                                                                 int imgHeight) {
+    const auto& shape = landmarksTensor.get_shape();
+    const float* landmarksPtr = landmarksTensor.data<float>();
+
+    if (shape[1] != priors.size()) {
+        throw std::logic_error("rawLandmarks size is not equal to priors size");
+    }
+
+    std::vector<cv::Point2f> landmarks(landmarksNum * indicies.size());
+
+    for (size_t i = 0; i < indicies.size(); i++) {
+        const size_t idx = indicies[i];
+        const auto& prior = priors[idx];
+        // First raw value belonging to this candidate.
+        const float* rawPoints = landmarksPtr + idx * shape[2];
+        for (size_t j = 0; j < landmarksNum; j++) {
+            auto& point = landmarks[i * landmarksNum + j];
+            point.x = clamp(prior.cX + rawPoints[j * 2] * variance[0] * prior.width, 0.f, 1.f) * imgWidth;
+            point.y = clamp(prior.cY + rawPoints[j * 2 + 1] * variance[0] * prior.height, 0.f, 1.f) * imgHeight;
+        }
+    }
+    return landmarks;
+}
+
+// Generates the prior (anchor) boxes for the current network input size.
+//
+// Three feature-map levels are used, with strides 8, 16 and 32 pixels. A level's feature map has
+// round(netInputWidth / stride) x round(netInputHeight / stride) cells, and every cell produces two priors
+// (one per anchor size of that level). Priors are emitted level by level, row by row, column by column.
+// All values are normalised by the network input width/height: centre = (cell + 0.5) * stride / input size,
+// extent = anchor size / input size.
+std::vector<ModelRetinaFacePT::Box> ModelRetinaFacePT::generatePriorData() {
+    const float anchorSizes[][2] = {{16, 32}, {64, 128}, {256, 512}};
+    const float strides[] = {8., 16., 32.};
+
+    std::vector<ModelRetinaFacePT::Box> priorBoxes;
+    size_t level = 0;
+    while (level < arraySize(strides)) {
+        const int cols = static_cast<int>(std::round(netInputWidth / strides[level]));
+        const int rows = static_cast<int>(std::round(netInputHeight / strides[level]));
+        const auto& levelSizes = anchorSizes[level];
+
+        for (int row = 0; row < rows; ++row) {
+            for (int col = 0; col < cols; ++col) {
+                // The cell centre is shared by both anchors of this cell.
+                const float centerY = (row + 0.5f) * strides[level] / netInputHeight;
+                const float centerX = (col + 0.5f) * strides[level] / netInputWidth;
+                for (auto anchorSize : levelSizes) {
+                    const float normWidth = anchorSize / netInputWidth;
+                    const float normHeight = anchorSize / netInputHeight;
+                    priorBoxes.push_back(ModelRetinaFacePT::Box{centerX, centerY, normWidth, normHeight});
+                }
+            }
+        }
+        ++level;
+    }
+    return priorBoxes;
+}
+
+// Decodes the bounding boxes of the selected candidates into image coordinates.
+//
+// For candidate `i` four raw values (dx, dy, dw, dh) are read from `boxesTensor` at offset `i * shape[2]` and
+// combined with prior `i`:
+//   centre = prior centre + d{x,y} * variance[0] * prior {width,height}
+//   extent = prior {width,height} * exp(d{w,h} * variance[1])
+// The resulting corners are clamped to [0, 1] and scaled by the image width/height.
+//
+// Returns one Anchor (left, top, right, bottom) per entry of `indicies`, in the same order.
+// Throws std::logic_error if dimension 1 of the tensor does not match the number of priors.
+std::vector<Anchor> ModelRetinaFacePT::getFilteredProposals(const ov::Tensor& boxesTensor,
+                                                            const std::vector<size_t>& indicies,
+                                                            int imgWidth,
+                                                            int imgHeight) {
+    std::vector<Anchor> rects;
+    rects.reserve(indicies.size());
+
+    const auto& shape = boxesTensor.get_shape();
+    const float* boxesPtr = boxesTensor.data<float>();
+
+    if (shape[1] != priors.size()) {
+        throw std::logic_error("rawBoxes size is not equal to priors size");
+    }
+
+    for (auto i : indicies) {
+        // Read the four raw values directly instead of viewing the float buffer through a Box pointer,
+        // which is type punning the compiler is allowed to miscompile.
+        const float* rawBox = boxesPtr + i * shape[2];
+        const float rawCX = rawBox[0];
+        const float rawCY = rawBox[1];
+        const float rawWidth = rawBox[2];
+        const float rawHeight = rawBox[3];
+
+        const auto& prior = priors[i];
+        const float cX = prior.cX + rawCX * variance[0] * prior.width;
+        const float cY = prior.cY + rawCY * variance[0] * prior.height;
+        // std::exp selects the float overload on every platform (plain exp may resolve to the double version).
+        const float width = prior.width * std::exp(rawWidth * variance[1]);
+        const float height = prior.height * std::exp(rawHeight * variance[1]);
+        rects.push_back(Anchor{clamp(cX - width / 2, 0.f, 1.f) * imgWidth,
+                               clamp(cY - height / 2, 0.f, 1.f) * imgHeight,
+                               clamp(cX + width / 2, 0.f, 1.f) * imgWidth,
+                               clamp(cY + height / 2, 0.f, 1.f) * imgHeight});
+    }
+
+    return rects;
+}
+
+// Turns the raw network outputs of one inference into a RetinaFaceDetectionResult.
+//
+// Steps:
+//   1. keep the candidates whose score reaches confidenceThreshold;
+//   2. decode their landmarks (only if the model has a landmarks output) and boxes into coordinates of the
+//      original input image (size taken from the InternalImageModelData attached to the inference);
+//   3. run nms() with boxIOUThreshold over the decoded boxes;
+//   4. emit one DetectedObject (label "Face", id 0) plus landmarksNum landmark points per surviving box.
+//
+// The result object is owned by a smart pointer from the moment it is created, so it is released correctly even
+// if filling it throws (e.g. on allocation failure).
+std::unique_ptr<ResultBase> ModelRetinaFacePT::postprocess(InferenceResult& infResult) {
+    const auto boxesTensor = infResult.outputsData[outputsNames[OUT_BOXES]];
+    const auto scoresTensor = infResult.outputsData[outputsNames[OUT_SCORES]];
+
+    const auto& validIndicies = filterByScore(scoresTensor, confidenceThreshold);
+    const auto& scores = getFilteredScores(scoresTensor, validIndicies);
+
+    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+    const auto& landmarks = landmarksNum ? getFilteredLandmarks(infResult.outputsData[outputsNames[OUT_LANDMARKS]],
+                                                                validIndicies,
+                                                                internalData.inputImgWidth,
+                                                                internalData.inputImgHeight)
+                                         : std::vector<cv::Point2f>();
+
+    const auto& proposals =
+        getFilteredProposals(boxesTensor, validIndicies, internalData.inputImgWidth, internalData.inputImgHeight);
+
+    // The last flag is enabled only for models without landmarks; its exact meaning is defined by nms().
+    const auto& keptIndicies = nms(proposals, scores, boxIOUThreshold, !landmarksNum);
+
+    // --------------------------- Create detection result objects ------------------------------------------
+    std::unique_ptr<RetinaFaceDetectionResult> result(
+        new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData));
+
+    result->objects.reserve(keptIndicies.size());
+    result->landmarks.reserve(keptIndicies.size() * landmarksNum);
+    for (auto i : keptIndicies) {
+        DetectedObject desc;
+        desc.confidence = scores[i];
+
+        //--- Box geometry (proposals are already expressed in input image coordinates)
+        desc.x = proposals[i].left;
+        desc.y = proposals[i].top;
+        desc.width = proposals[i].getWidth();
+        desc.height = proposals[i].getHeight();
+
+        desc.labelID = 0;
+        desc.label = labels[desc.labelID];
+        result->objects.push_back(desc);
+
+        //--- Landmarks that belong to this box (landmarksNum consecutive points)
+        for (uint32_t l = 0; l < landmarksNum; ++l) {
+            result->landmarks.emplace_back(landmarks[i * landmarksNum + l].x, landmarks[i * landmarksNum + l].y);
+        }
+    }
+
+    // Hand ownership over as a pointer to the base type.
+    return std::unique_ptr<ResultBase>(result.release());
+}