Diffstat (limited to 'python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp')
-rw-r--r--  python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp | 264
1 file changed, 264 insertions(+), 0 deletions(-)
diff --git a/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp b/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp
new file mode 100644
index 0000000..33a3604
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp
@@ -0,0 +1,264 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/hpe_model_associative_embedding.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/associative_embedding_decoder.h"
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+const cv::Vec3f HpeAssociativeEmbedding::meanPixel = cv::Vec3f::all(128);
+const float HpeAssociativeEmbedding::detectionThreshold = 0.1f;
+const float HpeAssociativeEmbedding::tagThreshold = 1.0f;
+
+HpeAssociativeEmbedding::HpeAssociativeEmbedding(const std::string& modelFileName,
+ double aspectRatio,
+ int targetSize,
+ float confidenceThreshold,
+ const std::string& layout,
+ float delta,
+ RESIZE_MODE resizeMode)
+ : ImageModel(modelFileName, false, layout),
+ aspectRatio(aspectRatio),
+ targetSize(targetSize),
+ confidenceThreshold(confidenceThreshold),
+ delta(delta) {
+    this->resizeMode = resizeMode;
+    interpolationMode = cv::INTER_CUBIC;
+}
+
+void HpeAssociativeEmbedding::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input Tensors ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("HPE AE model wrapper supports topologies with only 1 input.");
+ }
+ inputsNames.push_back(model->input().get_any_name());
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+ inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
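+    // Input frames are fed as U8 NHWC tensors; the preprocessor converts them to the model's layout and precision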
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output Tensors -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 2 && outputs.size() != 3) {
+ throw std::logic_error("HPE AE model model wrapper supports topologies only with 2 or 3 outputs");
+ }
+
+ for (const auto& output : model->outputs()) {
+ const auto& outTensorName = output.get_any_name();
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32);
+
+ for (const auto& name : output.get_names()) {
+ outputsNames.push_back(name);
+ }
+
+ const ov::Shape& outputShape = output.get_shape();
+ if (outputShape.size() != 4 && outputShape.size() != 5) {
+ throw std::logic_error("output tensors are expected to be 4-dimensional or 5-dimensional");
+ }
+ if (outputShape[ov::layout::batch_idx("NC...")] != 1 || outputShape[ov::layout::channels_idx("NC...")] != 17) {
+ throw std::logic_error("output tensors are expected to have 1 batch size and 17 channels");
+ }
+ }
+ model = ppp.build();
+
+ embeddingsTensorName = findTensorByName("embeddings", outputsNames);
+ heatmapsTensorName = findTensorByName("heatmaps", outputsNames);
+ try {
+ nmsHeatmapsTensorName = findTensorByName("nms_heatmaps", outputsNames);
+ } catch (const std::runtime_error&) { nmsHeatmapsTensorName = heatmapsTensorName; }
+
+ changeInputSize(model);
+}
+
+void HpeAssociativeEmbedding::changeInputSize(std::shared_ptr<ov::Model>& model) {
+ ov::Shape inputShape = model->input().get_shape();
+ const ov::Layout& layout = ov::layout::get_layout(model->input());
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ if (!targetSize) {
+ targetSize = static_cast<int>(std::min(inputShape[heightId], inputShape[widthId]));
+ }
+ int inputHeight = aspectRatio >= 1.0 ? targetSize : static_cast<int>(std::round(targetSize / aspectRatio));
+ int inputWidth = aspectRatio >= 1.0 ? static_cast<int>(std::round(targetSize * aspectRatio)) : targetSize;
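+    // Round both dimensions up to the nearest multiple of the network stride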
+ int height = static_cast<int>((inputHeight + stride - 1) / stride) * stride;
+ int width = static_cast<int>((inputWidth + stride - 1) / stride) * stride;
+ inputShape[batchId] = 1;
+ inputShape[heightId] = height;
+ inputShape[widthId] = width;
+ inputLayerSize = cv::Size(width, height);
+
+ model->reshape(inputShape);
+}
+
+std::shared_ptr<InternalModelData> HpeAssociativeEmbedding::preprocess(const InputData& inputData,
+ ov::InferRequest& request) {
+ auto& image = inputData.asRef<ImageInputData>().inputImage;
+ cv::Rect roi;
+ auto paddedImage = resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi);
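+    // At least one stride of padding in either dimension means the aspect ratios differ noticeably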
+ if (inputLayerSize.height - stride >= roi.height || inputLayerSize.width - stride >= roi.width) {
+ slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
+ }
+ request.set_input_tensor(wrapMat2Tensor(paddedImage));
+
+ return std::make_shared<InternalScaleData>(paddedImage.cols,
+ paddedImage.rows,
+ image.size().width / static_cast<float>(roi.width),
+ image.size().height / static_cast<float>(roi.height));
+}
+
+std::unique_ptr<ResultBase> HpeAssociativeEmbedding::postprocess(InferenceResult& infResult) {
+ HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData);
+
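+    // Split each output tensor into per-channel cv::Mat planes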
+ const auto& aembds = infResult.outputsData[embeddingsTensorName];
+ const ov::Shape& aembdsShape = aembds.get_shape();
+ float* const aembdsMapped = aembds.data<float>();
+ std::vector<cv::Mat> aembdsMaps = split(aembdsMapped, aembdsShape);
+
+ const auto& heats = infResult.outputsData[heatmapsTensorName];
+ const ov::Shape& heatMapsShape = heats.get_shape();
+ float* const heatMapsMapped = heats.data<float>();
+ std::vector<cv::Mat> heatMaps = split(heatMapsMapped, heatMapsShape);
+
+ std::vector<cv::Mat> nmsHeatMaps = heatMaps;
+ if (nmsHeatmapsTensorName != heatmapsTensorName) {
+ const auto& nmsHeats = infResult.outputsData[nmsHeatmapsTensorName];
+ const ov::Shape& nmsHeatMapsShape = nmsHeats.get_shape();
+ float* const nmsHeatMapsMapped = nmsHeats.data<float>();
+ nmsHeatMaps = split(nmsHeatMapsMapped, nmsHeatMapsShape);
+ }
+ std::vector<HumanPose> poses = extractPoses(heatMaps, aembdsMaps, nmsHeatMaps);
+
+ // Rescale poses to the original image
+ const auto& scale = infResult.internalModelData->asRef<InternalScaleData>();
+ const float outputScale = inputLayerSize.width / static_cast<float>(heatMapsShape[3]);
+ float shiftX = 0.0, shiftY = 0.0;
+ float scaleX = 1.0, scaleY = 1.0;
+
+ if (resizeMode == RESIZE_KEEP_ASPECT_LETTERBOX) {
+ scaleX = scaleY = std::min(scale.scaleX, scale.scaleY);
+ if (aspectRatio >= 1.0)
+ shiftX = static_cast<float>((targetSize * scaleX * aspectRatio - scale.inputImgWidth * scaleX) / 2);
+ else
+ shiftY = static_cast<float>((targetSize * scaleY / aspectRatio - scale.inputImgHeight * scaleY) / 2);
+ scaleX = scaleY *= outputScale;
+ } else {
+ scaleX = scale.scaleX * outputScale;
+ scaleY = scale.scaleY * outputScale;
+ }
+
+ for (auto& pose : poses) {
+ for (auto& keypoint : pose.keypoints) {
+ if (keypoint != cv::Point2f(-1, -1)) {
+ keypoint.x = keypoint.x * scaleX + shiftX;
+ keypoint.y = keypoint.y * scaleY + shiftY;
+ }
+ }
+ result->poses.push_back(pose);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
+
+std::string HpeAssociativeEmbedding::findTensorByName(const std::string& tensorName,
+ const std::vector<std::string>& outputsNames) {
+ std::vector<std::string> suitableLayers;
+ for (auto& outputName : outputsNames) {
+ if (outputName.rfind(tensorName, 0) == 0) {
+ suitableLayers.push_back(outputName);
+ }
+ }
+ if (suitableLayers.empty()) {
+ throw std::runtime_error("Suitable tensor for " + tensorName + " output is not found");
+ } else if (suitableLayers.size() > 1) {
+ throw std::runtime_error("More than 1 tensor matched to " + tensorName + " output");
+ }
+ return suitableLayers[0];
+}
+
+std::vector<cv::Mat> HpeAssociativeEmbedding::split(float* data, const ov::Shape& shape) {
+ std::vector<cv::Mat> flattenData(shape[1]);
+ for (size_t i = 0; i < flattenData.size(); i++) {
+        flattenData[i] = cv::Mat(static_cast<int>(shape[2]),
+                                 static_cast<int>(shape[3]),
+                                 CV_32FC1,
+                                 data + i * shape[2] * shape[3]);
+ }
+ return flattenData;
+}
+
+std::vector<HumanPose> HpeAssociativeEmbedding::extractPoses(std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ const std::vector<cv::Mat>& nmsHeatMaps) const {
+ std::vector<std::vector<Peak>> allPeaks(numJoints);
+ for (int i = 0; i < numJoints; i++) {
+ findPeaks(nmsHeatMaps, aembdsMaps, allPeaks, i, maxNumPeople, detectionThreshold);
+ }
+ std::vector<Pose> allPoses = matchByTag(allPeaks, maxNumPeople, numJoints, tagThreshold);
+    // Peak coordinates are produced as (row, col); swap to (x, y) for every pose
+ for (auto& pose : allPoses) {
+ for (size_t j = 0; j < numJoints; j++) {
+ Peak& peak = pose.getPeak(j);
+ std::swap(peak.keypoint.x, peak.keypoint.y);
+ }
+ }
+ std::vector<HumanPose> poses;
+ for (size_t i = 0; i < allPoses.size(); i++) {
+ Pose& pose = allPoses[i];
+ // Filtering poses with low mean scores
+ if (pose.getMeanScore() <= confidenceThreshold) {
+ continue;
+ }
+ for (size_t j = 0; j < heatMaps.size(); j++) {
+ heatMaps[j] = cv::abs(heatMaps[j]);
+ }
+ adjustAndRefine(allPoses, heatMaps, aembdsMaps, i, delta);
+ std::vector<cv::Point2f> keypoints;
+ for (size_t j = 0; j < numJoints; j++) {
+ Peak& peak = pose.getPeak(j);
+ keypoints.push_back(peak.keypoint);
+ }
+ poses.push_back({keypoints, pose.getMeanScore()});
+ }
+ return poses;
+}