Diffstat (limited to 'python/openvino/runtime/dla_benchmark/average_precision.cpp')
-rw-r--r--  python/openvino/runtime/dla_benchmark/average_precision.cpp | 696
1 file changed, 696 insertions, 0 deletions
diff --git a/python/openvino/runtime/dla_benchmark/average_precision.cpp b/python/openvino/runtime/dla_benchmark/average_precision.cpp
new file mode 100644
index 0000000..84008b7
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/average_precision.cpp
@@ -0,0 +1,696 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// This file provides the mAP and COCO AP calculations in `metrics_eval` and
+// `metrics_update`. The calculation comprises two parts: 1) data preprocessing,
+// and 2) metrics calculation. Data preprocessing consists of prediction box parsing,
+// resizing and filtering, non-max suppression, and clipping. The preprocessed data is
+// stored in the `PredictionEntry` and `AnnotationEntry` structs, which are consumed by
+// `metrics_update` and `metrics_eval`. `metrics_update` accumulates intermediate
+// statistics into the batched statistics, and `metrics_eval` computes the area under
+// the precision-recall curve. All of the metadata should be set in the header file,
+// and the runtime invariants are set using `set_runtime`. `validate_yolo_wrapper` is
+// the main entry point of the subroutine.
+//
+// The mAP algorithm follows section 2.2 of https://arxiv.org/pdf/1607.03476.pdf
+// and OpenVINO's accuracy_checker. The COCO AP algorithm is specified at
+// https://cocodataset.org/#detection-eval. The result has been compared value-by-value
+// with the result from OpenVINO's accuracy_checker using the dlsdk launcher. To obtain
+// the golden result, apply the steps in
+// https://docs.openvino.ai/latest/omz_models_model_yolo_v3_tf.html.
+
+#include "average_precision.hpp"
+#include <cmath>
+#if defined(_WIN32) || defined(_WIN64)
+#include <io.h>
+#else
+#include <dirent.h>
+#endif
+#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+#include <filesystem>
+namespace fs = std::filesystem;
+#endif
+#include <algorithm>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <set>
+#include <sstream>
+#include <utility>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <samples/slog.hpp>
+#include "utils.hpp"
+
+#define VERBOSE 0
+
+// Parses the predicted boxes in `predicted_val` into a 2d tensor `raw_predictions`,
+// one box per row. `predicted_val` holds the flattened detector outputs for one image.
+// Order: conv2d_12[1x255x26x26] -> conv2d_9[1x255x13x13], NCHW order
+void parse_prediction_boxes(std::vector<double> &predicted_val, Tensor2d<double> &raw_predictions) {
+  raw_predictions.emplace_back(std::vector<double>{});
+  const std::vector<unsigned> &grid_sizes = yolo_meta.grid_sizes.at(runtime_vars.name);
+
+  int total_boxes{0};
+  std::for_each(std::begin(grid_sizes), std::end(grid_sizes), [&](unsigned n) {
+    total_boxes += std::pow(n, 2) * yolo_meta.box_per_channel;
+  });
+
+  for (int count = 0; count < total_boxes; count++) {
+    raw_predictions.emplace_back(Box<double>{});
+    raw_predictions[count].reserve(yolo_meta.pbox_size);
+  }
+
+  auto index_of = [=](int n, int c, int h, int w, int C, int H, int W) {
+    return n * C * H * W + c * H * W + h * W + w;
+  };
+
+  // boxes in the 26x26 grid come first
+  // treat each output tensor as 3 batches of boxes, one per anchor
+  for (int grid : grid_sizes) {
+    // offset to where the data is retrieved
+    int data_offset{0};
+    // offset to where the data is inserted
+    int position_offset{0};
+    for (int n : grid_sizes) {
+      if (n == grid) break;
+      data_offset += pow(n, 2) * yolo_meta.channel;
+      position_offset += pow(n, 2) * yolo_meta.box_per_channel;
+    }
+
+    int N = yolo_meta.box_per_channel, C = yolo_meta.pbox_size, H = grid, W = grid;
+
+    for (int n = 0; n < N; n++) {
+      for (int c = 0; c < C; c++) {
+        for (int h = 0; h < H; h++) {
+          for (int w = 0; w < W; w++) {
+            // corresponds to the #c datum for grid cell (h, w) of the #n anchor
+            Box<double> &pbox = raw_predictions[position_offset + n * H * W + h * W + w];
+            // fills prediction boxes
+            pbox.emplace_back(predicted_val[data_offset + index_of(n, c, h, w, C, H, W)]);
+          }
+        }
+      }
+    }
+  }
+}
+
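+// Example of the NCHW flattening above, assuming yolo_meta.box_per_channel == 3 and
+// yolo_meta.pbox_size == 85 (255 channels / 3 anchors; the actual values live in the header):
+// within a 13x13 grid's block, the element for anchor n, channel c, cell (h, w) sits at
+//   index_of(n, c, h, w, 85, 13, 13) = n*85*169 + c*169 + h*13 + w
+// so anchor 1, channel 4 (the box confidence), cell (2, 5) maps to
+//   14365 + 676 + 26 + 5 = 15072
+// within that grid's slice of `predicted_val`.
+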
+// Parses annotation boxes stored in a text file into a 3d tensor `raw_annotation`.
+// Precondition: each line of the file contains 5 space-separated values,
+// i.e. [class, x, y, width, height]. Returns -3 if the file cannot be read.
+int parse_annotation_boxes(Tensor3d<double> &raw_annotation, const std::string &path) {
+  int err = 0;
+  std::ifstream annotation_file(path);
+  if (!annotation_file.is_open()) {
+    slog::err << "Couldn't access path: " << path << slog::endl;
+    err = -3;
+  } else {
+    Tensor2d<double> annotation_box;
+    int class_id;
+    double x, y, w, h;
+    while (annotation_file >> class_id >> x >> y >> w >> h) {
+      annotation_box.emplace_back(Box<double>{x, y, w, h, (double)class_id});
+    }
+    raw_annotation.emplace_back(annotation_box);
+  }
+  return err;
+}
+
+// Extracts the stems of filenames in `path` that have the extension `ext`.
+// Returns the number of matching files, or -1 on error.
+int walk_dirent(std::vector<std::string> &names, const std::string &path, std::string ext) {
+  int count = 0;
+#if defined(_WIN32) || defined(_WIN64)
+#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+  for (const auto &entry : fs::directory_iterator(path)) {
+    if (fs::is_regular_file(entry)) {
+      std::string filename = entry.path().filename().string();
+      std::string file_extension = filename.substr(filename.find_last_of(".") + 1);
+      if (file_extension == ext) {
+        // store the stem so lookups match the POSIX branch below
+        names.emplace_back(entry.path().stem().string());
+        count++;
+      }
+    }
+  }
+#endif
+#else
+  DIR *dir = opendir(path.c_str());
+  if (!dir) {
+    slog::err << "Couldn't access path: " << path << slog::endl;
+    count = -1;
+  } else {
+    for (struct dirent *dent; (dent = readdir(dir)) != nullptr;) {
+      std::string dirname(dent->d_name);
+      std::string stem = GetStem(dirname);
+      std::string extension = GetExtension(dirname);
+      if (stem == "" || stem == "." || extension != ext) continue;
+      names.emplace_back(stem);
+      count += 1;
+    }
+    closedir(dir);
+  }
+#endif
+  return count;
+}
+
+// Dispatches each step of collecting predicted boxes, annotation boxes, and shapes.
+// Returns 0 on success, -1 for a mismatch between the number of annotation files
+// and validation images, -2 for a missing annotation file, -3 for failing to access an
+// annotation file, and -4 for failing to access a validation image.
+int collect_validation_dataset(std::vector<std::string> &image_paths,
+                               Tensor3d<double> &raw_annotations,
+                               Tensor2d<double> &shapes) {
+  int err = 0;
+
+  // set of annotation file names
+  std::vector<std::string> tmp;
+  int num_file = walk_dirent(tmp, runtime_vars.groundtruth_loc, runtime_vars.gt_extension);
+  if (num_file < (int)(runtime_vars.batch_size * runtime_vars.niter)) {
+    if (num_file >= 0) {
+      slog::err << "Not enough validation data found. " << runtime_vars.batch_size * runtime_vars.niter
+                << " required, " << num_file << " provided." << slog::endl;
+    }
+    err = -1;
+  } else {
+    std::set<std::string> annotation_file_index(tmp.begin(), tmp.end());
+
+    // gets all images, corresponding annotations, and shapes
+    std::string gt_path;
+    for (unsigned batch = 0; batch < runtime_vars.batch_size * runtime_vars.niter; batch++) {
+      std::string image_path(image_paths[batch]);
+      std::string img_name = GetStem(image_path);
+      if (annotation_file_index.find(img_name) == annotation_file_index.end()) {
+        slog::err << "Missing annotation file for validation image: " << image_paths[batch] << slog::endl;
+        err = -2;
+        break;
+      } else {
+        gt_path = runtime_vars.groundtruth_loc + "/" + img_name + "." + runtime_vars.gt_extension;
+
+        // gets image dimensions
+        cv::Mat image = cv::imread(image_paths[batch]);
+        if (image.data == nullptr || image.empty()) {
+          slog::err << "Couldn't open input image: " << image_paths[batch] << slog::endl;
+          err = -4;
+          break;
+        }
+
+        err = parse_annotation_boxes(raw_annotations, gt_path);
+        if (err != 0) break;
+        shapes.emplace_back(Box<double>{(double)image.cols, (double)image.rows});
+      }
+    }
+  }
+  return err;
+}
+
+// Removes the items at `indices` from the vector `vec`.
+template <typename T>
+void reduce_by_index(std::vector<T> &vec, std::vector<unsigned> indices) {
+  std::sort(indices.begin(), indices.end());
+  for (auto it = indices.rbegin(); it != indices.rend(); it++) {
+    vec.erase(vec.begin() + *it);
+  }
+}
+
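+// Example: calling reduce_by_index on vec = {a, b, c, d, e} with indices = {3, 1}
+// first sorts the indices to {1, 3}, then erases from the back (index 3, then 1)
+// so that earlier erasures never shift the positions still to be removed,
+// leaving {a, c, e}.
+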
+// Calculates and returns the Intersection over Union score of two boxes from
+// their area of overlap and area of union.
+double intersection_over_union(Box<double> box1, Box<double> box2) {
+  using std::max;
+  using std::min;
+  double intersect_length_x =
+      max(0.0, min(box1[X_MAX], box2[X_MAX]) - max(box1[X_MIN], box2[X_MIN]) + yolo_meta.boundary);
+  double intersect_length_y =
+      max(0.0, min(box1[Y_MAX], box2[Y_MAX]) - max(box1[Y_MIN], box2[Y_MIN]) + yolo_meta.boundary);
+  double intersection_of_area = intersect_length_x * intersect_length_y;
+  double box1_area =
+      (box1[X_MAX] - box1[X_MIN] + yolo_meta.boundary) * (box1[Y_MAX] - box1[Y_MIN] + yolo_meta.boundary);
+  double box2_area =
+      (box2[X_MAX] - box2[X_MIN] + yolo_meta.boundary) * (box2[Y_MAX] - box2[Y_MIN] + yolo_meta.boundary);
+  double union_of_area = box1_area + box2_area - intersection_of_area;
+  return (union_of_area > 0.0) ? intersection_of_area / union_of_area : 0.0;
+}
+
+// Returns the index of the largest element in the vector `vec`.
+template <typename T>
+int argmax(const std::vector<T> &vec) {
+  return std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()));
+}
+
+// Returns the index of the largest element in the range from `begin` to `end`.
+template <typename Iter>
+int argmax(Iter begin, Iter end) {
+  return std::distance(begin, std::max_element(begin, end));
+}
+
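+// Worked example for `intersection_over_union` above, assuming yolo_meta.boundary == 0.0:
+// box1 = (0, 0)-(10, 10) and box2 = (5, 5)-(15, 15) overlap in a 5x5 patch, so the
+// intersection is 25, the union is 100 + 100 - 25 = 175, and the IoU is 25/175 ≈ 0.143.
+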
+// Resizes bounding-box coordinates from ratios relative to a grid cell to actual pixel
+// coordinates. This function resizes the prediction boxes in the 2d tensor `raw_predictions`
+// based on the definition on page 4 of https://arxiv.org/pdf/1612.08242.pdf. The prediction
+// boxes are also filtered on their confidence score and class-specific score. The result is
+// stored in an instance of `PredictionEntry`, which is used for the statistics calculation.
+void resize_and_filter_prediction_boxes(Tensor2d<double> &raw_predictions,
+                                        PredictionEntry &prediction,
+                                        const unsigned batch) {
+  unsigned size = 0;
+
+#if VERBOSE == 1
+  unsigned c12 = 0, c9 = 0, c58 = 0, c66 = 0, c74 = 0;
+#endif
+
+  for (unsigned grid : yolo_meta.grid_sizes.at(runtime_vars.name)) {
+    unsigned offset{0};
+    for (unsigned n : yolo_meta.grid_sizes.at(runtime_vars.name)) {
+      if (n == grid) break;
+      offset += pow(n, 2) * yolo_meta.box_per_channel;
+    }
+    for (unsigned x = 0; x < grid; x++) {
+      for (unsigned y = 0; y < grid; y++) {
+        for (unsigned n = 0; n < yolo_meta.box_per_channel; n++) {
+          unsigned bbox_idx = offset + n * pow(grid, 2) + y * grid + x;
+          Box<double> &bbox = raw_predictions[bbox_idx];
+
+          // find the predicted label as the class with the highest score
+          int label = argmax(bbox.begin() + (yolo_meta.pbox_size - yolo_meta.num_classes), bbox.end());
+          double cls_score = bbox[BBOX_CONFIDENCE] * bbox[(yolo_meta.pbox_size - yolo_meta.num_classes) + label];
+          // filter out boxes with a low confidence score or class score
+          if (bbox[BBOX_CONFIDENCE] < yolo_meta.confidence_threshold || cls_score < yolo_meta.confidence_threshold)
+            continue;
+          prediction.cls.push_back(label);
+          prediction.cls_score.push_back(cls_score);
+#if VERBOSE == 1
+          c74 += (unsigned)(grid == 52);
+          c66 += (unsigned)(grid == 26);
+          c58 += (unsigned)(grid == 13);
+          c12 += (unsigned)(grid == 26);
+          c9 += (unsigned)(grid == 13);
+#endif
+          // deduce anchor box width and height
+          unsigned dim = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid).size() / yolo_meta.box_per_channel;
+          double anchor_w = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid)[n * dim];
+          double anchor_h = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid)[n * dim + 1];
+
+          // calculate width and height of the bbox
+          double bbox_center_x = (bbox[BBOX_X] + x) / grid;
+          double bbox_center_y = (bbox[BBOX_Y] + y) / grid;
+          double bbox_w = exp(bbox[BBOX_W]) * anchor_w / yolo_meta.dst_image_size[IMG_W];
+          double bbox_h = exp(bbox[BBOX_H]) * anchor_h / yolo_meta.dst_image_size[IMG_H];
+
+          // calculate actual coordinates of the bbox
+          double x_max, x_min, y_max, y_min;
+          double w = runtime_vars.source_image_sizes[batch][IMG_W];
+          double h = runtime_vars.source_image_sizes[batch][IMG_H];
+
+          x_max = w * (bbox_center_x + bbox_w / 2.0);
+          x_min = w * (bbox_center_x - bbox_w / 2.0);
+          y_max = h * (bbox_center_y + bbox_h / 2.0);
+          y_min = h * (bbox_center_y - bbox_h / 2.0);
+
+          prediction.x_max.emplace_back(x_max);
+          prediction.x_min.emplace_back(x_min);
+          prediction.y_max.emplace_back(y_max);
+          prediction.y_min.emplace_back(y_min);
+
+          size += 1;
+        }
+      }
+    }
+  }
+  prediction.size = size;
+#if VERBOSE == 1
+  if (runtime_vars.name == "yolo-v3-tf") {
+    slog::info << "prediction boxes from conv2d58: " << c58 << slog::endl;
+    slog::info << "prediction boxes from conv2d66: " << c66 << slog::endl;
+    slog::info << "prediction boxes from conv2d74: " << c74 << slog::endl;
+  } else if (runtime_vars.name == "yolo-v3-tiny-tf") {
+    slog::info << "prediction boxes from conv2d12: " << c12 << slog::endl;
+    slog::info << "prediction boxes from conv2d9: " << c9 << slog::endl;
+  }
+#endif
+}
+
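+// Worked example of the decoding above, with assumed values (the real ones live in
+// yolo_meta): a 13x13 grid, a 416x416 network input, anchor (116, 90), and a 640x480
+// source image. A box at cell (x, y) = (6, 6) with bbox = {0.5, 0.5, 0.2, 0.1, ...}
+// decodes to center (6.5/13, 6.5/13) = (0.5, 0.5), width exp(0.2)*116/416 ≈ 0.341,
+// and height exp(0.1)*90/416 ≈ 0.239, i.e. x in [211, 429] and y in [183, 297] pixels.
+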
+// Returns the indices of `vec` sorted in descending order of value.
+std::vector<unsigned> argsort_gt(const std::vector<double> &vec) {
+  std::vector<unsigned> order(vec.size());
+  std::generate(order.begin(), order.end(), [n = 0]() mutable { return n++; });
+  std::sort(order.begin(), order.end(), [&](int i1, int i2) { return vec[i1] > vec[i2]; });
+  return order;
+}
+
+// Performs the non-maximum suppression algorithm to eliminate repetitive bounding boxes.
+// A bounding box is preserved iff it has the highest confidence score over all
+// overlapping bounding boxes.
+void nms(PredictionEntry &prediction) {
+  if (prediction.size == 0) return;
+  std::vector<unsigned> &&order = argsort_gt(prediction.cls_score);
+  std::vector<unsigned> keep;
+  std::set<unsigned> discard;
+  unsigned top_score_idx;
+
+  while (discard.size() < order.size()) {
+    bool has_top = false;
+    for (unsigned idx : order) {
+      if (discard.find(idx) != discard.end()) continue;
+      if (!has_top) {
+        has_top = true;
+        top_score_idx = idx;
+        keep.emplace_back(top_score_idx);
+        discard.insert(top_score_idx);
+        continue;
+      }
+      double iou = intersection_over_union(prediction.box_at(idx), prediction.box_at(top_score_idx));
+      if (iou > yolo_meta.iou_threshold) {
+        discard.insert(idx);
+      }
+    }
+  }
+
+  std::vector<unsigned> discard_idx(discard.size());
+  std::vector<unsigned> indexes(discard.begin(), discard.end());
+  std::sort(indexes.begin(), indexes.end());
+  std::sort(keep.begin(), keep.end());
+  std::vector<unsigned>::iterator it =
+      std::set_difference(indexes.begin(), indexes.end(), keep.begin(), keep.end(), discard_idx.begin());
+  discard_idx.resize(it - discard_idx.begin());
+
+  // remove the filtered predicted bounding boxes.
+  reduce_by_index(prediction.x_max, discard_idx);
+  reduce_by_index(prediction.x_min, discard_idx);
+  reduce_by_index(prediction.y_max, discard_idx);
+  reduce_by_index(prediction.y_min, discard_idx);
+  reduce_by_index(prediction.cls_score, discard_idx);
+  reduce_by_index(prediction.cls, discard_idx);
+  prediction.size -= discard_idx.size();
+}
+
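+// Sketch of the suppression above, assuming yolo_meta.iou_threshold == 0.5: given boxes
+// A (score 0.9), B (score 0.8, IoU(A, B) = 0.7), and C (score 0.6, IoU(A, C) = 0.1),
+// the first pass keeps A and discards B; the second pass keeps C, since C only overlaps
+// the already-kept A below the threshold. The final keep set is {A, C}.
+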
+// Computes the actual corner coordinates of the groundtruth bounding boxes.
+void resize_annotation_boxes(Tensor3d<double> &raw_annotations, AnnotationEntry &annotation, const unsigned batch) {
+  for (auto &gt_box : raw_annotations[batch]) {
+    annotation.x_max.emplace_back(gt_box[BBOX_X] + gt_box[BBOX_W]);
+    annotation.x_min.emplace_back(gt_box[BBOX_X]);
+    annotation.y_max.emplace_back(gt_box[BBOX_Y] + gt_box[BBOX_H]);
+    annotation.y_min.emplace_back(gt_box[BBOX_Y]);
+    // an annotation box stores its class id in the slot prediction boxes use for confidence
+    annotation.cls.emplace_back(gt_box[BBOX_CONFIDENCE]);
+  }
+  annotation.size = raw_annotations[batch].size();
+}
+
+// Limits the coordinates of predicted bounding boxes to within the dimensions of the source image.
+void clip_box(PredictionEntry &prediction, const unsigned batch) {
+  if (prediction.size == 0) return;
+  double x_upper_bound = runtime_vars.source_image_sizes[batch][IMG_W];
+  double y_upper_bound = runtime_vars.source_image_sizes[batch][IMG_H];
+  auto _clip = [](double v, double lower, double upper) { return (v < lower) ? lower : ((v > upper) ? upper : v); };
+  for (unsigned idx = 0; idx < prediction.size; idx++) {
+    prediction.x_max[idx] = _clip(prediction.x_max[idx], 0, x_upper_bound);
+    prediction.x_min[idx] = _clip(prediction.x_min[idx], 0, x_upper_bound);
+    prediction.y_max[idx] = _clip(prediction.y_max[idx], 0, y_upper_bound);
+    prediction.y_min[idx] = _clip(prediction.y_min[idx], 0, y_upper_bound);
+  }
+}
+
+// Calculates the area under the precision-recall curve using an `interval`-point
+// interpolated sum (11 points for the PASCAL VOC metric).
+double average_precision(const std::vector<double> &precision, const std::vector<double> &recall, unsigned interval) {
+  double result = 0.0;
+  double step = 1 / (double)(interval - 1);
+  for (unsigned intvl = 0; intvl < interval; intvl++) {
+    double point = step * intvl;
+    double max_precision = 0.0;
+    for (unsigned idx = 0; idx < recall.size(); idx++) {
+      if (recall[idx] >= point) {
+        if (precision[idx] > max_precision) {
+          max_precision = precision[idx];
+        }
+      }
+    }
+    result += max_precision / (double)interval;
+  }
+  return result;
+}
+
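+// Worked example of the interpolation above with interval == 11: the sampled points are
+// 0.0, 0.1, ..., 1.0. Given P-R samples precision = {1.0, 0.5} at recall = {0.4, 0.8},
+// the max precision is 1.0 for points <= 0.4 (5 points), 0.5 for 0.5..0.8 (4 points),
+// and 0.0 beyond, giving AP = (5*1.0 + 4*0.5 + 2*0.0) / 11 ≈ 0.636.
+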
+// Stores intermediate statistics for the AP calculation. APs are calculated from
+// the true positives, false positives, and the number of targets, sorted
+// by the class score of the predicted bounding box.
+typedef struct _map_stats {
+  int num_gt_object;
+  std::vector<double> scores;
+  std::vector<int> true_positive;
+  std::vector<int> false_positive;
+
+  _map_stats() { this->num_gt_object = 0; }
+} mAPStats;
+
+// Collects the per-class statistics used for the interpolated AP at IoU threshold `thresh`.
+std::vector<mAPStats> mean_average_precision(PredictionEntry &prediction, AnnotationEntry &annotation, double thresh) {
+  std::vector<int> class_list(yolo_meta.num_classes);
+  std::generate(class_list.begin(), class_list.end(), [n = 0]() mutable { return n++; });
+
+  std::vector<mAPStats> image_result(yolo_meta.num_classes, mAPStats{});
+
+  // average precision for each class
+  for (int category : class_list) {
+    // total number of bounding boxes in the annotation.
+    int num_gt_object =
+        std::count_if(annotation.cls.begin(), annotation.cls.end(), [&](int &cls) { return (cls == (int)category); });
+
+    // total number of predicted bounding boxes.
+    int num_pred_boxes =
+        std::count_if(prediction.cls.begin(), prediction.cls.end(), [&](int &cls) { return (cls == (int)category); });
+
+    image_result[category].num_gt_object = num_gt_object;
+
+    // stores the scores to later sort the TPs and FPs into the correct order.
+    image_result[category].true_positive.resize(num_pred_boxes, 0);
+    image_result[category].false_positive.resize(num_pred_boxes, 0);
+    image_result[category].scores.resize(num_pred_boxes, 0.0);
+    std::set<unsigned> matched_gtbox;
+
+    unsigned pred_num = 0;
+    std::vector<unsigned> &&sorted_pbox_idx = argsort_gt(prediction.cls_score);
+    for (unsigned &pbox_idx : sorted_pbox_idx) {
+      if (prediction.cls[pbox_idx] != category) continue;
+      image_result[category].scores[pred_num] = prediction.cls_score[pbox_idx];
+
+      unsigned most_overlapped_idx = 0;
+      double most_overlapped_iou = 0.0;
+
+      // finds the most overlapped groundtruth bounding box.
+      for (unsigned gtbox_idx = 0; gtbox_idx < annotation.size; gtbox_idx++) {
+        if (annotation.cls[gtbox_idx] != category) continue;
+        double iou = intersection_over_union(prediction.box_at(pbox_idx), annotation.box_at(gtbox_idx));
+        if (iou > most_overlapped_iou) {
+          most_overlapped_iou = iou;
+          most_overlapped_idx = gtbox_idx;
+        }
+      }
+      // when there is no ground truth, all predicted boxes are false positives,
+      // and they are preserved for the batched AP calculation.
+      if (!num_gt_object) {
+        image_result[category].false_positive[pred_num++] = 1;
+      } else {
+        // the predicted bounding box is a true positive iff it is the most overlapped one,
+        // the matched groundtruth bounding box has not been matched previously, and
+        // the IoU is above `thresh`.
+        if (most_overlapped_iou >= thresh) {
+          if (matched_gtbox.find(most_overlapped_idx) == matched_gtbox.end()) {
+            matched_gtbox.insert(most_overlapped_idx);
+            image_result[category].true_positive[pred_num++] = 1;
+          } else {
+            image_result[category].false_positive[pred_num++] = 1;
+          }
+        } else {
+          image_result[category].false_positive[pred_num++] = 1;
+        }
+      }
+    }
+  }
+  return image_result;
+}
+
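+// Example of the greedy matching above, at, say, thresh == 0.5: two predictions of the
+// same class, sorted by score, both overlap one groundtruth box with IoU 0.9 and 0.7.
+// The higher-scored prediction claims the groundtruth box (a TP); the second finds it
+// already in `matched_gtbox` and becomes a FP, which is how duplicates are penalized.
+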
+// Initializes the runtime variables in the `runtime_vars` struct.
+void set_runtime(std::string name,
+                 unsigned niter,
+                 unsigned batch_size,
+                 const std::string &input_loc,
+                 const std::string &annotation_loc) {
+  runtime_vars.name = name;
+  runtime_vars.niter = niter;
+  runtime_vars.batch_size = batch_size;
+  runtime_vars.groundtruth_loc = annotation_loc;
+  runtime_vars.input_loc = input_loc;
+}
+
+// Return type of the function `validate_yolo`.
+struct metrics {
+  std::vector<mAPStats> map;
+  Tensor2d<mAPStats> coco;
+};
+
+// Main function that takes one image's results data and annotations, and calculates
+// the mAP statistics for that image.
+struct metrics validate_yolo(std::vector<double> &results_data,
+                             Tensor3d<double> &raw_annotations,
+                             const unsigned batch) {
+  Tensor2d<double> raw_predictions;
+  PredictionEntry prediction;
+  AnnotationEntry annotation;
+
+  // executes the accuracy check recipes.
+  try {
+    parse_prediction_boxes(results_data, raw_predictions);
+    resize_and_filter_prediction_boxes(raw_predictions, prediction, batch);
+    resize_annotation_boxes(raw_annotations, annotation, batch);
+    nms(prediction);
+    clip_box(prediction, batch);
+  } catch (const std::exception &e) {
+    slog::err << "Aborting postprocessing." << slog::endl;
+    std::cerr << e.what() << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // mAP
+  std::vector<mAPStats> map_stats = mean_average_precision(prediction, annotation, yolo_meta.pascal_voc_metric);
+
+  // COCO metric
+  Tensor2d<mAPStats> coco_ap_stats;
+  std::for_each(std::begin(yolo_meta.coco_metric), std::end(yolo_meta.coco_metric), [&](const double thresh) {
+    coco_ap_stats.emplace_back(mean_average_precision(prediction, annotation, thresh));
+  });
+
+  return {map_stats, coco_ap_stats};
+}
+
+// Appends all of the elements of `v2` to the end of `v1` in order.
+template <typename T>
+void extend(std::vector<T> &v1, const std::vector<T> &v2) {
+  v1.reserve(v1.size() + v2.size());
+  v1.insert(v1.end(), v2.begin(), v2.end());
+}
+
+// Updates the batched statistics with an individual image's result. The final batched AP
+// and COCO AP are calculated from the updated `batched_stats`.
+void metrics_update(std::vector<mAPStats> &batched_stats, const std::vector<mAPStats> &img_stats) {
+  for (unsigned cat = 0; cat < yolo_meta.num_classes; cat++) {
+    batched_stats[cat].num_gt_object += img_stats[cat].num_gt_object;
+    // updates the batched statistics. skips classes with no predictions.
+    if (!img_stats[cat].scores.size()) continue;
+    extend(batched_stats[cat].scores, img_stats[cat].scores);
+    extend(batched_stats[cat].true_positive, img_stats[cat].true_positive);
+    extend(batched_stats[cat].false_positive, img_stats[cat].false_positive);
+  }
+}
+
+// Calculates the AP from the accumulated statistics using an `interval`-point integral.
+double metrics_eval(const std::vector<mAPStats> &stats, unsigned interval) {
+  std::vector<double> class_aps;
+  for (unsigned category = 0; category < yolo_meta.num_classes; category++) {
+    // skips classes with no predictions.
+    if (!stats[category].scores.size()) continue;
+    // all predictions are false positives when there is no groundtruth for this
+    // class, and therefore the class AP is 0.0
+    if (stats[category].num_gt_object == 0 && stats[category].scores.size()) {
+      class_aps.push_back(0.0);
+      continue;
+    }
+
+    int TP = 0, FP = 0;
+    std::vector<double> precision, recall;
+
+    // sorts the TPs and FPs by descending confidence score.
+    std::vector<unsigned> &&sorted_stats_index = argsort_gt(stats[category].scores);
+    // accumulates the intermediate statistics.
+    for (unsigned idx : sorted_stats_index) {
+      TP += stats[category].true_positive[idx];
+      FP += stats[category].false_positive[idx];
+      precision.emplace_back(TP / (double)(TP + FP));
+      recall.emplace_back(TP / (double)stats[category].num_gt_object);
+    }
+    // area under the P-R curve for this class.
+    class_aps.emplace_back(average_precision(precision, recall, interval));
+  }
+  return std::accumulate(class_aps.begin(), class_aps.end(), 0.0) / (double)class_aps.size();
+}
+
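+// Worked example for the accumulation above: with num_gt_object == 2 and the sorted
+// predictions yielding TP/FP flags {1, 0, 1}, the running pairs are
+//   precision = {1/1, 1/2, 2/3} and recall = {0.5, 0.5, 1.0},
+// which `average_precision` then integrates.
+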
+// Wrapper of the function `validate_yolo`. This function prepares data, dispatches the
+// metrics calculation for each validation image, accumulates the metrics results, and
+// returns the batched mAP and COCO AP.
+std::pair<double, double> validate_yolo_wrapper(std::map<std::string, ov::TensorVector> &raw_results,
+                                                const std::vector<ov::Output<const ov::Node>> &result_layout,
+                                                std::vector<std::string> input_files) {
+  slog::info << "Start validating yolo." << slog::endl;
+  std::ofstream fout;
+  fout.open("ap_report.txt");
+  // preserves all correct paths to validation images.
+  int num_image = runtime_vars.niter * runtime_vars.batch_size;
+  std::vector<std::string> input_image_paths;
+  std::sort(std::begin(input_files), std::end(input_files));
+  // input_files is guaranteed not to be empty since that case is filtered out earlier.
+  for (auto &path : input_files) {
+    if (path == "") break;
+    if (num_image == 0) break;
+    input_image_paths.push_back(path);
+    num_image--;
+  }
+
+  // checks that enough image files exist; this should always pass unless an image file
+  // is deleted right after the inference step.
+  if (num_image != 0) {
+    slog::err << "Not enough image input found. " << runtime_vars.batch_size * runtime_vars.niter << " required, "
+              << (runtime_vars.batch_size * runtime_vars.niter - num_image) << " provided." << slog::endl;
+    exit(EXIT_FAILURE);
+  }
+  // stores all annotation boxes for each image from the groundtruth files.
+  // if an input image does not have a corresponding groundtruth file, an error occurs.
+  Tensor3d<double> raw_annotations;
+  int err = collect_validation_dataset(input_image_paths, raw_annotations, runtime_vars.source_image_sizes);
+  if (err) exit(EXIT_FAILURE);
+
+  // updates the metrics one image at a time to reduce memory overhead. the result for each
+  // image is accumulated in `batched_stats`, which is used for the batched mAP and COCO AP
+  // calculation.
+  metrics batched_stats;
+  batched_stats.map.resize(yolo_meta.num_classes, mAPStats{});
+  batched_stats.coco.resize(yolo_meta.coco_metric.size(), std::vector<mAPStats>{});
+  std::for_each(batched_stats.coco.begin(), batched_stats.coco.end(), [&](std::vector<mAPStats> &stats) {
+    stats.resize(yolo_meta.num_classes, mAPStats{});
+  });
+
+  for (unsigned batch = 0; batch < runtime_vars.niter; batch++) {
+    for (unsigned img = 0; img < runtime_vars.batch_size; img++) {
+      // stores the flattened output tensors from the resulting convolution layers.
+      std::vector<double> curr_img_data;
+      for (auto &item : result_layout) {
+        const std::string &name = item.get_any_name();
+        auto curr_outputBlob = raw_results.at(name).at(batch);
+        auto output_tensor_start = curr_outputBlob.data<float>();
+        unsigned output_size = curr_outputBlob.get_size() / runtime_vars.batch_size;
+        unsigned offset = img * output_size;
+        for (unsigned idx = 0; idx < output_size; idx++) {
+          curr_img_data.push_back(output_tensor_start[idx + offset]);
+        }
+      }
+
+      struct metrics &&curr_img_stats =
+          validate_yolo(curr_img_data, raw_annotations, img + batch * runtime_vars.batch_size);
+      metrics_update(batched_stats.map, curr_img_stats.map);
+      for (unsigned thresh = 0; thresh < yolo_meta.coco_metric.size(); thresh++) {
+        metrics_update(batched_stats.coco[thresh], curr_img_stats.coco[thresh]);
+      }
+
+      double img_AP = metrics_eval(curr_img_stats.map, yolo_meta.ap_interval);
+      // fout << "image " << input_files[img] << " AP @ 0.5" << std::endl;
+      fout << std::fixed << std::setprecision(10) << img_AP << std::endl;
+    }
+  }
+
+  double map = metrics_eval(batched_stats.map, yolo_meta.ap_interval);
+  double coco_ap = 0.0;
+  for (auto &coco_stats : batched_stats.coco) {
+    coco_ap += metrics_eval(coco_stats, yolo_meta.coco_interval);
+  }
+  coco_ap /= (double)yolo_meta.coco_metric.size();
+
+  fout << "\nAP at IoU=.50: " << std::fixed << std::setprecision(6) << map * 100 << "%" << std::endl;
+  fout << "AP at IoU=.50:.05:.95: " << std::fixed << std::setprecision(10) << coco_ap * 100 << "%" << std::endl;
+  fout.close();
+
+  std::cout << "ap_report.txt has been generated in the current directory." << std::endl;
+
+  return std::make_pair(map, coco_ap);
+}
