Diffstat (limited to 'python/openvino/runtime/dla_benchmark')
-rw-r--r--  python/openvino/runtime/dla_benchmark/CMakeLists.txt                82
-rw-r--r--  python/openvino/runtime/dla_benchmark/README.md                     179
-rw-r--r--  python/openvino/runtime/dla_benchmark/average_precision.cpp         696
-rw-r--r--  python/openvino/runtime/dla_benchmark/average_precision.hpp         156
-rwxr-xr-x  python/openvino/runtime/dla_benchmark/convert_annotations.py        90
-rw-r--r--  python/openvino/runtime/dla_benchmark/dla_benchmark.hpp             495
-rw-r--r--  python/openvino/runtime/dla_benchmark/infer_request_wrap.hpp        168
-rw-r--r--  python/openvino/runtime/dla_benchmark/inputs_filling.cpp            885
-rw-r--r--  python/openvino/runtime/dla_benchmark/inputs_filling.hpp            45
-rw-r--r--  python/openvino/runtime/dla_benchmark/main.cpp                       1575
-rw-r--r--  python/openvino/runtime/dla_benchmark/progress_bar.hpp              52
-rw-r--r--  python/openvino/runtime/dla_benchmark/shared_tensor_allocator.hpp   55
-rw-r--r--  python/openvino/runtime/dla_benchmark/statistics_report.cpp         149
-rw-r--r--  python/openvino/runtime/dla_benchmark/statistics_report.hpp         83
-rw-r--r--  python/openvino/runtime/dla_benchmark/top1_top5.hpp                  222
-rw-r--r--  python/openvino/runtime/dla_benchmark/utils.cpp                      689
-rw-r--r--  python/openvino/runtime/dla_benchmark/utils.hpp                      249
17 files changed, 5870 insertions, 0 deletions
diff --git a/python/openvino/runtime/dla_benchmark/CMakeLists.txt b/python/openvino/runtime/dla_benchmark/CMakeLists.txt
new file mode 100644
index 0000000..3a50459
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/CMakeLists.txt
@@ -0,0 +1,82 @@
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set (TARGET_NAME "dla_benchmark")
+
+if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+ set(CMAKE_CXX_STANDARD 20)
+else()
+ set (CMAKE_CXX_STANDARD 14)
+endif()
+set (CMAKE_CXX_STANDARD_REQUIRED ON)
+if (NOT WIN32)
+ if (NOT("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel"))
+ set (CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}")
+ endif()
+endif()
+
+file (GLOB MAIN_SRC
+ ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/../common/utils/src/*.cpp
+)
+
+file (GLOB MAIN_HEADERS
+ ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
+)
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+source_group("src" FILES ${MAIN_SRC})
+source_group("include" FILES ${MAIN_HEADERS})
+
+if (DE10_AGILEX)
+ add_definitions(-DDE10_AGILEX)
+endif()
+
+# Find required OpenCV components
+find_package(OpenCV COMPONENTS core highgui imgcodecs imgproc videoio REQUIRED)
+
+# Create the executable from the sources.
+add_executable(${TARGET_NAME} ${MAIN_SRC} ${MAIN_HEADERS})
+
+# For FPGA plugin configs and properties.
+target_include_directories(${TARGET_NAME} PRIVATE
+ "$ENV{COREDLA_ROOT}/dla_plugin/inc/"
+ "$ENV{COREDLA_ROOT}/util/inc/"
+)
+
+if (NOT WIN32)
+ set (LIB_DL dl)
+endif()
+
+target_link_libraries(${TARGET_NAME} PRIVATE
+ openvino::runtime
+ openvino_dev_api
+ ${OpenCV_LIBRARIES}
+ coreDLAHeteroPlugin
+ format_reader
+ ie_samples_utils
+)
+
+if (NOT WIN32)
+ target_link_libraries(${TARGET_NAME} PRIVATE ${LIB_DL} pthread)
+endif()
+
+set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../lib")
+
+# For libcoreDlaRuntimePlugin.so - typically specified by $COREDLA_ROOT/runtime/plugins.xml
+set_target_properties(${TARGET_NAME} PROPERTIES BUILD_RPATH "\$ORIGIN/..")
+
+# Enable high graph logging by defining its macro
+# Change to add_compile_definitions() once we move to cmake >= 3.12
+if (DLA_ENABLE_LOGGING)
+ target_compile_definitions(${TARGET_NAME} PRIVATE -DENABLE_HG_LOGGING)
+endif()
+
+# Ensure the number of inference requests is 1 when using the system-console plugin
+if (SYSTEM_CONSOLE_PLATFORM)
+ target_compile_definitions(${TARGET_NAME} PRIVATE -DMAX_NUM_INFERENCE_REQUEST=1)
+endif()
+
+install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/bin" COMPONENT DEMO)
diff --git a/python/openvino/runtime/dla_benchmark/README.md b/python/openvino/runtime/dla_benchmark/README.md
new file mode 100644
index 0000000..9734013
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/README.md
@@ -0,0 +1,179 @@
+# Benchmark C++ Tool
+
+This topic demonstrates how to use the Benchmark C++ Tool to estimate deep learning inference performance on supported devices. Performance can be measured for two inference modes: synchronous (latency-oriented) and asynchronous (throughput-oriented).
+
+> **NOTE:** This topic describes usage of C++ implementation of the Benchmark Tool. For the Python* implementation, refer to [Benchmark Python* Tool](../python_demos/OpenVINO_benchmark_app/README.md).
+
+## New Features Added
+
+Some of the changes made in the dla_benchmark C++ tool for the Intel FPGA AI Suite are:
+* Dumping output values into a text file named `result.txt`.
+* In the `result.txt` file, each output value is followed by its output tensor index after the `#` sign, which makes it easier to identify values when the graph has multiple outputs (see the illustrative snippet below).
+* In addition to `result.txt`, the dla_benchmark generates another text file named `result_tensor_boundaries.txt` that lists which lines of `result.txt` belong to which output tensor, as well as the layout and dimensions of each output tensor.
+* A Top1/top5 accuracy check is added.
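+
+For illustration only (the exact formatting may differ), a graph with two output tensors might produce `result.txt` lines such as:
+```
+0.0312500 #0
+0.1250000 #0
+0.0078125 #1
+```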
+
+> **NOTE**: The remainder of this README is taken directly from OpenVINO.
+
+## How It Works
+
+Upon start-up, the application reads command-line parameters and loads a network and images/binary files to the Inference Engine plugin, which is chosen depending on a specified device. The number of infer requests and execution approach depend on the mode defined with the `-api` command-line parameter.
+
+> **NOTE**: By default, Inference Engine samples, tools and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+
+If you run the application in the synchronous mode, it creates one infer request and executes the `Infer` method.
+If you run the application in the asynchronous mode, it creates as many infer requests as specified in the `-nireq` command-line parameter and executes the `StartAsync` method for each of them. If `-nireq` is not set, the application uses the default value for the specified device.
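+
+For example, a sketch of an asynchronous run that sets the number of infer requests and iterations explicitly (the model and input paths are placeholders):
+```sh
+./dla_benchmark -m <model>.xml -i <input> -d CPU -api async -nireq 4 -niter 100
+```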
+
+The number of execution steps is defined by one of the following parameters:
+* The number of iterations specified with the `-niter` command-line argument
+* The time duration specified with the `-t` command-line argument
+* Both of them (execution continues until both conditions are met)
+* A predefined duration if neither `-niter` nor `-t` is specified. The predefined duration depends on the device.
+
+During the execution, the application collects the latency of each executed infer request.
+
+The reported latency is calculated as the median of all collected latencies. The reported throughput is expressed
+in frames per second (FPS) and is derived from:
+* The reported latency in the Sync mode
+* The total execution time in the Async mode
+
+The throughput value also depends on the batch size.
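+
+For example, the CPU sample output shown below reports 4612 iterations over 60110.04 ms with a throughput of 76.73 FPS, i.e. roughly 4612 / 60.11 ≈ 76.7 frames per second assuming a batch size of 1; with a batch size of N, each iteration contributes N frames.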
+
+The application can save a summary of the run, including the selected command-line parameters and a copy of the high-level execution statistics (e.g. overall throughput, execution wall-clock time), by setting the `-save_run_summary` flag. This summary is saved in `dla_benchmark_run_summary.csv`.
+
+The application also saves executable graph information serialized to an XML file if you specify a path to it with the
+`-exec_graph_path` parameter.
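+
+For example (a sketch; the model, input, and output paths are placeholders):
+```sh
+./dla_benchmark -m <model>.xml -i <input> -d CPU -save_run_summary -exec_graph_path exec_graph.xml
+```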
+
+
+## Run the Tool
+Note that the dla_benchmark usually produces optimal performance for any device out of the box.
+
+**In most cases you don't need to tune the application options explicitly, and the plain device name is enough**, for example, for CPU:
+```sh
+./dla_benchmark -m <model> -i <input> -d CPU
+```
+
+However, the defaults may still be non-optimal in some cases, especially for very small networks. More details can be found in [Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md).
+
+As explained in the [Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md) section, for all devices, including the new [MULTI device](./docs/IE_DG/supported_plugins/MULTI.md), it is preferable to use the FP16 IR for the model.
+Also, if the latency of CPU inference on multi-socket machines is of concern, please refer to the same
+[Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md) document.
+
+Running the application with the `-h` option yields the following usage message:
+```
+./dla_benchmark -h
+InferenceEngine:
+ API version ............ <version>
+ Build .................. <number>
+[ INFO ] Parsing input parameters
+
+dla_benchmark [OPTION]
+Options:
+
+ -h, --help Print a usage message
+ -i "<path>" Optional. Path to a folder with images and/or binaries or to specific image or binary file.
+ -m "<path>" Required. Path to an .xml file with a trained model.
+ -d "<device>" Optional. Specify a target device to infer on (the list of available devices is shown below). Default value is CPU.
+ Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin.
+ -l "<absolute_path>" Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.
+ -api "<sync/async>" Optional. Enable Sync/Async API. Default value is "async".
+ -niter "<integer>" Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device.
+ -nireq "<integer>" Optional. Number of infer requests. Default value is determined automatically for a device.
+ -b "<integer>" Optional. Batch size value. If not specified, the batch size value is determined from Intermediate Representation.
+ -stream_output Optional. Print progress as a plain text. When specified, an interactive progress bar is replaced with a multiline output.
+ -t Optional. Time in seconds to execute topology.
+ -progress Optional. Show progress bar (can affect performance measurement). Default values is "false".
+
+ CPU-specific performance options:
+ -nstreams "<integer>" Optional. Number of streams to use for inference on the CPU in throughput mode
+ (for HETERO device cases use format <device1>:<nstreams1>,<device2>:<nstreams2> or just <nstreams>).
+ Default value is determined automatically for a device.
+ Please note that although the automatic selection usually provides a reasonable performance,
+ it still may be non-optimal for some cases, especially for very small networks.
+ -nthreads "<integer>" Optional. Number of threads to use for inference on the CPU (including HETERO cases).
+ -pin "YES"/"NUMA"/"NO" Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO")
+ CPU threads pinning for CPU-involved inference.
+
+ Statistics dumping options:
+ -save_run_summary Optional. Enable saving a summary of the run containing the specified command-line parameters and a copy of the performance report printed to stdout.
+ -report_folder Optional. Path to a folder where statistics report is stored.
+ -exec_graph_path Optional. Path to a file where to store executable graph information serialized.
+```
+
+Running the application with an empty list of options yields the usage message given above and an error message.
+
+The application supports topologies with one or more inputs. If a topology is not data-sensitive, you can skip the input parameter. In this case, inputs are filled with random values.
+If a model has only image input(s), please provide a folder with images or a path to an image as input.
+If a model has some specific input(s) (not images), please prepare binary file(s) filled with data of the appropriate precision and provide a path to them as input.
+If a model has mixed input types, the input folder should contain all required files. Image inputs are filled with image files one by one, and binary inputs are filled with binary files one by one.
+
+To run the tool, you can use public or Intel's pre-trained models. To download the models, use the OpenVINO [Model Downloader](./tools/downloader/README.md) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
+
+> **NOTE**: Before running the tool with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
+## Examples of Running the Tool
+
+This section provides step-by-step instructions on how to run the Benchmark Tool with the `googlenet-v1` public model on CPU or FPGA devices. As an input, the `car.png` file from the `<INSTALL_DIR>/deployment_tools/demo/` directory is used.
+
+> **NOTE:** Internet access is required to execute the following steps successfully. If you have access to the Internet only through a proxy server, please make sure that it is configured in your OS environment.
+
+1. Download the model. Go to the Model Downloader directory and run the `downloader.py` script, specifying the model name and the directory to download the model to:
+ ```sh
+   cd <INSTALL_DIR>/deployment_tools/open_model_zoo/tools/downloader
+ ```
+ ```sh
+ python3 downloader.py --name googlenet-v1 -o <models_dir>
+ ```
+2. Convert the model to the Inference Engine IR format. Go to the Model Optimizer directory and run the `mo.py` script, specifying the path to the model, the model format (which must be FP32 for CPU and FPGA) and the output directory to generate the IR files:
+ ```sh
+ cd <INSTALL_DIR>/deployment_tools/model_optimizer
+ ```
+ ```sh
+ python3 mo.py --input_model <models_dir>/public/googlenet-v1/googlenet-v1.caffemodel --data_type FP32 --output_dir <ir_dir>
+ ```
+3. Run the tool, specifying the `<INSTALL_DIR>/deployment_tools/demo/car.png` file as an input image, the IR of the `googlenet-v1` model, and a device to perform inference on. The following commands demonstrate running the Benchmark Tool in the asynchronous mode on CPU and FPGA devices:
+
+ * On CPU:
+ ```sh
+ ./dla_benchmark -m <ir_dir>/googlenet-v1.xml -d CPU -api async -i <INSTALL_DIR>/deployment_tools/demo/car.png --progress true
+ ```
+ * On FPGA:
+ ```sh
+ ./dla_benchmark -m <ir_dir>/googlenet-v1.xml -d HETERO:FPGA,CPU -api async -i <INSTALL_DIR>/deployment_tools/demo/car.png --progress true
+ ```
+
+The application outputs the number of executed iterations, total duration of execution, latency and throughput.
+Additionally, if you set the `-save_run_summary` flag, the application saves a report containing the selected command-line parameters and a copy of the overall performance report printed to stdout. If you set `-exec_graph_path`, the application saves the serialized executable graph information. All measurements are reported in milliseconds.
+
+Below are fragments of sample output for CPU and FPGA devices:
+
+* For CPU:
+ ```
+  [Step 8/9] Measuring performance (Start inference asynchronously, 60000 ms duration, 4 inference requests in parallel using 4 streams)
+ Progress: [....................] 100.00% done
+
+ [Step 9/9] Dumping statistics report
+ [ INFO ] Statistics collecting was not requested. No reports are dumped.
+ Progress: [....................] 100.00% done
+
+ Count: 4612 iterations
+ Duration: 60110.04 ms
+ Latency: 50.99 ms
+ Throughput: 76.73 FPS
+ ```
+
+* For FPGA:
+ ```
+ [Step 10/11] Measuring performance (Start inference asynchronously, 5 inference requests using 4 streams for CPU, limits: 120000 ms duration)
+ Progress: [....................] 100% done
+
+ [Step 11/11] Dumping statistics report
+ Count: 102515 iterations
+ Duration: 120007.38 ms
+ Latency: 5.84 ms
+  Throughput: 854.24 FPS
+ ```
+
+## See Also
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
+* [Model Downloader](./tools/downloader/README.md)
diff --git a/python/openvino/runtime/dla_benchmark/average_precision.cpp b/python/openvino/runtime/dla_benchmark/average_precision.cpp
new file mode 100644
index 0000000..84008b7
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/average_precision.cpp
@@ -0,0 +1,696 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// The function of this file is to provide mAP and COCO AP calculation in metrics_eval
+// and metrics_update. The calculation consists of two parts: 1) data preprocessing,
+// and 2) metrics calculation. Data preprocessing consists of prediction box parsing;
+// resizing and filtering; non-max suppression; and clipping. The preprocessed data is stored
+// in `PredictionEntry` and `AnnotationEntry` structs, which are used in `metrics_update`
+// and `metrics_eval`. `metrics_update` accumulates intermediate statistics to form the batched
+// statistics, and `metrics_eval` calculates the area under the precision-recall (P-R) curve. All of
+// the metadata should be set in the header file and the runtime invariants are set using
+// `set_runtime`. The validate_yolo_wrapper is the main entry point of the subroutine.
+//
+// The mAP algorithm is built according to section 2.2 of https://arxiv.org/pdf/1607.03476.pdf
+// and OpenVINO's accuracy_checker. The COCO AP algorithm is specified in
+// https://cocodataset.org/#detection-eval. The result is compared value-by-value with the
+// result from OpenVINO's accuracy_checker using the dlsdk launcher. To obtain the golden
+// result, apply the steps in https://docs.openvino.ai/latest/omz_models_model_yolo_v3_tf.html.
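+//
+// A minimal caller-side sketch (a sketch only; the surrounding benchmark loop is assumed to have
+// already collected `raw_results`, the model output nodes `result_layout`, and the `input_files` list):
+//   set_runtime("yolo-v3-tf", niter, batch_size, input_dir, annotation_dir);
+//   std::pair<double, double> ap = validate_yolo_wrapper(raw_results, result_layout, input_files);
+//   // ap.first is mAP at IoU 0.5; ap.second is COCO AP averaged over IoU 0.50:0.05:0.95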
+
+#include "average_precision.hpp"
+#include <cmath>
+#if defined(_WIN32) || defined(_WIN64)
+#include <io.h>
+#else
+#include <dirent.h>
+#endif
+#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+#include <filesystem>
+namespace fs = std::filesystem;
+#endif
+#include <algorithm>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <utility>
+#include <sstream>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <samples/slog.hpp>
+#include "utils.hpp"
+
+#define VERBOSE 0
+
+// Parses the predicted boxes in `predicted_val` into the 2d tensor `raw_predictions`.
+// Order: conv2d_12[1x255x26x26] -> conv2d_9[1x255x13x13], NCHW order
+void parse_prediction_boxes(std::vector<double> &predicted_val, Tensor2d<double> &raw_predictions) {
+ raw_predictions.emplace_back(std::vector<double>{});
+ const std::vector<unsigned> &grid_sizes = yolo_meta.grid_sizes.at(runtime_vars.name);
+
+ int total_boxes{0};
+ std::for_each(std::begin(grid_sizes), std::end(grid_sizes), [&](unsigned n) {
+ total_boxes += std::pow(n, 2) * yolo_meta.box_per_channel;
+ });
+
+ for (int count = 0; count < total_boxes; count++) {
+ raw_predictions.emplace_back(Box<double>{});
+ raw_predictions[count].reserve(yolo_meta.pbox_size);
+ }
+
+ auto index_of = [=](int n, int c, int h, int w, int C, int H, int W) {
+ return n * C * H * W + c * H * W + h * W + w;
+ };
+
+  // boxes in the 26x26 grid come first
+  // treat each tensor as 3 batches
+ for (int grid : grid_sizes) {
+ // offset to where the data is retrieved
+ int data_offset{0};
+ // offset to where the data is inserted
+ int position_offset{0};
+ for (int n : grid_sizes) {
+ if (n == grid) break;
+ data_offset += pow(n, 2) * yolo_meta.channel;
+ position_offset += pow(n, 2) * yolo_meta.box_per_channel;
+ }
+
+ int N = yolo_meta.box_per_channel, C = yolo_meta.pbox_size, H = grid, W = grid;
+
+ for (int n = 0; n < N; n++) {
+ for (int c = 0; c < C; c++) {
+ for (int h = 0; h < H; h++) {
+ for (int w = 0; w < W; w++) {
+ // corresponds to #c data for grid #h,w, of the #n anchor
+ Box<double> &pbox = raw_predictions[position_offset + n * H * W + h * W + w];
+ // fills prediction boxes
+ pbox.emplace_back(predicted_val[data_offset + index_of(n, c, h, w, C, H, W)]);
+ }
+ }
+ }
+ }
+ }
+}
+
+// Parses annotation boxes stored in a text file and stores them in the 3d tensor `raw_annotation`.
+// Precondition: the file is formatted such that each line contains 5 doubles separated
+// by spaces, i.e. [class, x, y, width, height]. Returns -3 if the file cannot be read.
+int parse_annotation_boxes(Tensor3d<double> &raw_annotation, const std::string &path) {
+ int err = 0;
+ std::ifstream annotation_file(path);
+ if (!annotation_file.is_open()) {
+ slog::err << "Couldn't access path: " << path << slog::endl;
+ err = -3;
+ } else {
+ Tensor2d<double> annotation_box;
+ int class_id;
+ double x, y, w, h;
+ while (annotation_file >> class_id >> x >> y >> w >> h) {
+ annotation_box.emplace_back(Box<double>{x, y, w, h, (double)class_id});
+ }
+ raw_annotation.emplace_back(annotation_box);
+ }
+ return err;
+}
+
+// Extracts filenames in `path` with the extension specified in `ext`.
+// Returns the number of files with extension `ext`, or -1 on error.
+int walk_dirent(std::vector<std::string> &names, const std::string &path, std::string ext) {
+#if defined(_WIN32) || defined(_WIN64)
+#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+ int count = 0;
+ for (const auto &entry : fs::directory_iterator(path)) {
+ if (fs::is_regular_file(entry)) {
+ std::string filename = entry.path().filename().string();
+ std::string file_extension = filename.substr(filename.find_last_of(".") + 1);
+ if (file_extension == ext) {
+        // store the stem (filename without extension) so lookups by image stem match the POSIX branch below
+        names.emplace_back(entry.path().stem().string());
+ count++;
+ }
+ }
+ }
+#endif
+#else
+ DIR *dir = opendir(path.c_str());
+ int count = 0;
+ if (!dir) {
+ slog::err << "Couldn't access path: " << path << slog::endl;
+ count = -1;
+ } else {
+ for (struct dirent *dent; (dent = readdir(dir)) != nullptr;) {
+ std::string dirname(dent->d_name);
+ std::string stem = GetStem(dirname);
+ std::string extension = GetExtension(dirname);
+ if (stem == "" || stem == "." || extension != ext) continue;
+ names.emplace_back(stem);
+ count += 1;
+ }
+ closedir(dir);
+ }
+#endif
+ return count;
+}
+
+// Dispatches each step of collecting annotation boxes and source image shapes for the
+// validation images. The function returns 0 on success, -1 for a mismatch in the number of annotation files
+// and validation images, -2 for a missing annotation file, -3 for failing to access an annotation
+// file, and -4 for failing to access a validation image.
+int collect_validation_dataset(std::vector<std::string> &image_paths,
+ Tensor3d<double> &raw_annotations,
+ Tensor2d<double> &shapes) {
+ int err = 0;
+
+ // set of annotation file name
+ std::vector<std::string> tmp;
+ int num_file = walk_dirent(tmp, runtime_vars.groundtruth_loc, runtime_vars.gt_extension);
+ if (num_file < (int)(runtime_vars.batch_size * runtime_vars.niter)) {
+ if (num_file >= 0) {
+ slog::err << "Not enough validation data found. " << runtime_vars.batch_size * runtime_vars.niter << " required, "
+ << num_file << " provided." << slog::endl;
+ }
+ err = -1;
+ } else {
+ std::set<std::string> annotation_file_index(tmp.begin(), tmp.end());
+
+ // gets all images, corresponding annotation, and shapes
+ std::string gt_path;
+ for (unsigned batch = 0; batch < runtime_vars.batch_size * runtime_vars.niter; batch++) {
+ std::string image_path(image_paths[batch]);
+ std::string img_name = GetStem(image_path);
+ if (annotation_file_index.find(img_name) == annotation_file_index.end()) {
+ slog::err << "Missing annotation file for validation image: " << image_paths[batch] << slog::endl;
+ err = -2;
+ break;
+ } else {
+ gt_path = runtime_vars.groundtruth_loc + "/" + img_name + "." + runtime_vars.gt_extension;
+
+ // gets image dimensions
+ cv::Mat image = cv::imread(image_paths[batch]);
+ if (image.data == nullptr || image.empty()) {
+ slog::err << "Couldn't open input image: " << image_paths[batch] << slog::endl;
+ err = -4;
+ break;
+ }
+
+ err = parse_annotation_boxes(raw_annotations, gt_path);
+ if (err != 0) break;
+ shapes.emplace_back(Box<double>{(double)image.cols, (double)image.rows});
+ }
+ }
+ }
+ return err;
+}
+
+// Removes items at `indices` in the vector `vec`
+template <typename T>
+void reduce_by_index(std::vector<T> &vec, std::vector<unsigned> indices) {
+ std::sort(indices.begin(), indices.end());
+ for (auto it = indices.rbegin(); it != indices.rend(); it++) {
+ vec.erase(vec.begin() + *it);
+ }
+}
+
+// Calculates and returns the Intersection over Union score for two boxes by
+// calculating their area of overlap and area of union.
+double intersection_over_union(Box<double> box1, Box<double> box2) {
+ using namespace std;
+ {
+ double intersect_length_x =
+ max(0.0, min(box1[X_MAX], box2[X_MAX]) - max(box1[X_MIN], box2[X_MIN]) + yolo_meta.boundary);
+ double intersect_length_y =
+ max(0.0, min(box1[Y_MAX], box2[Y_MAX]) - max(box1[Y_MIN], box2[Y_MIN]) + yolo_meta.boundary);
+ double intersection_of_area = intersect_length_x * intersect_length_y;
+ double box1_area =
+ (box1[X_MAX] - box1[X_MIN] + yolo_meta.boundary) * (box1[Y_MAX] - box1[Y_MIN] + yolo_meta.boundary);
+ double box2_area =
+ (box2[X_MAX] - box2[X_MIN] + yolo_meta.boundary) * (box2[Y_MAX] - box2[Y_MIN] + yolo_meta.boundary);
+ double union_of_area = box1_area + box2_area - intersection_of_area;
+ return (union_of_area > 0.0) ? intersection_of_area / union_of_area : 0.0;
+  }
+}
+
+// This function returns the index of the largest element in the vector `vec`.
+template <typename T>
+int argmax(std::vector<T> vec) {
+ return std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()));
+}
+
+// This function returns the index of the largest element in the iterator from `begin` to `end`.
+template <typename Iter>
+int argmax(Iter begin, Iter end) {
+ return std::distance(begin, std::max_element(begin, end));
+}
+
+// Resizes the coordinates of bounding boxes from ratios relative to the grid cell to actual pixel coordinates.
+// This function resizes the prediction boxes in the 2d tensor `raw_predictions` based on the definition on page 4 of
+// https://arxiv.org/pdf/1612.08242.pdf. The prediction boxes are also filtered based on their confidence score
+// and class-specific score. The result is stored in an instance of `PredictionEntry`, which is used for statistics
+// calculation.
+void resize_and_filter_prediction_boxes(Tensor2d<double> &raw_predictions,
+ PredictionEntry &prediction,
+ const unsigned batch) {
+ unsigned size = 0;
+
+#if VERBOSE == 1
+ unsigned c12 = 0, c9 = 0, c58 = 0, c66 = 0, c74 = 0;
+#endif
+
+ for (unsigned grid : yolo_meta.grid_sizes.at(runtime_vars.name)) {
+ unsigned offset{0};
+ for (unsigned n : yolo_meta.grid_sizes.at(runtime_vars.name)) {
+ if (n == grid) break;
+ offset += pow(n, 2) * yolo_meta.box_per_channel;
+ }
+ for (unsigned x = 0; x < grid; x++) {
+ for (unsigned y = 0; y < grid; y++) {
+ for (unsigned n = 0; n < yolo_meta.box_per_channel; n++) {
+ unsigned bbox_idx = offset + n * pow(grid, 2) + y * grid + x;
+ Box<double> &bbox = raw_predictions[bbox_idx];
+
+ // find the predicted label as the class with highest score
+ int label = argmax(bbox.begin() + (yolo_meta.pbox_size - yolo_meta.num_classes), bbox.end());
+ double cls_score = bbox[BBOX_CONFIDENCE] * bbox[(yolo_meta.pbox_size - yolo_meta.num_classes) + label];
+ // filter outliers with low confidence score or class score
+ if (bbox[BBOX_CONFIDENCE] < yolo_meta.confidence_threshold || cls_score < yolo_meta.confidence_threshold)
+ continue;
+ prediction.cls.push_back(label);
+ prediction.cls_score.push_back(cls_score);
+#if VERBOSE == 1
+ c74 += (unsigned)(grid == 52);
+ c66 += (unsigned)(grid == 26);
+ c58 += (unsigned)(grid == 13);
+ c12 += (unsigned)(grid == 26);
+ c9 += (unsigned)(grid == 13);
+#endif
+ // deduce anchor box width and height
+ unsigned dim = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid).size() / yolo_meta.box_per_channel;
+ double anchor_w = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid)[n * dim];
+ double anchor_h = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid)[n * dim + 1];
+
+ // calculate width and height of bbox
+ double bbox_center_x = (bbox[BBOX_X] + x) / grid;
+ double bbox_center_y = (bbox[BBOX_Y] + y) / grid;
+ double bbox_w = exp(bbox[BBOX_W]) * anchor_w / yolo_meta.dst_image_size[IMG_W];
+ double bbox_h = exp(bbox[BBOX_H]) * anchor_h / yolo_meta.dst_image_size[IMG_H];
+
+ // calculate actual coordinates of bbox
+ double x_max, x_min, y_max, y_min;
+ double w = runtime_vars.source_image_sizes[batch][IMG_W];
+ double h = runtime_vars.source_image_sizes[batch][IMG_H];
+
+ x_max = w * (bbox_center_x + bbox_w / 2.0);
+ x_min = w * (bbox_center_x - bbox_w / 2.0);
+ y_max = h * (bbox_center_y + bbox_h / 2.0);
+ y_min = h * (bbox_center_y - bbox_h / 2.0);
+
+ prediction.x_max.emplace_back(x_max);
+ prediction.x_min.emplace_back(x_min);
+ prediction.y_max.emplace_back(y_max);
+ prediction.y_min.emplace_back(y_min);
+
+ size += 1;
+ }
+ }
+ }
+ }
+ prediction.size = size;
+#if VERBOSE == 1
+ if (runtime_vars.name == "yolo-v3-tf") {
+ slog::info << "prediction boxes from conv2d58: " << c58 << slog::endl;
+ slog::info << "prediction boxes from conv2d66: " << c66 << slog::endl;
+ slog::info << "prediction boxes from conv2d74: " << c74 << slog::endl;
+ } else if (runtime_vars.name == "yolo-v3-tiny-tf") {
+ slog::info << "prediction boxes from conv2d12: " << c12 << slog::endl;
+ slog::info << "prediction boxes from conv2d9: " << c9 << slog::endl;
+ }
+#endif
+}
+
+// Returns indices of `vec` sorted in descending order.
+std::vector<unsigned> argsort_gt(const std::vector<double> &vec) {
+ std::vector<unsigned> order(vec.size());
+ std::generate(order.begin(), order.end(), [n = 0]() mutable { return n++; });
+ std::sort(order.begin(), order.end(), [&](int i1, int i2) { return vec[i1] > vec[i2]; });
+ return order;
+}
+
+// Performs the non-maximum suppression (NMS) algorithm to eliminate redundant bounding boxes.
+// A bounding box is preserved iff it has the highest confidence score among all
+// overlapping bounding boxes.
+void nms(PredictionEntry &prediction) {
+ if (prediction.size == 0) return;
+ std::vector<unsigned> &&order = argsort_gt(prediction.cls_score);
+ std::vector<unsigned> keep;
+ std::set<unsigned> discard;
+ unsigned top_score_idx;
+
+ while (discard.size() < order.size()) {
+ bool has_top = false;
+ for (unsigned idx : order) {
+ if (discard.find(idx) != discard.end()) continue;
+ if (!has_top) {
+ has_top = true;
+ top_score_idx = idx;
+ keep.emplace_back(top_score_idx);
+ discard.insert(top_score_idx);
+ continue;
+ }
+ double iou = intersection_over_union(prediction.box_at(idx), prediction.box_at(top_score_idx));
+ if (iou > yolo_meta.iou_threshold) {
+ discard.insert(idx);
+ }
+ }
+ }
+
+ std::vector<unsigned> discard_idx(discard.size());
+ std::vector<unsigned> indexes(discard.begin(), discard.end());
+ std::sort(indexes.begin(), indexes.end());
+ std::sort(keep.begin(), keep.end());
+ std::vector<unsigned>::iterator it =
+ std::set_difference(indexes.begin(), indexes.end(), keep.begin(), keep.end(), discard_idx.begin());
+ discard_idx.resize(it - discard_idx.begin());
+
+ // remove filtered predicted bounding boxes.
+ reduce_by_index(prediction.x_max, discard_idx);
+ reduce_by_index(prediction.x_min, discard_idx);
+ reduce_by_index(prediction.y_max, discard_idx);
+ reduce_by_index(prediction.y_min, discard_idx);
+ reduce_by_index(prediction.cls_score, discard_idx);
+ reduce_by_index(prediction.cls, discard_idx);
+ prediction.size -= discard_idx.size();
+}
+
+// Converts the groundtruth bounding boxes from (x, y, width, height) to corner coordinates.
+void resize_annotation_boxes(Tensor3d<double> &raw_annotations, AnnotationEntry &annotation, const unsigned batch) {
+ for (auto &gt_box : raw_annotations[batch]) {
+ annotation.x_max.emplace_back(gt_box[BBOX_X] + gt_box[BBOX_W]);
+ annotation.x_min.emplace_back(gt_box[BBOX_X]);
+ annotation.y_max.emplace_back(gt_box[BBOX_Y] + gt_box[BBOX_H]);
+ annotation.y_min.emplace_back(gt_box[BBOX_Y]);
+ annotation.cls.emplace_back(gt_box[BBOX_CONFIDENCE]);
+ }
+ annotation.size = raw_annotations[batch].size();
+}
+
+// Clips the coordinates of predicted bounding boxes to the dimensions of the source image.
+void clip_box(PredictionEntry &prediction, const unsigned batch) {
+ if (prediction.size == 0) return;
+ double x_upper_bound = runtime_vars.source_image_sizes[batch][IMG_W];
+ double y_upper_bound = runtime_vars.source_image_sizes[batch][IMG_H];
+ auto _clip = [](double v, double lower, double upper) { return (v < lower) ? lower : ((v > upper) ? upper : v); };
+ for (unsigned idx = 0; idx < prediction.size; idx++) {
+ prediction.x_max[idx] = _clip(prediction.x_max[idx], 0, x_upper_bound);
+ prediction.x_min[idx] = _clip(prediction.x_min[idx], 0, x_upper_bound);
+ prediction.y_max[idx] = _clip(prediction.y_max[idx], 0, y_upper_bound);
+ prediction.y_min[idx] = _clip(prediction.y_min[idx], 0, y_upper_bound);
+ }
+}
+
+// Calculates the area under the P-R curve using an `interval`-point interpolated sum.
+double average_precision(const std::vector<double> &precision, const std::vector<double> &recall, unsigned interval) {
+ double result = 0.0;
+ double step = 1 / (double)(interval - 1);
+ for (unsigned intvl = 0; intvl < interval; intvl++) {
+ double point = step * intvl;
+ double max_precision = 0.0;
+ for (unsigned idx = 0; idx < recall.size(); idx++) {
+ if (recall[idx] >= point) {
+ if (precision[idx] > max_precision) {
+ max_precision = precision[idx];
+ }
+ }
+ }
+ result += max_precision / (double)interval;
+ }
+ return result;
+}
+
+// Stores intermediate statistics for AP calculation. APs are calculated from
+// true positives, false positives, and the number of ground-truth objects, sorted
+// by the class score of the predicted bounding boxes.
+typedef struct _map_stats {
+ int num_gt_object;
+ std::vector<double> scores;
+ std::vector<int> true_positive;
+ std::vector<int> false_positive;
+
+ _map_stats() { this->num_gt_object = 0; }
+} mAPStats;
+
+// Collects per-class true-positive/false-positive statistics at the IoU threshold `thresh`,
+// which are later interpolated into AP values by `metrics_eval`.
+std::vector<mAPStats> mean_average_precision(PredictionEntry &prediction, AnnotationEntry &annotation, double thresh) {
+ std::vector<int> class_list(yolo_meta.num_classes);
+ std::generate(class_list.begin(), class_list.end(), [n = 0]() mutable { return n++; });
+
+ std::vector<mAPStats> image_result(yolo_meta.num_classes, mAPStats{});
+
+ // average precision for each class
+ for (int category : class_list) {
+ // total number of bounding boxes in the annotation.
+ int num_gt_object =
+ std::count_if(annotation.cls.begin(), annotation.cls.end(), [&](int &cls) { return (cls == (int)category); });
+
+ // total number of predicted bounding boxes.
+ int num_pred_boxes =
+ std::count_if(prediction.cls.begin(), prediction.cls.end(), [&](int &cls) { return (cls == (int)category); });
+
+ image_result[category].num_gt_object = num_gt_object;
+
+ // stores the scores for sorting out the correct order of TP and FP.
+ image_result[category].true_positive.resize(num_pred_boxes, 0);
+ image_result[category].false_positive.resize(num_pred_boxes, 0);
+ image_result[category].scores.resize(num_pred_boxes, 0.0);
+ std::set<unsigned> matched_gtbox;
+
+ unsigned pred_num = 0;
+ std::vector<unsigned> &&sorted_pbox_idx = argsort_gt(prediction.cls_score);
+ for (unsigned &pbox_idx : sorted_pbox_idx) {
+ if (prediction.cls[pbox_idx] != category) continue;
+ image_result[category].scores[pred_num] = prediction.cls_score[pbox_idx];
+
+ unsigned most_overlapped_idx = 0;
+ double most_overlapped_iou = 0.0;
+
+ // finds the most overlapped predicted bounding box.
+ for (unsigned gtbox_idx = 0; gtbox_idx < annotation.size; gtbox_idx++) {
+ if (annotation.cls[gtbox_idx] != category) continue;
+ double iou = intersection_over_union(prediction.box_at(pbox_idx), annotation.box_at(gtbox_idx));
+ if (iou > most_overlapped_iou) {
+ most_overlapped_iou = iou;
+ most_overlapped_idx = gtbox_idx;
+ }
+ }
+ // when there is no ground truth, all predicted boxes are false positive,
+ // and they are preserved for batched AP calculation.
+ if (!num_gt_object) {
+ image_result[category].false_positive[pred_num++] = 1;
+ } else {
+ // the predicted bounding box is a true positive iff. it is the most overlapped,
+ // the matched groundtruth bounding box has not been matched previously, and
+ // the iou is above `thresh`.
+ if (most_overlapped_iou >= thresh) {
+ if (matched_gtbox.find(most_overlapped_idx) == matched_gtbox.end()) {
+ matched_gtbox.insert(most_overlapped_idx);
+ image_result[category].true_positive[pred_num++] = 1;
+ } else {
+ image_result[category].false_positive[pred_num++] = 1;
+ }
+ } else {
+ image_result[category].false_positive[pred_num++] = 1;
+ }
+ }
+ }
+ }
+ return image_result;
+}
+
+// Initializes runtime variables in `runtime_vars` struct.
+void set_runtime(std::string name,
+ unsigned niter,
+ unsigned batch_size,
+ const std::string &input_loc,
+ const std::string &annotation_loc) {
+ runtime_vars.name = name;
+ runtime_vars.niter = niter;
+ runtime_vars.batch_size = batch_size;
+ runtime_vars.groundtruth_loc = annotation_loc;
+ runtime_vars.input_loc = input_loc;
+}
+
+// Return type of function `validate_yolo`.
+struct metrics {
+ std::vector<mAPStats> map;
+ Tensor2d<mAPStats> coco;
+};
+
+// Main function that takes one image's results data and the parsed annotations, and calculates the mAP and COCO AP statistics for that image.
+struct metrics validate_yolo(std::vector<double> &results_data,
+ Tensor3d<double> &raw_annotations,
+ const unsigned batch) {
+ Tensor2d<double> raw_predictions;
+ PredictionEntry prediction;
+ AnnotationEntry annotation;
+
+ // executes accuracy check recipes.
+ try {
+ parse_prediction_boxes(results_data, raw_predictions);
+ resize_and_filter_prediction_boxes(raw_predictions, prediction, batch);
+ resize_annotation_boxes(raw_annotations, annotation, batch);
+ nms(prediction);
+ clip_box(prediction, batch);
+ } catch (const std::exception &e) {
+ slog::err << "Abort postprocessing." << slog::endl;
+ std::cerr << e.what() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // mAP
+ std::vector<mAPStats> map_stats = mean_average_precision(prediction, annotation, yolo_meta.pascal_voc_metric);
+
+ // COCO metric
+ Tensor2d<mAPStats> coco_ap_stats;
+ std::for_each(std::begin(yolo_meta.coco_metric), std::end(yolo_meta.coco_metric), [&](const double thresh) {
+ coco_ap_stats.emplace_back(mean_average_precision(prediction, annotation, thresh));
+ });
+
+ return {map_stats, coco_ap_stats};
+}
+
+// This function appends all of the elements in `v2` at the end of `v1` in order.
+template <typename T>
+void extend(std::vector<T> &v1, const std::vector<T> &v2) {
+ v1.reserve(v1.size() + v2.size());
+ v1.insert(v1.end(), v2.begin(), v2.end());
+}
+
+// Updates the batched statistics from an individual image's result. The final batched AP and COCO AP are
+// calculated from the updated `batched_stats`.
+void metrics_update(std::vector<mAPStats> &batched_stats, const std::vector<mAPStats> &img_stats) {
+ for (unsigned cat = 0; cat < yolo_meta.num_classes; cat++) {
+ batched_stats[cat].num_gt_object += img_stats[cat].num_gt_object;
+    // updates batched statistics; omits classes with no predictions.
+ if (!img_stats[cat].scores.size()) continue;
+ extend(batched_stats[cat].scores, img_stats[cat].scores);
+ extend(batched_stats[cat].true_positive, img_stats[cat].true_positive);
+ extend(batched_stats[cat].false_positive, img_stats[cat].false_positive);
+ }
+}
+
+// Calculates the AP averaged over classes using `interval`-point interpolation.
+double metrics_eval(const std::vector<mAPStats> &stats, unsigned interval) {
+ std::vector<double> class_aps;
+ for (unsigned category = 0; category < yolo_meta.num_classes; category++) {
+    // omits the class when there are no predictions.
+ if (!stats[category].scores.size()) continue;
+ // the predictions are false-positive when there is no groundtruth for this
+ // class, and therefore the class AP is 0.0
+ if (stats[category].num_gt_object == 0 && stats[category].scores.size()) {
+ class_aps.push_back(0.0);
+ continue;
+ }
+
+ int TP = 0, FP = 0;
+ std::vector<double> precision, recall;
+
+ // sorts the tp and fp based on the order of confidence score.
+ std::vector<unsigned> &&sorted_stats_index = argsort_gt(stats[category].scores);
+    // accumulates intermediate statistics.
+ for (unsigned idx : sorted_stats_index) {
+ TP += stats[category].true_positive[idx];
+ FP += stats[category].false_positive[idx];
+ precision.emplace_back(TP / (double)(TP + FP));
+ recall.emplace_back(TP / (double)stats[category].num_gt_object);
+ }
+    // computes the area under the P-R curve.
+ class_aps.emplace_back(average_precision(precision, recall, interval));
+ }
+ return std::accumulate(class_aps.begin(), class_aps.end(), 0.0) / (double)class_aps.size();
+}
+
+// Wrapper of the function `validate_yolo`. This function prepares data and dispatches metrics calculations for each
+// validation image, accumulates metrics results, and returns the batched mAP and COCO AP.
+std::pair<double, double> validate_yolo_wrapper(std::map<std::string, ov::TensorVector> &raw_results,
+ const std::vector<ov::Output<const ov::Node>> &result_layout,
+ std::vector<std::string> input_files) {
+ slog::info << "Start validating yolo." << slog::endl;
+ std::ofstream fout;
+ fout.open("ap_report.txt");
+ // preserves all correct paths to validation images.
+ int num_image = runtime_vars.niter * runtime_vars.batch_size;
+ std::vector<std::string> input_image_paths;
+ std::sort(std::begin(input_files), std::end(input_files));
+ // input_files is guaranteed not to be empty since that case is filtered out.
+ for (auto &path : input_files) {
+ if (path == "") break;
+ if (num_image == 0) break;
+ input_image_paths.push_back(path);
+ num_image--;
+ }
+
+  // checks that enough image files exist; this should always pass unless an image file is
+  // deleted right after the inference step.
+ if (num_image != 0) {
+ slog::err << "Not enough image input found. " << runtime_vars.batch_size * runtime_vars.niter << " required, "
+ << (runtime_vars.batch_size * runtime_vars.niter - num_image) << " provided." << slog::endl;
+ exit(EXIT_FAILURE);
+ }
+ // stores all annotation boxes for each image from groundtruth file.
+ // if an input image does not have a corresponding groundtruth file, an error occurs.
+ Tensor3d<double> raw_annotations;
+ int err = collect_validation_dataset(input_image_paths, raw_annotations, runtime_vars.source_image_sizes);
+ if (err) exit(EXIT_FAILURE);
+
+  // updates the metrics one image at a time to reduce memory overhead. the result for each image
+  // is accumulated in `batched_stats` and is used for the batched mAP and COCO AP calculation.
+ metrics batched_stats;
+ batched_stats.map.resize(yolo_meta.num_classes, mAPStats{});
+ batched_stats.coco.resize(yolo_meta.coco_metric.size(), std::vector<mAPStats>{});
+ std::for_each(batched_stats.coco.begin(), batched_stats.coco.end(), [&](std::vector<mAPStats> &stats) {
+ stats.resize(yolo_meta.num_classes, mAPStats{});
+ });
+
+ for (unsigned batch = 0; batch < runtime_vars.niter; batch++) {
+ for (unsigned img = 0; img < runtime_vars.batch_size; img++) {
+ // stores the flattened output tensors from the resulting convolution layers.
+ std::vector<double> curr_img_data;
+ for (auto &item : result_layout) {
+ const std::string &name = item.get_any_name();
+ auto curr_outputBlob = raw_results.at(name).at(batch);
+ auto output_tensor_start = curr_outputBlob.data<float>();
+ unsigned output_size = curr_outputBlob.get_size() / runtime_vars.batch_size;
+ unsigned offset = img * output_size;
+ for (unsigned idx = 0; idx < output_size; idx++) {
+ curr_img_data.push_back(output_tensor_start[idx + offset]);
+ }
+ }
+
+ struct metrics &&curr_img_stats =
+ validate_yolo(curr_img_data, raw_annotations, img + batch * runtime_vars.batch_size);
+ metrics_update(batched_stats.map, curr_img_stats.map);
+ for (unsigned thresh = 0; thresh < yolo_meta.coco_metric.size(); thresh++) {
+ metrics_update(batched_stats.coco[thresh], curr_img_stats.coco[thresh]);
+ }
+
+ double img_AP = metrics_eval(curr_img_stats.map, yolo_meta.ap_interval);
+ // fout << "image " << input_files[img] << " AP @ 0.5" << std::endl;
+ fout << std::fixed << std::setprecision(10) << img_AP << std::endl;
+ }
+ }
+
+ double map = metrics_eval(batched_stats.map, yolo_meta.ap_interval);
+ double coco_ap = 0.0;
+ for (auto &coco_stats : batched_stats.coco) {
+ coco_ap += metrics_eval(coco_stats, yolo_meta.coco_interval);
+ }
+ coco_ap /= (double)yolo_meta.coco_metric.size();
+
+ fout << "\nAP at IoU=.50: " << std::fixed << std::setprecision(6) << map * 100 << "%" << std::endl;
+ fout << "AP at IoU=.50:.05:.95: " << std::fixed << std::setprecision(10) << coco_ap * 100 << "%" << std::endl;
+ fout.close();
+
+ std::cout << "ap_report.txt has been generated in the current directory." << std::endl;
+
+ return std::make_pair(map, coco_ap);
+}
diff --git a/python/openvino/runtime/dla_benchmark/average_precision.hpp b/python/openvino/runtime/dla_benchmark/average_precision.hpp
new file mode 100644
index 0000000..821eaa1
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/average_precision.hpp
@@ -0,0 +1,156 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: This file defines the functions that calculate mAP and COCO AP metrics. See average_precision.cpp
+// for a detailed explanation.
+
+#ifndef DLA_BENCHMARK_OBJECT_DETECTION_H_
+#define DLA_BENCHMARK_OBJECT_DETECTION_H_
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include <utility>
+
+#include <inference_engine.hpp>
+
+#undef UNICODE
+
+// Indexes for raw bounding box.
+#define BBOX_X 0
+#define BBOX_Y 1
+#define BBOX_W 2
+#define BBOX_H 3
+#define BBOX_CONFIDENCE 4
+
+// Indices for input image shapes.
+#define IMG_W 0
+#define IMG_H 1
+
+// Indices for parsed bounding boxes.
+#define X_MAX 0
+#define X_MIN 1
+#define Y_MAX 2
+#define Y_MIN 3
+
+// Convenient aliases.
+template <typename T>
+using Box = std::vector<T>;
+
+template <typename T>
+using Tensor2d = std::vector<std::vector<T>>;
+
+template <typename T>
+using Tensor3d = std::vector<std::vector<std::vector<T>>>;
+
+using Blob_t = std::vector<InferenceEngine::BlobMap>;
+
+// A set of supported YOLO graphs and their variants.
+static std::set<std::string> supported_yolo_versions = {"yolo-v3-tf", "yolo-v3-tiny-tf"};
+
+// Each image will have a prediction entry containing coordinates,
+// class scores of prediction boxes, predicted class, and size.
+typedef struct prediction_entry_t {
+ std::vector<double> x_max;
+ std::vector<double> x_min;
+ std::vector<double> y_max;
+ std::vector<double> y_min;
+ // scores for highest class
+ std::vector<double> cls_score;
+ // class with highest probability
+ std::vector<int> cls;
+ unsigned size;
+
+ Box<double> box_at(unsigned idx) { return {x_max[idx], x_min[idx], y_max[idx], y_min[idx]}; }
+} PredictionEntry;
+
+// Each image will have an annotation entry containing coordinates and
+// the true label specified in `cls`.
+typedef struct annotation_entry_t {
+ std::vector<double> x_max;
+ std::vector<double> x_min;
+ std::vector<double> y_max;
+ std::vector<double> y_min;
+ std::vector<int> cls;
+ unsigned size;
+
+ Box<double> box_at(unsigned idx) { return {x_max[idx], x_min[idx], y_max[idx], y_min[idx]}; }
+} AnnotationEntry;
+
+// Stores runtime variables.
+static struct runtime_const_t {
+  // Actually means the number of validation images.
+ unsigned niter;
+ unsigned batch_size;
+ std::string name;
+ std::string groundtruth_loc;
+ std::string input_loc;
+ std::string report_folder;
+ const std::string gt_extension = "txt";
+
+ Tensor2d<std::string> input_image_path;
+ Tensor2d<double> source_image_sizes;
+} runtime_vars;
+
+// Stores constants for evaluation.
+static struct meta_t {
+  // Filtering parameters.
+ const double confidence_threshold = 0.001;
+ const double iou_threshold = 0.5;
+
+ // Parameters for parsing and resizing.
+ const unsigned num_classes = 80;
+ const unsigned channel = 255;
+ const unsigned box_per_channel = 3;
+ const unsigned pbox_size = 85;
+ const std::vector<double> dst_image_size = {416, 416};
+
+ // Dimensions of grid cells and anchor boxes.
+ const std::map<std::string, std::map<unsigned, std::vector<double>>> anchor_sizes{
+ {
+ "yolo-v3-tiny-tf",
+ {{13, {81, 82, 135, 169, 344, 319}}, {26, {23, 27, 37, 58, 81, 82}}},
+ },
+ {
+ "yolo-v3-tf",
+ {{13, {116, 90, 156, 198, 373, 326}}, {26, {30, 61, 62, 45, 59, 119}}, {52, {10, 13, 16, 30, 33, 23}}},
+ }};
+ const std::map<std::string, std::vector<unsigned>> grid_sizes = {
+ {"yolo-v3-tiny-tf", {26, 13}},
+ {"yolo-v3-tf", {13, 26, 52}},
+ };
+
+  // Boundary offset used in the IoU calculation.
+ const int boundary = 1;
+
+ // IoU threshold for metrics calculation.
+ const double strict_metric = 0.75;
+ const double pascal_voc_metric = 0.5;
+ const std::vector<double> coco_metric = {0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95};
+
+ // AP calculation
+ const unsigned ap_interval = 11;
+ const unsigned coco_interval = 101;
+} yolo_meta;
+
+// Returns `true` if the given YOLO graph, `name`, is supported. Else, `false` is returned.
+bool inline is_yolo_supported(std::string &name) {
+ return (supported_yolo_versions.find(name) != supported_yolo_versions.end());
+}
+
+// Sets runtime variables.
+void set_runtime(std::string name,
+ unsigned niter,
+ unsigned batch_size,
+ const std::string &input_loc,
+ const std::string &annotation_loc);
+
+// Entry point of this subroutine.
+std::pair<double, double> validate_yolo_wrapper(std::map<std::string, ov::TensorVector> &raw_results,
+ const std::vector<ov::Output<const ov::Node>> &result_layout,
+ std::vector<std::string> input_files);
+
+#endif  // DLA_BENCHMARK_OBJECT_DETECTION_H_
diff --git a/python/openvino/runtime/dla_benchmark/convert_annotations.py b/python/openvino/runtime/dla_benchmark/convert_annotations.py
new file mode 100755
index 0000000..0f3d9e6
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/convert_annotations.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# ============================================================================
+# This script takes two paths as input. The first path is to the annotation file
+# in json format. This annotation file is the validation data used in the 2017 COCO
+# competition for object detection, downloaded from https://cocodataset.org/
+# The second path indicates the folder where the user wants to store the converted
+# annotation files in plain text format. Each file in the destination folder contains
+# the true label and the bounding boxes for its corresponding validation image.
+# To use the average precision calculation in the dla_benchmark, you must
+# provide the text-formatted annotation files.
+# Note that 91 classes are used in the MS COCO paper https://arxiv.org/pdf/1405.0312.pdf,
+# whereas 80 are used in the 2014/2017 validation datasets.
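+#
+# Example invocation (a sketch; the paths below are placeholders):
+#   python3 convert_annotations.py instances_val2017.json ./annotations_txt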
+# ============================================================================
+
+import json
+import sys
+
+
+def cat80(cat: int) -> int:
+ '''
+    The validation dataset omits 11 classes, which causes mismatches with
+    the predicted classes in the benchmark app. This function maps the
+    class ids from the json annotation file to those used in the dla_benchmark.
+ '''
+ diff = 1
+ if cat > 11:
+ diff += 1
+ if cat > 25:
+ diff += 1
+ if cat > 28:
+ diff += 2
+ if cat > 44:
+ diff += 1
+ if cat > 65:
+ diff += 1
+ if cat > 67:
+ diff += 2
+ if cat > 70:
+ diff += 1
+ if cat > 82:
+ diff += 1
+ if cat > 90:
+ diff += 1
+ return cat - diff
+
+
+def parse_annotation_file(path_to_annotation: str, destination_folder: str) -> None:
+ fin = open(path_to_annotation)
+ json_data = json.load(fin)
+ per_image_data = dict()
+
+ # Gets all bounding boxes and labels w.r.t. each validation image.
+ for annotation in json_data["annotations"]:
+ image_id = annotation["image_id"]
+ bbox_data = [str(cat80(annotation["category_id"]))] + list(map(str, annotation["bbox"]))
+ if image_id in per_image_data:
+ per_image_data[image_id].append(bbox_data)
+ else:
+ per_image_data[image_id] = [bbox_data]
+ fin.close()
+
+ # Creates and writes to text files.
+ for image_meta in json_data["images"]:
+ file_path = rf'{destination_folder}/{image_meta["file_name"][:-4]}.txt'
+ if image_meta["id"] in per_image_data:
+ bboxes = per_image_data[image_meta["id"]]
+ else:
+ bboxes = []
+ with open(file_path, "w") as fout:
+ fout.write("\n".join([" ".join(bbox) for bbox in bboxes]))
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 3:
+ sys.exit(
+ ("Usage: {0} "
+ "<path to the validation file in json format> "
+ "<path to the folder to store the annotation text files> "
+ )
+ .format(sys.argv[0]))
+
+ json_instances = sys.argv[1]
+ destination = sys.argv[2]
+
+ try:
+ parse_annotation_file(json_instances, destination)
+ except Exception as err:
+ print(err)
+ else:
+ print("Finished.")
diff --git a/python/openvino/runtime/dla_benchmark/dla_benchmark.hpp b/python/openvino/runtime/dla_benchmark/dla_benchmark.hpp
new file mode 100644
index 0000000..8d3eb80
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/dla_benchmark.hpp
@@ -0,0 +1,495 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <gflags/gflags.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief message for images argument
+static const char input_message[] =
+ "Optional. Path to a folder with images and/or binaries or to specific image or binary file.";
+
+/// @brief message for model argument
+static const char model_message[] =
+ "Required unless running the ahead-of-time flow using -cm. Path to an .xml file with a trained model";
+
+static const char network_file_alias_message[] = "Required unless -m or -cm is present. Alias for -m";
+
+/// @brief message for compiled model argument
+static const char compiled_model_message[] = "Optional. Path to a .bin file with a trained compiled model";
+
+/// @brief message for execution mode
+static const char api_message[] = "Optional. Enable Sync/Async API. Default value is \"async\".";
+
+/// @brief message for compile/inference device type.
+static const char target_device_message[] =
+ "Optional. Specify a target device to infer on Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. ";
+
+/// @brief message for iterations count
+/** static const char iterations_count_message[] = "Optional. Number of iterations. " \
+"If not specified, the number of iterations is calculated depending on a device."; **/
+static const char iterations_count_message[] = "Required. Number of iterations.";
+
+/// @brief message for requests count
+static const char infer_requests_count_message[] =
+ "Optional. Number of infer requests. Default value is determined automatically for device.";
+
+/// @brief message for #threads for CPU inference
+static const char infer_num_threads_message[] =
+ "Optional. Number of threads to use for inference on the CPU "
+ "(including HETERO).";
+
+/// @brief message for #streams for CPU inference
+static const char infer_num_streams_message[] =
+ "Optional. Number of streams to use for inference on the CPU in throughput mode "
+ "(for HETERO device cases use format <dev1>:<nstreams1>,<dev2>:<nstreams2> or just <nstreams>). "
+ "Default value is determined automatically for a device. Please note that although the automatic selection "
+ "usually provides a reasonable performance, it still may be non - optimal for some cases, especially for "
+ "very small networks. See sample's README for more details.";
+
+/// @brief message for user library argument
+static const char custom_cpu_library_message[] =
+ "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.";
+
+static const char batch_size_message[] =
+ "Optional. Batch size value. If not specified, the batch size value is determined from Intermediate "
+ "Representation.";
+
+static const char batch_size_alias_message[] = "Optional. Alias for -b.";
+
+static const char min_subgraph_layers_message[] =
+ "Optional. Minimum number of layers allowed in a subgraph that runs on FPGA. Subgraph with fewer"
+ " layers than this value will run on CPU in Hetero plugin. Must be >= 1";
+
+/// @brief message for CPU threads pinning option
+static const char infer_threads_pinning_message[] =
+ "Optional. Enable threads->cores (\"YES\", default), threads->(NUMA)nodes (\"NUMA\") "
+ "or completely disable (\"NO\") "
+ "CPU threads pinning for CPU-involved inference.";
+
+/// @brief message for stream_output option
+static const char stream_output_message[] =
+    "Optional. Print progress as plain text. When specified, an interactive progress bar is replaced with a "
+ "multiline output.";
+
+/// @brief message for the save_run_summary option
+static const char save_run_summary_message[] =
+ "Optional. Enable saving a summary of the run containing the "
+ "specified command line parameters and a copy of the performance report "
+ "printed to stdout.";
+
+/// @brief message for report_folder option
+static const char report_folder_message[] = "Optional. Path to a folder where statistics report is stored.";
+
+/// @brief message for progress bar option
+static const char progress_message[] =
+    "Optional. Show progress bar (can affect performance measurement). Default value is \"false\".";
+
+/// @brief message for the custom plugins.xml file option
+static const char plugins_message[] = "Optional. Select a custom plugins_xml file to use. "
+ "-plugins=emulation to use xml file for software emulation";
+
+/// @brief message for the custom plugins_xml_file.xml file option
+static const char old_plugins_message[] =
+ "***DEPRECATED OPTION*** Please use NEW -plugins option to specify which custom plugins xml file to use";
+
+/// @brief message for ground truth file
+static const char groundtruth_loc_message[] =
+ "Optional. Select a ground truth file to use for calculating top 1 top 5 results.";
+
+/// @brief message for architecture .arch file
+static const char arch_file_message[] = "Optional. Provide a path for the architecture .arch file.";
+
+/// @brief message for --arch flag.
+static const char arch_alias_message[] = "Optional. Alias for -arch_file.";
+
+/// @brief message performance estimation
+static const char perf_est_message[] = "Optional. Perform performance estimation.";
+
+/// @brief message folding_option flag
+static const char folding_option_message[] = "Optional. Set the folding options for dla compiler: options 0-3.";
+
+/// @brief message fold_preprocessing flag
+static const char fold_preprocessing_message[] = "Optional. Enable fold preprocessing option for dla compiler.";
+
+/// @brief message bgr flag
+static const char bgr_message[] = "Optional. Indicate images are in bgr format.";
+
+/// @brief message dump_output flag
+static const char dump_output_message[] = "Optional. Dumps output of graph to result.txt and result.bin file(s).";
+
+/// @brief message for output_dir option
+static const char output_dir_message[] = "Optional. Path to a folder where result files are dumped to.";
+
+/// @brief message encryption_key flag
+static const char encryption_key_message[] =
+    "Optional. Encryption key (using hexadecimal characters, 16 bytes - 32 hexadecimal chars).";
+
+/// @brief message encryption_iv flag
+static const char encryption_iv_message[] =
+    "Optional. Initialization vector for encryption (8 bytes - 16 hexadecimal chars).";
+
+/// @brief message debug network flag
+static const char debug_network_message[] = "Optional. Dump the contents from the debug network.";
+
+/// @brief message emulator_decryption flag
+static const char emulator_decryption_message[] =
+    "Optional. Set to true to enable decryption using the emulator. Disable encryption in the import.";
+
+/// @brief message hidden_help flag
+static const char hidden_help_message[] = "Print help options that are experimental or for internal use.";
+
+/// @brief message estimate_per_layer_latencies flag
+static const char estimate_per_layer_latencies_message[] =
+    "Optional. Estimates the number of cycles each layer will consume during execution, based on the internal model "
+    "that the Performance Estimator uses to estimate throughput. For internal use only.";
+
+/// @brief message average_precision flag
+static const char enable_object_detection_ap_message[] =
+ "Optional. Set to true to show average precision and COCO average precision for YOLO graphs in the report.";
+
+/// @brief message yolo_version flag
+static const char yolo_version_message[] = "Optional. The version of the YOLO graph. Required for average precision report.";
+
+/// @brief message binary flag
+static const char bin_data_message[] =
+    "Optional. Specify that the input should be read as binary data (otherwise, if the input tensor has depth 1 or 3, "
+    "it will default to U8 image processing).";
+
+/// @brief message pc flag
+static const char pc_message[] = "Optional. Report performance counters for the CPU subgraphs, if there is any.";
+
+/// @brief message pcsort flag
+static const char pcsort_message[] =
+    "Optional. Report performance counters for the CPU subgraphs and analyze/sort op hotspots. "
+    "sort: analyze op time cost and print in hotspot order; "
+    "no_sort: analyze op time cost and print in normal order; "
+    "simple_sort: analyze op time cost and print only EXECUTED ops, in normal order.";
+
+/// @brief message scale flag
+static constexpr char input_image_scale_message[] =
+ "Optional. Scale factors for each channel in [R, G, B] format. "
+ "Applies normalization as (x - mean) / scale. "
+ "Example: -scale_values input[1, 1, 1]. Not performed on FPGA.";
+
+/// @brief message mean flag
+static constexpr char input_image_mean_message[] =
+ "Optional. Per-channel mean subtraction values in [R, G, B] format. "
+ "Used for model input normalization as (x - mean) / scale. "
+ "Example: -mean_values input[255,255,255]. Not performed on FPGA.";
+
+/// @brief message resize flag
+static const char input_image_resize_message[] =
+    "Optional. Image resizing mode used when the input image dimensions do not match the model. "
+    "'resize': resize the image to the model input size. "
+    "'pad_resize': pad the image with zeros and resize to the model input size.";
+
+/// @brief message enable early-access features flag
+static const char enable_early_access_message[] =
+ "Optional. Enables early access (EA) features of FPGA AI Suite. These are features that are actively being "
+ "developed and have not yet met production quality standards. These features may have flaws. "
+ "Consult the FPGA AI Suite documentation for details.";
+
+/// @brief message report LSU memory access count
+static const char report_lsu_counters_message[] =
+ "Optional. Report the number of memory accesses made by the "
+ "input feature reader, output feature writer, and filter reader "
+ "of each CoreDLA instance since device initialization. No report from the counters by default.";
+
+/// @brief message for verbose flag
+static const char verbose_message[] = "Optional. If true, DLA Benchmark outputs detailed logs.";
+
+/// @brief message for maximum file size flag
+static const char output_output_file_size_message[] =
+    "Optional. Maximum file size in MB that can be dumped to a .txt file. Used to avoid creating files that cannot be opened.";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Declare flag for showing help message <br>
+DECLARE_bool(help);
+
+/// @brief Define parameter for set image file <br>
+/// i or mif is a required parameter
+DEFINE_string(i, "", input_message);
+
+/// @brief Define parameter for set model file <br>
+/// It is a required parameter
+DEFINE_string(m, "", model_message);
+
+/// @brief Alias for -m
+DEFINE_string(network_file, "", network_file_alias_message);
+
+/// @brief Define parameter for compiled model file <br>
+/// It is not a required parameter
+DEFINE_string(cm, "", compiled_model_message);
+
+/// @brief Define execution mode
+DEFINE_string(api, "async", api_message);
+
+/// @brief device the target device to infer on <br>
+DEFINE_string(d, "", target_device_message);
+
+/// @brief Absolute path to CPU library with user layers <br>
+/// It is a required parameter
+DEFINE_string(l, "", custom_cpu_library_message);
+
+/// @brief Iterations count (default 0)
+/// Sync mode: iterations count
+/// Async mode: StartAsync counts
+DEFINE_int32(niter, 0, iterations_count_message);
+
+/// @brief Number of infer requests in parallel
+DEFINE_int32(nireq, 0, infer_requests_count_message);
+
+/// @brief Number of threads to use for inference on the CPU in throughput mode (also affects Hetero cases)
+DEFINE_int32(nthreads, 0, infer_num_threads_message);
+
+/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
+DEFINE_string(nstreams, "", infer_num_streams_message);
+
+/// @brief Define parameter for batch size <br>
+/// Default is 1
+DEFINE_int32(b, 1, batch_size_message);
+
+/// @brief alias for -b
+DEFINE_int32(batch_size, 1, batch_size_alias_message);
+
+/// @brief Minimum number of layers allowed in a subgraph that runs on FPGA
+DEFINE_int32(min_subgraph_layers, 2, min_subgraph_layers_message);
+
+/// @brief CPU threads pinning option ("YES" by default)
+DEFINE_string(pin, "YES", infer_threads_pinning_message);
+
+/// @brief Enables multiline text output instead of progress bar
+DEFINE_bool(stream_output, false, stream_output_message);
+
+/// @brief Enables saving a summary of the run
+DEFINE_bool(save_run_summary, false, save_run_summary_message);
+
+/// @brief Path to a folder where statistics report is stored
+DEFINE_string(report_folder, "", report_folder_message);
+
+/// @brief Define flag for showing progress bar <br>
+DEFINE_bool(progress, false, progress_message);
+
+/// @brief Path to a plugins_xml file
+DEFINE_string(plugins, "", plugins_message);
+
+/// @brief Deprecated argument for path to a plugins_xml file
+DEFINE_string(plugins_xml_file, "", old_plugins_message);
+
+/// @brief Path to a groundtruth file
+DEFINE_string(groundtruth_loc, "", groundtruth_loc_message);
+
+/// @brief Path to arch file
+DEFINE_string(arch_file, "", arch_file_message);
+
+/// @brief Path to arch file, same as arch_file
+DEFINE_string(arch, "", arch_alias_message);
+
+/// @brief Define flag for enable performance estimation
+DEFINE_bool(perf_est, false, perf_est_message);
+
+/// @brief Define flag whether the image is in bgr format
+DEFINE_bool(bgr, false, bgr_message);
+
+/// @brief Define flag for enable output results dumping
+DEFINE_bool(dump_output, false, dump_output_message);
+
+/// @brief Define flag for output directory where result files are dumped to
+DEFINE_string(output_dir, "", output_dir_message);
+
+/// Select folding options; 0,1,2,3
+DEFINE_int32(folding_option, 1, folding_option_message);
+
+/// @brief Define flag for enabling folding preprocessing
+DEFINE_bool(fold_preprocessing, false, fold_preprocessing_message);
+
+/// @brief encryption key
+DEFINE_string(encryption_key, "", encryption_key_message);
+
+/// @brief initialization vector
+DEFINE_string(encryption_iv, "", encryption_iv_message);
+
+/// @brief Define flag for enabling dump of debug network values
+DEFINE_bool(debug_network, false, debug_network_message);
+
+/// @brief Enable decryption using the emulator
+DEFINE_bool(emulator_decryption, false, emulator_decryption_message);
+
+/// @brief Flag for printing the hidden help message
+DEFINE_bool(hidden_help, false, hidden_help_message);
+
+/// @brief Whether Performance Estimator should calculate theoretical per-layer cycle counts. Internal use only. Must be
+/// called with -perf_est.
+DEFINE_bool(estimate_per_layer_latencies, false, estimate_per_layer_latencies_message);
+
+/// @brief Show average precision in the report
+DEFINE_bool(enable_object_detection_ap, false, enable_object_detection_ap_message);
+
+/// @brief Let user specify the version of their YOLO graph.
+DEFINE_string(yolo_version, "", yolo_version_message);
+
+/// @brief Specify that the inputs should be read as binary.
+DEFINE_bool(bin_data, false, bin_data_message);
+
+/// @brief Report performance counters for the CPU subgraphs.
+DEFINE_bool(pc, false, pc_message);
+
+/// @brief Report performance counters for the CPU subgraphs and analysis sort hotpoint opts.
+DEFINE_string(pcsort, "", pcsort_message);
+
+/// @brief Define flag for using input image scale <br>
+DEFINE_string(scale_values, "", input_image_scale_message);
+
+/// @brief Define flag for using input image mean <br>
+DEFINE_string(mean_values, "", input_image_mean_message);
+
+/// @brief Define flag for using input image resize <br>
+DEFINE_string(resize_type, "", input_image_resize_message);
+
+/// @brief Enables early-access (EA) features of CoreDLA <br>
+DEFINE_bool(enable_early_access, false, enable_early_access_message);
+
+/// @brief Pass the name of the streaming input linux FIFO for use in the emulator model
+DEFINE_string(streaming_input_pipe, "", "");
+
+/// @brief Report the input feature reader, output feature writer, and filter reader memory access counts
+DEFINE_bool(report_lsu_counters, false, report_lsu_counters_message);
+
+/// @brief define flag dla benchmark verbosity
+DEFINE_bool(verbose, false, verbose_message);
+
+/// @brief maximum file size in MB that can be dumped to a txt. Used to avoid creating files that cannot be opened.
+DEFINE_int32(max_output_file_size, 200, output_output_file_size_message);
+
+/**
+ * @brief Options that impact graph compilation.
+ * Please make sure your help text aligns with the other options in the command line.
+ */
+static void ShowCompileOptions() {
+ std::cout << std::endl << "Graph Compile Options:" << std::endl;
+ std::cout << " -folding_option " << folding_option_message << std::endl;
+ std::cout << " -fold_preprocessing " << fold_preprocessing_message << std::endl;
+  std::cout << "    -min_subgraph_layers \"<integer>\"       " << min_subgraph_layers_message << std::endl;
+}
+
+/**
+ * @brief Options that evaluate the correctness of the inference result.
+ * Please make sure your help text aligns with the other options in the command line.
+ */
+static void ShowAccuracyOptions() {
+ std::cout << std::endl << "Accuracy Options:" << std::endl;
+ std::cout << " -dump_output " << dump_output_message << std::endl;
+ std::cout << " -groundtruth_loc " << groundtruth_loc_message << std::endl;
+ std::cout << " -enable_object_detection_ap " << enable_object_detection_ap_message << std::endl;
+ std::cout << " -yolo_version \"yolo-v3-tf/yolo-v3-tiny-tf\" " << yolo_version_message << std::endl;
+}
+
+/**
+ * @brief Shows options for statistics dumping and report dumping.
+ * Please make sure your help text aligns with the other options in the command line.
+ */
+static void ShowStatsOrReportDumpingOptions() {
+ std::cout << std::endl << "Statistics dumping options:" << std::endl;
+ std::cout << " -perf_est " << perf_est_message << std::endl;
+ std::cout << " -progress " << progress_message << std::endl;
+ std::cout << " -stream_output " << stream_output_message << std::endl;
+ std::cout << " -save_run_summary " << save_run_summary_message << std::endl;
+ std::cout << " -report_folder " << report_folder_message << std::endl;
+}
+
+/**
+ * @brief Shows preprocessing options for input data
+ * Please make sure your help text aligns with the other options in the command line.
+ */
+static void ShowPreprocessingOptions() {
+ std::cout << std::endl << "Preprocessing Options:" << std::endl;
+ std::cout << " -bgr " << bgr_message << std::endl;
+ std::cout << " -resize_type \"resize/pad_resize\" " << input_image_resize_message << std::endl;
+ std::cout << " -scale_values " << input_image_scale_message << std::endl;
+ std::cout << " -mean_values " << input_image_mean_message << std::endl;
+}
+
+/**
+ * @brief Shows help options for inference on the FPGA or any OpenVINO device.
+ * Please make sure your help text aligns with the other options in the command line.
+ */
+static void ShowInferenceOptions() {
+ std::cout << std::endl << "Inference Options:" << std::endl;
+ std::cout << " -api \"<sync/async>\" " << api_message << std::endl;
+ std::cout << " -niter \"<integer>\" " << iterations_count_message << std::endl;
+ std::cout << " -nireq \"<integer>\" " << infer_requests_count_message << std::endl;
+ std::cout << " -b \"<integer>\" " << batch_size_message << std::endl;
+  std::cout << "    -batch_size \"<integer>\"            " << batch_size_alias_message << std::endl;
+}
+
+/**
+ * @brief Shows help options for OpenVINO devices (CPU, GPU)
+ * Please make sure your help text aligns with the other options in the command line.
+ */
+static void ShowOpenVINODeviceOptions() {
+ std::cout << std::endl << "CPU or GPU options:" << std::endl;
+ std::cout << " -nstreams \"<integer>\" " << infer_num_streams_message << std::endl;
+ std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
+ std::cout << " -pin \"YES/NO\" " << infer_threads_pinning_message << std::endl;
+ std::cout << " -l \"<absolute_path>\" " << custom_cpu_library_message << std::endl;
+ std::cout << " -pc " << pc_message << std::endl;
+ std::cout << " -pcsort \"sort/no_sort/simple_sort\" " << pcsort_message << std::endl;
+}
+
+/**
+ * @brief This function prints a help message outlining options that are hidden from the user.
+ * Options listed here should be experimental or for internal use.
+ * Please make sure your help text aligns with the other options in the command line.
+ */
+static void PrintHiddenHelp() {
+ std::cout << std::endl << "Hidden Options. Experimental, early access or internal options." << std::endl;
+ std::cout << " -enable_early_access " << enable_early_access_message << std::endl;
+ std::cout << " -estimate_per_layer_latencies " << estimate_per_layer_latencies_message << std::endl;
+ std::cout << " -debug_network " << debug_network_message << std::endl;
+ std::cout << " -max_output_file_size " << output_output_file_size_message << std::endl;
+}
+
+/**
+ * @brief This function shows a help message. Add your new option in the appropriate section.
+ * Please make sure your help text aligns with the other options in the command line.
+ */
+static void ShowUsage() {
+ std::cout << std::endl;
+ std::cout << "dla_benchmark [OPTION]" << std::endl;
+ std::cout << "Options:" << std::endl;
+ std::cout << std::endl;
+ std::cout << " -h, --help " << help_message << std::endl;
+ std::cout << " -m \"<path>\" " << model_message << std::endl;
+  std::cout << "    -network_file \"<path>\"           " << network_file_alias_message << std::endl;
+ std::cout << " -cm \"<path>\" " << compiled_model_message << std::endl;
+ std::cout << " -d \"<device>\" " << target_device_message << std::endl;
+ std::cout << " -plugins " << plugins_message << std::endl;
+ std::cout << " -plugins_xml_file " << old_plugins_message << std::endl;
+ std::cout << " -arch_file " << arch_file_message << std::endl;
+ std::cout << " -arch " << arch_alias_message << std::endl;
+ std::cout << " -i \"<path>\" " << input_message << std::endl;
+ std::cout << " -bin_data " << bin_data_message << std::endl;
+ std::cout << " -output_dir " << output_dir_message << std::endl;
+ std::cout << " -encryption_key " << encryption_key_message << std::endl;
+ std::cout << " -encryption_iv " << encryption_iv_message << std::endl;
+ std::cout << " -emulator_decryption " << emulator_decryption_message << std::endl;
+ std::cout << " -verbose " << verbose_message << std::endl;
+ std::cout << " -hidden_help " << hidden_help_message << std::endl;
+ ShowInferenceOptions();
+ ShowCompileOptions();
+ ShowPreprocessingOptions();
+ ShowAccuracyOptions();
+ ShowStatsOrReportDumpingOptions();
+ ShowOpenVINODeviceOptions();
+}
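+
+// Illustrative invocation sketch (hypothetical paths, architecture file, and device list;
+// consult the FPGA AI Suite documentation for the exact options supported on your system):
+//
+//   ./dla_benchmark -m resnet50.xml \
+//                   -arch_file A10_Performance.arch \
+//                   -d HETERO:FPGA,CPU \
+//                   -i ./images \
+//                   -niter 128 -b 1 -api async \
+//                   -report_folder ./reports -save_run_summary
+//
+// Flags not listed above keep the defaults defined earlier in this header.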
+
diff --git a/python/openvino/runtime/dla_benchmark/infer_request_wrap.hpp b/python/openvino/runtime/dla_benchmark/infer_request_wrap.hpp
new file mode 100644
index 0000000..9ddc3dd
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/infer_request_wrap.hpp
@@ -0,0 +1,168 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Wrappers for single inference requests and queues of inference requests.
+// Largely based on OpenVINO's benchmark_app/infer_request_wrap.hpp
+// [openvinotoolkit/openvino › samples/cpp/benchmark_app/infer_request_wrap.hpp]
+// Note: Not all functions of ov::InferRequest are wrapped. More functions can be added.
+
+#pragma once
+
+#include <condition_variable>
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <vector>
+#include <algorithm>
+
+#include <openvino/openvino.hpp>
+#include "statistics_report.hpp"
+#include "utils.hpp"
+
+typedef std::function<void(size_t id, const double latency, const std::exception_ptr& ptr)> QueueCallbackFunction;
+
+// Wrapper class for ov::InferRequest. Handles asynchronous callbacks.
+class InferReqWrap final {
+ public:
+ using Ptr = std::shared_ptr<InferReqWrap>;
+
+ ~InferReqWrap() = default;
+
+ explicit InferReqWrap(ov::CompiledModel& model, size_t id, QueueCallbackFunction callbackQueue)
+ : _request(model.create_infer_request()), _id(id), _callbackQueue(callbackQueue) {
+ _request.set_callback([&](const std::exception_ptr& ptr) {
+ _endTime = Time::now();
+ _callbackQueue(_id, get_execution_time_in_milliseconds(), ptr);
+ });
+ }
+
+ void start_async() {
+ _startTime = Time::now();
+ _request.start_async();
+ }
+
+ void wait() { _request.wait(); }
+
+ void infer() {
+ _startTime = Time::now();
+ _request.infer();
+ _endTime = Time::now();
+ _callbackQueue(_id, get_execution_time_in_milliseconds(), nullptr);
+ }
+
+ std::vector<ov::ProfilingInfo> get_performance_counts() { return _request.get_profiling_info(); }
+
+ ov::Tensor get_tensor(const std::string& name) { return _request.get_tensor(name); }
+
+ double get_execution_time_in_milliseconds() const {
+ auto execTime = std::chrono::duration_cast<ns>(_endTime - _startTime);
+ return static_cast<double>(execTime.count()) * 0.000001;
+ }
+
+ void set_tensor(const std::string& name, const ov::Tensor& data) { _request.set_tensor(name, data); }
+
+ void set_tensor(const ov::Output<const ov::Node>& port, const ov::Tensor& data) { _request.set_tensor(port, data); }
+
+ ov::Tensor get_output_tensor() { return _request.get_output_tensor(); }
+
+ private:
+ ov::InferRequest _request;
+ Time::time_point _startTime;
+ Time::time_point _endTime;
+ size_t _id;
+ QueueCallbackFunction _callbackQueue;
+};
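+
+// Minimal synchronous usage sketch (illustrative only; `compiled_model`, `input_tensor`,
+// and the tensor name "input" are assumptions, not defined in this file):
+//
+//   InferReqWrap::Ptr req = std::make_shared<InferReqWrap>(
+//       compiled_model, /*id=*/0,
+//       [](size_t, double, const std::exception_ptr&) { /* no-op callback */ });
+//   req->set_tensor("input", input_tensor);
+//   req->infer();  // blocking; records start/end time and invokes the callback
+//   double latency_ms = req->get_execution_time_in_milliseconds();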
+
+// Handles a queue of inference requests.
+class InferRequestsQueue final {
+ public:
+ InferRequestsQueue(ov::CompiledModel& model, size_t nireq) {
+ for (size_t id = 0; id < nireq; id++) {
+ requests.push_back(std::make_shared<InferReqWrap>(model,
+ id,
+ std::bind(&InferRequestsQueue::put_idle_request,
+ this,
+ std::placeholders::_1,
+ std::placeholders::_2,
+ std::placeholders::_3)));
+ _idleIds.push(id);
+ }
+ reset_times();
+ }
+
+ ~InferRequestsQueue() {
+    // An InferRequest waits for all of its asynchronous internal tasks in its destructor,
+    // so it must be released before any context that those tasks can still use.
+    // Otherwise, all other members of InferRequestsQueue would be destroyed before the
+    // `requests` vector, and a request callback could still reach those members through
+    // put_idle_request(). To avoid this, either declare the vector after all other members
+    // or, as done here, clear it manually in the destructor.
+ requests.clear();
+ }
+
+ void reset_times() {
+ _startTime = Time::time_point::max();
+ _endTime = Time::time_point::min();
+ _latencies.clear();
+ }
+
+ double get_durations_in_milliseconds() {
+ return std::chrono::duration_cast<ns>(_endTime - _startTime).count() * 0.000001;
+ }
+
+ void put_idle_request(size_t id, const double latency, const std::exception_ptr& ptr = nullptr) {
+ std::unique_lock<std::mutex> lock(_mutex);
+ if (ptr) {
+ inferenceException = ptr;
+ } else {
+ _latencies.push_back(latency);
+ _idleIds.push(id);
+ _endTime = std::max(Time::now(), _endTime);
+ }
+ _cv.notify_one();
+ }
+
+ InferReqWrap::Ptr get_idle_request() {
+ std::unique_lock<std::mutex> lock(_mutex);
+ _cv.wait(lock, [this] {
+ if (inferenceException) {
+ std::rethrow_exception(inferenceException);
+ }
+ return _idleIds.size() > 0;
+ });
+ auto request = requests.at(_idleIds.front());
+ _idleIds.pop();
+ _startTime = std::min(Time::now(), _startTime);
+ return request;
+ }
+
+ void wait_all() {
+ std::unique_lock<std::mutex> lock(_mutex);
+ _cv.wait(lock, [this] {
+ if (inferenceException) {
+ std::rethrow_exception(inferenceException);
+ }
+ return _idleIds.size() == requests.size();
+ });
+ }
+
+ std::vector<double>& get_latencies() { return _latencies; }
+
+ Time::time_point get_start_time() { return _startTime; }
+
+ Time::time_point get_end_time() { return _endTime; }
+
+ std::vector<InferReqWrap::Ptr> requests;
+
+ private:
+ std::queue<size_t> _idleIds;
+ std::mutex _mutex;
+ std::condition_variable _cv;
+ Time::time_point _startTime;
+ Time::time_point _endTime;
+ std::vector<double> _latencies;
+ std::exception_ptr inferenceException = nullptr;
+};
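+
+// Asynchronous benchmarking loop sketch (illustrative only; `compiled_model`, `nireq`,
+// and `niter` stand in for the parsed command-line flags):
+//
+//   InferRequestsQueue queue(compiled_model, nireq);
+//   for (size_t iter = 0; iter < niter; ++iter) {
+//     auto req = queue.get_idle_request();  // blocks until a request becomes idle
+//     req->start_async();                   // completion re-queues it via put_idle_request()
+//   }
+//   queue.wait_all();                       // wait for all in-flight requests to finish
+//   double total_ms = queue.get_durations_in_milliseconds();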
diff --git a/python/openvino/runtime/dla_benchmark/inputs_filling.cpp b/python/openvino/runtime/dla_benchmark/inputs_filling.cpp
new file mode 100644
index 0000000..0d20a14
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/inputs_filling.cpp
@@ -0,0 +1,885 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: This file implements all supported formats of filling input tensors with input data.
+// Functions in this file have been based on/modified from OpenVINO's input filling algorithms,
+// which would be a good place to start for future OpenVINO uplifts.
+// Ref: [openvinotoolkit/openvino › samples/cpp/benchmark_app/input_fillings.cpp]
+
+#include "inputs_filling.hpp"
+
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <functional>
+#include <limits>
+#include <tuple>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <opencv2/videoio.hpp>
+#include <samples/ocv_common.hpp>
+#include <samples/slog.hpp>
+#include "format_reader_ptr.h"
+#include "shared_tensor_allocator.hpp"
+#include "utils.hpp"
+
+/**
+ * @brief Struct to store info of an image read by the FormatReader::Reader class
+*/
+struct ReaderInfo {
+ std::shared_ptr<uint8_t> data; // Image data
+ const size_t file_index; // Index of the image in the file_paths vector
+ const size_t channels; // Number of channels used by the reader to store the image
+
+ ReaderInfo(std::shared_ptr<uint8_t>& data, size_t file_index, size_t channels)
+ : data(data), file_index(file_index), channels(channels) {}
+};
+
+// The reader always expands the image being read into an RGB image, so the only
+// way to tell that an image is in fact RGB and not grayscale is to check whether
+// the values in channel 0 differ from those in channel 1 or 2.
+// Returns true if this is a grayscale image, or an RGB image that can safely be
+// treated as grayscale because all channel values are the same.
+static bool IsGrayScaleImage(const ReaderInfo& reader_info, uint32_t image_size) {
+ const auto num_channels = reader_info.channels;
+ const auto& image_data = reader_info.data;
+ // Iterate through the image surface
+ for (size_t pid = 0; pid < image_size; pid++) {
+ // Iterate through the image channels
+ for (size_t ch = 1; ch < num_channels; ++ch) {
+ if (image_data.get()[pid * num_channels + ch] != image_data.get()[pid * num_channels]) return false;
+ }
+ }
+ return true;
+}
+
+template <typename T>
+using uniformDistribution = typename std::conditional<
+ std::is_floating_point<T>::value,
+ std::uniform_real_distribution<T>,
+ typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
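+
+// For reference, the alias above resolves as follows (illustrative only):
+//   uniformDistribution<float>   -> std::uniform_real_distribution<float>
+//   uniformDistribution<int32_t> -> std::uniform_int_distribution<int32_t>
+// and to void for types that are neither floating point nor integral.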
+
+/**
+ * @brief Fills a tensor with image data from input files
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ * Determines which image to use based on input_id, batch_size, input_size, and request_id.
+ * Reads that data as uint8 and creates an input tensor of type T corresponding to input element type.
+ *
+ * @param files vector of file paths to the input images
+ * @param input_id image input id, ie image 1, image 2...
+ * @param batch_size batch size of the tensor
+ * @param input_size number of images to be used
+ * @param request_id infer request id
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param input_name name of the input
+ * @param bgr boolean indicating if input channels need to be reversed
+ * @param verbose prints extra logging information if true
+ * @return ov::Tensor containing the input data extracted from the image
+*/
+template <typename T>
+ov::Tensor CreateTensorFromImage(const std::vector<std::string>& files,
+ const size_t input_id,
+ const size_t batch_size,
+ const size_t input_size,
+ const size_t request_id,
+ const dla_benchmark::InputInfo& input_info,
+ const std::string& input_name,
+ const FormatReader::Reader::ResizeType resize_type,
+ const bool bgr = false,
+ const bool verbose = false) {
+ size_t tensor_size =
+ std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+ auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+ auto data = reinterpret_cast<T*>(allocator->get_buffer());
+ /** Collect images data ptrs **/
+ std::vector<ReaderInfo> vreader;
+ vreader.reserve(batch_size);
+
+ size_t img_batch_size = 1;
+ if (!input_info.layout.empty() && ov::layout::has_batch(input_info.layout)) {
+ img_batch_size = batch_size;
+ } else {
+ slog::warn << input_name << ": layout does not contain batch dimension. Assuming batch 1 for this input"
+ << slog::endl;
+ }
+
+ for (size_t i = 0, input_idx = request_id * batch_size * input_size + input_id; i < img_batch_size; i++, input_idx += input_size) {
+ input_idx %= files.size();
+ FormatReader::ReaderPtr reader(files[input_idx].c_str());
+ if (input_idx <= MAX_COUT_WITHOUT_VERBOSE || verbose) {
+ slog::info << "Prepare image " << files[input_idx] << slog::endl;
+ if (!verbose && input_idx == MAX_COUT_WITHOUT_VERBOSE) {
+ slog::info << "Truncating list of input files. Run with --verbose for complete list." << slog::endl;
+ }
+ }
+ if (reader.get() == nullptr) {
+ slog::warn << "Image " << files[input_idx] << " cannot be read!" << slog::endl << slog::endl;
+ continue;
+ }
+
+ /** Getting image data **/
+ std::shared_ptr<uint8_t> image_data(reader->getData(input_info.GetWidth(), input_info.GetHeight(), resize_type));
+ if (image_data) {
+ // Store the number of channels used in storing the image in the reader
+      // If the image is grayscale, the reader will still store it as a three-channel
+      // image, so to read it correctly we need to read the first channel value and
+      // then skip the next two.
+ const auto reader_channels = reader->size() / (reader->width() * reader->height());
+ vreader.emplace_back(image_data, input_idx, reader_channels);
+ }
+ }
+
+ /** Fill input tensor with image. First b channel, then g and r channels **/
+ const size_t num_channels = input_info.GetChannels();
+ const size_t width = input_info.GetWidth();
+ const size_t height = input_info.GetHeight();
+ const size_t batch = input_info.GetBatch();
+
+ const size_t image_size = width * height; // Calculate the image size
+
+  // Lambda expression for calculating the pixel index in the planar tensor data buffer
+ const auto get_index = [=](size_t image_id, size_t pid, size_t ch) {
+    // Keep the channel index as-is when bgr is true; otherwise reverse it
+ return image_id * image_size * num_channels + (bgr ? ch : (num_channels - ch - 1)) * image_size + pid;
+ };
+
+ // Lambda expression for calculating the channel (if bgr)
+  // Lambda expression for selecting the mean/scale channel, following the same bgr convention
+ return bgr ? ch : (num_channels - ch - 1);
+ };
+
+ /** Iterate over all input images **/
+ for (size_t image_id = 0; image_id < vreader.size(); ++image_id) {
+ const auto& reader_info = vreader.at(image_id);
+    // Error out if the graph has a single-channel input and the image is not grayscale
+ if (num_channels == 1 && !IsGrayScaleImage(reader_info, image_size)) {
+ THROW_IE_EXCEPTION
+ << "Graph input is grayscale (has a single channel) and the following image is in RGB format:\n\t"
+ << files.at(reader_info.file_index);
+ }
+ const auto reader_channels = reader_info.channels;
+ /** Iterate over all pixel in image (b,g,r) **/
+ for (size_t pid = 0; pid < image_size; pid++) {
+ /** Iterate over all channels **/
+ for (size_t ch = 0; ch < num_channels; ++ch) {
+ // check if scale values are 0
+ if (input_info.scale_values[get_channel(ch)] == 0) {
+ throw ov::Exception("Cannot apply scale value of 0");
+ }
+ // Reader is created with the assumption that the number of channels is always the maximum
+ data[get_index(image_id, pid, ch)] = static_cast<T>(
+ (reader_info.data.get()[pid * reader_channels + ch] - input_info.mean_values[get_channel(ch)]) /
+ input_info.scale_values[get_channel(ch)]);
+ }
+ }
+ }
+
+ auto tensor = ov::Tensor(input_info.type, {batch, num_channels, height, width}, ov::Allocator(allocator));
+ return tensor;
+}
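+
+// Worked example of the planar indexing above (illustrative values, not taken from the code):
+// for a 2x2 image with num_channels == 3 (image_size == 4), pixel pid == 1 and reader
+// channel ch == 0,
+//   get_index(0, 1, 0) == 0*4*3 + (3 - 0 - 1)*4 + 1 == 9   when bgr == false,
+//   get_index(0, 1, 0) == 0*4*3 + 0*4 + 1           == 1   when bgr == true,
+// i.e. reader channel 0 is written to tensor plane 2 unless -bgr is passed, in which
+// case the channel order coming from the reader is kept as-is.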
+
+/**
+ * @brief Fills a tensor with video data from input files
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ * Determines which video to use based on input_id, batch_size, input_size, and request_id.
+ * Reads that and creates an input tensor of type T corresponding to input element type.
+ *
+ * @param file_paths vector of file paths to the input images
+ * @param input_id binary input id, ie video 1, video 2...
+ * @param batch_size batch size of the tensor
+ * @param input_size number of images to be used
+ * @param request_id infer request id
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param input_name name of the input
+ * @param bgr boolean indicating if input channels need to be reversed
+ * @param verbose prints extra logging information if true
+ * @return ov::Tensor containing the input data extracted from the video
+*/
+template <typename T>
+ov::Tensor CreateTensorFromVideo(const std::vector<std::string>& file_paths,
+ const size_t input_id,
+ const size_t batch_size,
+ const size_t input_size,
+ const size_t request_id,
+ const dla_benchmark::InputInfo& input_info,
+ const std::string& input_name,
+ const bool bgr = false,
+ const bool verbose = false) {
+ size_t tensor_size =
+ std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+ auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+ auto data = reinterpret_cast<T*>(allocator->get_buffer());
+
+ const size_t input_idx = (request_id * input_size + input_id) % file_paths.size();
+
+ const size_t channels = input_info.GetChannels();
+ const size_t height = input_info.GetHeight();
+ const size_t width = input_info.GetWidth();
+ const size_t frame_count = input_info.GetDepth();
+ const size_t batch = input_info.GetBatch();
+
+ std::vector<cv::Mat> frames_to_write;
+ frames_to_write.reserve(batch_size * frame_count);
+ if (verbose) slog::info << "Prepare Video " << file_paths[input_idx] << slog::endl;
+
+ // Open Video
+ cv::VideoCapture cap(file_paths[input_idx]);
+ if (!cap.isOpened()) {
+ throw std::runtime_error("Video file " + file_paths[input_idx] + " cannot be read!");
+ }
+
+ // Get amount of frames in video and calculate a step to partition the video into clips
+ size_t video_frames = 0;
+ size_t step;
+ size_t cur_video_pos = 0;
+ cv::Mat calc_frame;
+
+  // Count frames with a read loop instead of cv::VideoCapture::get(), since get() isn't
+  // guaranteed to return the correct number of frames
+ while ((cap.read(calc_frame))) {
+ if (calc_frame.empty()) {
+ break;
+ }
+ video_frames++;
+ }
+
+ // Reopen the file at the starting position
+ cap.release();
+ cap.open(file_paths[input_idx].c_str());
+ if (!cap.isOpened()) {
+ throw std::runtime_error("Video file " + file_paths[input_idx] + " cannot be read!");
+ }
+
+ if (verbose) {
+ slog::info << "Video file " << file_paths[input_idx] << " contains " << video_frames << " readable frames."
+ << slog::endl;
+ }
+
+  // Calculate the step used to partition the video into "batch_size" clips
+ if (batch_size == 1) {
+ step = frame_count;
+ } else if (video_frames < frame_count) {
+ step = 1;
+ } else {
+ step = std::max((size_t)1, (video_frames - frame_count) / (batch_size - 1));
+ }
+
+ // Get frames
+ for (size_t clip_start = 0; clip_start < batch_size * step; clip_start += step) {
+ // Attempt to set position using OpenCV + Video Codec
+ bool success = cap.set(cv::CAP_PROP_POS_FRAMES, clip_start);
+
+ // Unsupported by codec, set manually
+ if (!success) {
+ if (cur_video_pos < clip_start) {
+ while (cur_video_pos != clip_start) {
+ cap.read(calc_frame);
+ cur_video_pos++;
+ }
+ } else if (cur_video_pos > clip_start) {
+ // Reopen the file at the starting position
+ cap.release();
+ cap.open(file_paths[input_idx].c_str());
+ if (!cap.isOpened()) {
+ throw std::runtime_error("Video file " + file_paths[input_idx] + " cannot be read!");
+ }
+ cur_video_pos = 0;
+ while (cur_video_pos != clip_start) {
+ cap.read(calc_frame);
+ cur_video_pos++;
+ }
+ }
+ }
+
+ for (size_t curr_frame = 0; curr_frame < frame_count; curr_frame++) {
+ cv::Mat frame;
+ cap.read(frame);
+
+ // Frame is empty -> Clip is shorter than frame_count, loop from start of clip
+ if (frame.empty()) {
+ if (verbose)
+ slog::info << "A video clip was shorter than the desired frame count, looping video." << slog::endl;
+ bool success = cap.set(cv::CAP_PROP_POS_FRAMES, clip_start);
+
+ // If unsupported by codec, set manually
+ if (!success) {
+ // Reopen the file at the starting position
+ cap.release();
+ cap.open(file_paths[input_idx].c_str());
+ if (!cap.isOpened()) {
+ throw std::runtime_error("Video file " + file_paths[input_idx] + " cannot be read!");
+ }
+ cur_video_pos = 0;
+ while (cur_video_pos != clip_start) {
+ cap.read(calc_frame);
+ cur_video_pos++;
+ }
+ } else {
+ cur_video_pos = clip_start;
+ }
+
+ cap.read(frame);
+
+ // If it's still empty, then there's an error with reading
+ if (frame.empty()) {
+ slog::err << "Video file " << file_paths[input_idx] << " frames cannot be read!" << slog::endl << slog::endl;
+ continue;
+ }
+ }
+
+ cur_video_pos++;
+ // If bgr=false, convert to RGB
+ if (!bgr) {
+ cv::cvtColor(frame, frame, cv::COLOR_BGR2RGB);
+ }
+
+ // Check frame sizing, resize if it doesn't match expected blob size
+ cv::Mat resized_frame(frame);
+ if (static_cast<int>(width) != frame.size().width || static_cast<int>(height) != frame.size().height) {
+ // Resizes to 256 and centre crops based on actual needed dimensions, may add a flag for this in the future
+ // to be cleaner
+ if (static_cast<int>(width) < 256 && static_cast<int>(height) < 256) {
+ double scale;
+ if (frame.size().width <= frame.size().height)
+ scale = double(256) / frame.size().width;
+ else
+ scale = double(256) / frame.size().height;
+ cv::resize(frame, resized_frame, cv::Size(0, 0), scale, scale);
+ const int offsetW = (resized_frame.size().width - static_cast<int>(width)) / 2;
+ const int offsetH = (resized_frame.size().height - static_cast<int>(height)) / 2;
+ const cv::Rect roi(offsetW, offsetH, static_cast<int>(width), static_cast<int>(height));
+ resized_frame = resized_frame(roi).clone();
+ } else {
+ cv::resize(frame, resized_frame, cv::Size(width, height));
+ }
+ }
+ // Save frame to write
+ frames_to_write.emplace_back(resized_frame);
+ }
+ }
+
+ // Write frames to blob
+ for (size_t b = 0; b < batch_size; b++) {
+ size_t batch_offset = b * channels * frame_count * height * width;
+ for (size_t c = 0; c < channels; c++) {
+ size_t channel_offset = c * frame_count * height * width;
+ for (size_t frameId = b * frame_count; frameId < (b + 1) * frame_count; frameId++) {
+ const cv::Mat& frame_to_write = frames_to_write.at(frameId);
+ size_t frame_offset_id = frameId % frame_count;
+ size_t frame_offset = frame_offset_id * height * width;
+ for (size_t h = 0; h < height; h++) {
+ for (size_t w = 0; w < width; w++) {
+ data[batch_offset + channel_offset + frame_offset + h * width + w] = frame_to_write.at<cv::Vec3b>(h, w)[c];
+ }
+ }
+ }
+ }
+ }
+ cap.release();
+ return ov::Tensor(input_info.type, {batch, channels, frame_count, height, width}, ov::Allocator(allocator));
+}
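+
+// Illustrative example of the clip partitioning above (hypothetical values): with
+// video_frames == 100, frame_count == 16, and batch_size == 4,
+//   step = max(1, (100 - 16) / (4 - 1)) = 28,
+// so the four clips start at frames 0, 28, 56, and 84, and each contributes
+// frame_count == 16 frames to the tensor (looping back if a clip runs short).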
+
+/**
+ * @brief Fills a tensor with image info data
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ *
+ * @param image_size Size of image width x height
+ * @param batch_size batch size of the tensor
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param input_name name of the input
+ * @return ov::Tensor containing the input data
+*/
+template <typename T>
+ov::Tensor CreateTensorImInfo(const std::pair<size_t, size_t>& image_size,
+ size_t batch_size,
+ const dla_benchmark::InputInfo& input_info,
+ const std::string& input_name) {
+ size_t tensor_size =
+ std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+ auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+ auto data = reinterpret_cast<T*>(allocator->get_buffer());
+
+ size_t info_batch_size = 1;
+ if (!input_info.layout.empty() && ov::layout::has_batch(input_info.layout)) {
+ info_batch_size = batch_size;
+ } else {
+ slog::warn << input_name << ": layout is not set or does not contain batch dimension. Assuming batch 1. "
+ << slog::endl;
+ }
+
+ for (size_t b = 0; b < info_batch_size; b++) {
+ size_t im_info_size = tensor_size / info_batch_size;
+ for (size_t i = 0; i < im_info_size; i++) {
+ size_t index = b * im_info_size + i;
+ if (0 == i)
+ data[index] = static_cast<T>(image_size.first);
+ else if (1 == i)
+ data[index] = static_cast<T>(image_size.second);
+ else
+ data[index] = 1;
+ }
+ }
+
+ auto tensor = ov::Tensor(input_info.type, input_info.data_shape, ov::Allocator(allocator));
+ return tensor;
+}
+
+/**
+ * @brief Fills a tensor with binary data from input files
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ * Determines which binary file to use based on input_id, batch_size, input_size, and request_id.
+ * Reads that and creates an input tensor of type T corresponding to input element type.
+ *
+ * @param files vector of file paths to the input images
+ * @param input_id binary input id, ie binary 1, binary 2...
+ * @param batch_size batch size of the tensor
+ * @param input_size number of images to be used
+ * @param request_id infer request id
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param input_name name of the input
+ * @param verbose prints extra logging information if true
+ * @return ov::Tensor containing the input data extracted from the binary
+*/
+template <typename T>
+ov::Tensor CreateTensorFromBinary(const std::vector<std::string>& files,
+ const size_t input_id,
+ const size_t batch_size,
+ const size_t input_size,
+ const size_t request_id,
+ const dla_benchmark::InputInfo& input_info,
+ const std::string& input_name,
+ const bool verbose = false) {
+ size_t tensor_size =
+ std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+ auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+ char* data = allocator->get_buffer();
+ size_t binary_batch_size = 1;
+ if (!input_info.layout.empty() && ov::layout::has_batch(input_info.layout)) {
+ binary_batch_size = batch_size;
+ } else {
+ slog::warn << input_name
+ << ": layout is not set or does not contain batch dimension. Assuming that binary "
+ "data read from file contains data for all batches."
+ << slog::endl;
+ }
+
+ for (size_t b = 0, input_idx = request_id * batch_size * input_size + input_id; b < binary_batch_size; b++, input_idx += input_size) {
+ input_idx %= files.size();
+ if (input_idx <= MAX_COUT_WITHOUT_VERBOSE || verbose) {
+ slog::info << "Prepare binary file " << files[input_idx] << slog::endl;
+ if (!verbose && input_idx == MAX_COUT_WITHOUT_VERBOSE) {
+ slog::info << "Truncating list of input files. Run with --verbose for complete list." << slog::endl;
+ }
+ }
+ std::ifstream binary_file(files[input_idx], std::ios_base::binary | std::ios_base::ate);
+ OPENVINO_ASSERT(binary_file, "Cannot open ", files[input_idx]);
+
+ auto file_size = static_cast<std::size_t>(binary_file.tellg());
+ binary_file.seekg(0, std::ios_base::beg);
+ OPENVINO_ASSERT(binary_file.good(), "Can not read ", files[input_idx]);
+ auto input_size = tensor_size * sizeof(T) / binary_batch_size;
+ OPENVINO_ASSERT(file_size == input_size,
+ "File ",
+ files[input_idx],
+ " contains ",
+ file_size,
+ " bytes, but the model expects ",
+ input_size);
+
+ if (input_info.layout != "CN") {
+ binary_file.read(&data[b * input_size], input_size);
+ } else {
+ for (size_t i = 0; i < input_info.GetChannels(); i++) {
+ binary_file.read(&data[(i * binary_batch_size + b) * sizeof(T)], sizeof(T));
+ }
+ }
+ }
+
+ auto tensor = ov::Tensor(input_info.type, input_info.data_shape, ov::Allocator(allocator));
+ return tensor;
+}
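+
+// Size-check example for the assertion above (hypothetical shape): an input with
+// data_shape {1, 3, 224, 224} and T == ov::float16 gives
+//   tensor_size = 1*3*224*224 = 150528 and input_size = 150528 * 2 / 1 = 301056,
+// so each .bin file must be exactly 301056 bytes when the layout has a batch of 1.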
+
+/**
+ * @brief Randomly fills an input tensor, used when no input files are provided
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ *
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param rand_min Min. random value
+ * @param rand_max Max. random value
+ * @return ov::Tensor containing the randomly generated input data
+*/
+template <typename T, typename T2>
+ov::Tensor CreateTensorRandom(const dla_benchmark::InputInfo& input_info,
+ T rand_min = std::numeric_limits<uint8_t>::min(),
+ T rand_max = std::numeric_limits<uint8_t>::max()) {
+ size_t tensor_size =
+ std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+ auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+ auto data = reinterpret_cast<T*>(allocator->get_buffer());
+
+  // Fixed seed keeps the generated inputs reproducible across runs
+  std::mt19937 gen(0);
+  uniformDistribution<T2> distribution(rand_min, rand_max);
+  for (size_t i = 0; i < tensor_size; i++) {
+    data[i] = static_cast<T>(distribution(gen));
+  }
+
+ ov::Shape tensor_shape = input_info.data_shape;
+ // FPGA model only supports channel first.
+ // The transpose for case NHWC and HWC below is ok since the tensor has randomly generated input data.
+ if (input_info.layout == "NHWC") {
+ // Use NCHW instead of NHWC since FPGA model only supports channel first.
+ tensor_shape = {input_info.GetBatch(), input_info.GetChannels(),
+ input_info.GetHeight(), input_info.GetWidth()};
+ } else if (input_info.layout == "HWC") {
+ // Use CHW instead of HWC since FPGA model only supports channel first.
+ tensor_shape = {input_info.GetChannels(), input_info.GetHeight(), input_info.GetWidth()};
+ }
+
+ auto tensor = ov::Tensor(input_info.type, tensor_shape, ov::Allocator(allocator));
+ return tensor;
+}
+
+/**
+ * @brief Wrapper for CreateTensorFromImage, uses appropriate stl data type for precision
+ *
+ * See CreateTensorFromImage for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetImageTensor(const std::vector<std::string>& files,
+ const size_t input_id,
+ const size_t batch_size,
+ const size_t input_size,
+ const size_t request_id,
+ const std::pair<std::string, dla_benchmark::InputInfo>& input_info,
+ const FormatReader::Reader::ResizeType resize_type,
+ const bool bgr = false,
+ const bool verbose = false) {
+  // Note (Edwinzha): all image data is read as U8 but saved as float in the tensor data structure.
+  // Saving as U8 results in accuracy loss in the diff check, especially for MobileNet graphs.
+ const ov::element::Type_t type = input_info.second.type;
+ if (type == ov::element::f16) {
+ return CreateTensorFromImage<ov::float16>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, resize_type, bgr, verbose);
+ } else {
+ return CreateTensorFromImage<float>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, resize_type, bgr, verbose);
+ }
+}
+
+/**
+ * @brief Wrapper for CreateTensorFromVideo, uses appropriate stl data type for precision
+ *
+ * See CreateTensorFromVideo for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetVideoTensor(const std::vector<std::string>& files,
+ const size_t input_id,
+ const size_t batch_size,
+ const size_t input_size,
+ const size_t request_id,
+ const std::pair<std::string, dla_benchmark::InputInfo>& input_info,
+ const bool bgr = false,
+ const bool verbose = false) {
+ auto type = input_info.second.type;
+ if (type == ov::element::f32) {
+ return CreateTensorFromVideo<float>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, bgr, verbose);
+ } else if (type == ov::element::u8) {
+ return CreateTensorFromVideo<uint8_t>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, bgr, verbose);
+ } else if (type == ov::element::i32) {
+ return CreateTensorFromVideo<int32_t>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, bgr, verbose);
+ } else if (type == ov::element::f16) {
+ return CreateTensorFromVideo<ov::float16>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, bgr, verbose);
+ } else {
+ throw ov::Exception("Video input tensor type is not supported: " + input_info.first);
+ }
+}
+
+/**
+ * @brief Wrapper for CreateTensorRandom, uses appropriate stl data type for precision
+ *
+ * See CreateTensorRandom for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetRandomTensor(const std::pair<std::string, dla_benchmark::InputInfo>& input_info) {
+ auto type = input_info.second.type;
+ if (type == ov::element::f32) {
+ return CreateTensorRandom<float, float>(input_info.second);
+ } else if (type == ov::element::f16) {
+ return CreateTensorRandom<short, short>(input_info.second);
+ } else if (type == ov::element::i32) {
+ return CreateTensorRandom<int32_t, int32_t>(input_info.second);
+ } else if (type == ov::element::u8) {
+ // uniform_int_distribution<uint8_t> is not allowed in the C++17
+ // standard and vs2017/19
+ return CreateTensorRandom<uint8_t, uint32_t>(input_info.second);
+ } else if (type == ov::element::i8) {
+ // uniform_int_distribution<int8_t> is not allowed in the C++17 standard
+ // and vs2017/19
+ return CreateTensorRandom<int8_t, int32_t>(
+ input_info.second, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+ } else if (type == ov::element::u16) {
+ return CreateTensorRandom<uint16_t, uint16_t>(input_info.second);
+ } else if (type == ov::element::i16) {
+ return CreateTensorRandom<int16_t, int16_t>(input_info.second);
+ } else {
+ throw ov::Exception("Random input tensor type is not supported: " + input_info.first);
+ }
+}
+
+/**
+ * @brief Wrapper for CreateTensorImInfo, uses appropriate stl data type for precision
+ *
+ * See CreateTensorImInfo for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetImInfoTensor(const std::pair<size_t, size_t>& image_size,
+ size_t batch_size,
+ const std::pair<std::string, dla_benchmark::InputInfo>& input_info) {
+ auto type = input_info.second.type;
+ if (type == ov::element::f32) {
+ return CreateTensorImInfo<float>(image_size, batch_size, input_info.second, input_info.first);
+ } else if (type == ov::element::f64) {
+ return CreateTensorImInfo<double>(image_size, batch_size, input_info.second, input_info.first);
+ } else if (type == ov::element::f16) {
+ return CreateTensorImInfo<ov::float16>(image_size, batch_size, input_info.second, input_info.first);
+ } else if (type == ov::element::i32) {
+ return CreateTensorImInfo<int32_t>(image_size, batch_size, input_info.second, input_info.first);
+ } else if (type == ov::element::i64) {
+ return CreateTensorImInfo<int64_t>(image_size, batch_size, input_info.second, input_info.first);
+ } else {
+    throw ov::Exception("Image info input tensor type is not supported: " + input_info.first);
+ }
+}
+
+/**
+ * @brief Wrapper for CreateTensorFromBinary, uses appropriate stl data type for precision
+ *
+ * See CreateTensorFromBinary for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetBinaryTensor(const std::vector<std::string>& files,
+ const size_t input_id,
+ const size_t batch_size,
+ const size_t input_size,
+ const size_t request_id,
+ const std::pair<std::string, dla_benchmark::InputInfo>& input_info,
+ const bool verbose = false) {
+ const auto& type = input_info.second.type;
+ if (type == ov::element::f32) {
+ return CreateTensorFromBinary<float>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, verbose);
+ } else if (type == ov::element::f16) {
+ return CreateTensorFromBinary<ov::float16>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, verbose);
+ } else if (type == ov::element::i32) {
+ return CreateTensorFromBinary<int32_t>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, verbose);
+ } else if ((type == ov::element::u8)) {
+ return CreateTensorFromBinary<uint8_t>(
+ files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, verbose);
+ } else {
+ throw ov::Exception("Binary input tensor type is not supported: " + input_info.first);
+ }
+}
+
+/**
+ * @brief Main function used by the DLA benchmark; creates input tensors based on the input files and precision
+ *
+ * Only creates static tensors (no dims of -1). Calls all other functions in this file.
+ *
+ * @param input_files vector of input file paths
+ * @param batch_size batch size of input
+ * @param inputs_info map of input name to InputInfo struct which contains useful input information
+ * such as precision, tensor layout
+ * @param requests_num number of infer requests
+ * @param bgr boolean indicating if channels are reversed, corresponds to user bgr flag
+ * @param is_binary_data boolean indicating if the image data should be binary, corresponding to user binary flag
+ * @param verbose Verbosity boolean. If true, additional logs are printed
+ * @return A map from input name to a vector of tensors (ov::TensorVector), where each
+ *         index corresponds to an infer request
+*/
+std::map<std::string, ov::TensorVector> GetStaticTensors(const std::vector<std::string>& input_files,
+ const size_t& batch_size,
+ dla_benchmark::InputsInfo& inputs_info,
+ size_t requests_num,
+ std::string resize_type,
+ bool bgr = false,
+ bool is_binary_data = false,
+ bool verbose = false) {
+ std::map<std::string, ov::TensorVector> blobs;
+ std::vector<std::pair<size_t, size_t>> net_input_im_sizes;
+ std::vector<std::tuple<size_t, size_t, size_t>> net_input_vid_sizes;
+ FormatReader::Reader::ResizeType resize_type_enum;
+
+ if (resize_type == "resize") {
+ resize_type_enum = FormatReader::Reader::ResizeType::RESIZE;
+ } else if (resize_type == "pad_resize") {
+ resize_type_enum = FormatReader::Reader::ResizeType::PAD_RESIZE;
+ } else {
+ slog::err << resize_type << " is not a valid -resize_type option" << slog::endl;
+ exit(1);
+ }
+
+ for (auto& item : inputs_info) {
+ const std::string& name = item.first;
+ const auto& input_info = item.second;
+ if (input_info.IsImage() && !is_binary_data) {
+ net_input_im_sizes.emplace_back(input_info.GetWidth(), input_info.GetHeight());
+ } else if (input_info.IsVideo()) {
+ net_input_vid_sizes.emplace_back(input_info.GetDepth(), input_info.GetWidth(), input_info.GetHeight());
+ }
+ slog::info << "Network input '" << name << "' precision " << input_info.type << ", dimensions "
+ << input_info.layout.to_string() << ": ";
+ slog::info << "[";
+ for (size_t i = 0; i < input_info.data_shape.size(); ++i) {
+ slog::info << input_info.data_shape[i];
+ if (i < input_info.data_shape.size() - 1) {
+ slog::info << " ";
+ }
+ }
+ slog::info << "]" << slog::endl;
+ }
+
+ size_t img_input_count = net_input_im_sizes.size();
+ size_t vid_input_count = net_input_vid_sizes.size();
+ size_t bin_input_count = inputs_info.size() - img_input_count - vid_input_count;
+
+ std::vector<std::string> binary_files;
+ std::vector<std::string> image_files;
+ std::vector<std::string> video_files;
+
+ if (input_files.empty()) {
+ slog::warn << "No input files were given: all inputs will be filled with random values!" << slog::endl;
+ } else {
+ binary_files = FilterFilesByExtensions(input_files, supported_binary_extensions);
+ std::sort(std::begin(binary_files), std::end(binary_files));
+
+ auto bins_to_be_used = bin_input_count * batch_size * requests_num;
+ if (bins_to_be_used > 0 && binary_files.empty()) {
+ std::stringstream ss;
+ for (auto& ext : supported_binary_extensions) {
+ if (!ss.str().empty()) {
+ ss << ", ";
+ }
+ ss << ext;
+ }
+ slog::warn << "No supported binary inputs found! Please check your file "
+ "extensions: "
+ << ss.str() << slog::endl;
+ } else if (bins_to_be_used > binary_files.size()) {
+ slog::warn << "Some binary input files will be duplicated: " << bins_to_be_used << " files are required but only "
+ << binary_files.size() << " are provided" << slog::endl;
+ } else if (bins_to_be_used < binary_files.size()) {
+ slog::warn << "Some binary input files will be ignored: only " << bins_to_be_used << " are required from "
+ << binary_files.size() << slog::endl;
+ }
+
+ image_files = FilterFilesByExtensions(input_files, supported_image_extensions);
+ std::sort(std::begin(image_files), std::end(image_files));
+
+ auto imgs_to_be_used = img_input_count * batch_size * requests_num;
+ if (imgs_to_be_used > 0 && image_files.empty()) {
+ std::stringstream ss;
+ for (auto& ext : supported_image_extensions) {
+ if (!ss.str().empty()) {
+ ss << ", ";
+ }
+ ss << ext;
+ }
+ slog::warn << "No supported image inputs found! Please check your file "
+ "extensions: "
+ << ss.str() << slog::endl;
+ } else if (imgs_to_be_used > image_files.size()) {
+ slog::warn << "Some image input files will be duplicated: " << imgs_to_be_used << " files are required but only "
+ << image_files.size() << " are provided" << slog::endl;
+ } else if (imgs_to_be_used < image_files.size()) {
+ slog::warn << "Some image input files will be ignored: only " << imgs_to_be_used << " are required from "
+ << image_files.size() << slog::endl;
+ }
+
+ video_files = FilterFilesByExtensions(input_files, supported_video_extensions);
+ std::sort(std::begin(video_files), std::end(video_files));
+ auto vids_to_be_used = vid_input_count * requests_num;
+ if (vids_to_be_used > 0 && video_files.empty()) {
+ std::stringstream ss;
+ for (auto& ext : supported_video_extensions) {
+ if (!ss.str().empty()) {
+ ss << ", ";
+ }
+ ss << ext;
+ }
+ slog::warn << "No supported video inputs found! Please check your file extensions: " << ss.str() << slog::endl;
+ } else if (vids_to_be_used > video_files.size()) {
+ slog::warn << "Some video input files will be duplicated: " << vids_to_be_used << " files are required but only "
+ << video_files.size() << " are provided" << slog::endl;
+ } else if (vids_to_be_used < video_files.size()) {
+ slog::warn << "Some video input files will be ignored: only " << vids_to_be_used << " are required from "
+ << video_files.size() << slog::endl;
+ }
+ }
+
+ for (size_t i = 0; i < requests_num; ++i) {
+ size_t img_input_id = 0;
+ size_t bin_input_id = 0;
+ size_t vid_input_id = 0;
+
+ for (auto& item : inputs_info) {
+ const std::string& input_name = item.first;
+ const auto& input_info = item.second;
+ if (item.second.IsImage() && !is_binary_data) {
+ if (!image_files.empty()) {
+ // Fill with images
+ blobs[input_name].push_back(GetImageTensor(
+ image_files, img_input_id++, batch_size, img_input_count, i, {input_name, input_info}, resize_type_enum, bgr, verbose));
+ continue;
+ }
+ } else if (input_info.IsVideo()) {
+ if (!video_files.empty()) {
+ // Fill with videos
+ blobs[input_name].push_back(GetVideoTensor(
+ video_files, vid_input_id++, batch_size, vid_input_count, i, {input_name, input_info}, bgr, verbose));
+ continue;
+ }
+ } else {
+ if (!binary_files.empty()) {
+ // Fill with binary files
+ blobs[input_name].push_back(
+ GetBinaryTensor(binary_files, bin_input_id++, batch_size, bin_input_count, i, {input_name, input_info}, verbose));
+ continue;
+ }
+ if (input_info.IsImageInfo() && (net_input_im_sizes.size() == 1)) {
+ // Most likely it is image info: fill with image information
+ auto image_size = net_input_im_sizes.at(0);
+ blobs[input_name].push_back(GetImInfoTensor(image_size, batch_size, {input_name, input_info}));
+ continue;
+ }
+ }
+      // Fill with random data
+      slog::info << "No suitable input data found, filling input tensors with random data." << slog::endl;
+ blobs[input_name].push_back(GetRandomTensor({input_name, input_info}));
+ }
+ }
+
+ return blobs;
+}
+
+/**
+ * @brief Copies data from a source OpenVINO Tensor to a destination Tensor.
+ *
+ * @param dst The destination Tensor where data will be copied.
+ * @param src The source Tensor from which data will be copied.
+ */
+void CopyTensorData(ov::Tensor& dst, const ov::Tensor& src) {
+ if (src.get_shape() != dst.get_shape() || src.get_byte_size() != dst.get_byte_size()) {
+ throw std::runtime_error(
+ "Source and destination tensors shapes and byte sizes are expected to be equal for data copying.");
+ }
+
+ memcpy(dst.data(), src.data(), src.get_byte_size());
+}
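+
+// Illustrative usage sketch (hypothetical tensor names, not part of the benchmark flow):
+//   ov::Tensor src(ov::element::f32, ov::Shape{1, 3, 224, 224});
+//   ov::Tensor dst(ov::element::f32, ov::Shape{1, 3, 224, 224});
+//   CopyTensorData(dst, src);  // throws std::runtime_error if shapes or byte sizes differ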
diff --git a/python/openvino/runtime/dla_benchmark/inputs_filling.hpp b/python/openvino/runtime/dla_benchmark/inputs_filling.hpp
new file mode 100644
index 0000000..e392bd7
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/inputs_filling.hpp
@@ -0,0 +1,45 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: This file defines methods to fill input data into tensors
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+#include "infer_request_wrap.hpp"
+
+/**
+ * @brief Main function used by the DLA benchmark, creates input tensors based on input files and precision
+ *
+ * Only creates static tensors (no dims of -1). Calls all other functions in this file.
+ *
+ * @param input_files vector of input file paths
+ * @param batch_size batch size of the input
+ * @param app_inputs_info map of input name to InputInfo struct, which contains useful input information
+ *                        such as precision and tensor layout
+ * @param requests_num number of infer requests
+ * @param resize_type how image inputs are fitted to the network input shape: "resize" or "pad_resize"
+ * @param bgr boolean indicating if channels are reversed, corresponds to the user bgr flag
+ * @param is_binary_data boolean indicating if the image data should be binary, corresponds to the user binary flag
+ * @param verbose verbosity boolean; if true, additional logs are printed
+ * @return A map from input name to a TensorVector (an alias for std::vector<ov::Tensor>) where
+ *         each index corresponds to one batch
+ */
+std::map<std::string, ov::TensorVector> GetStaticTensors(const std::vector<std::string>& input_files,
+ const size_t& batch_size,
+ dla_benchmark::InputsInfo& app_inputs_info,
+ size_t requests_num,
+ std::string resize_type,
+ bool bgr,
+ bool is_binary_data,
+ bool verbose);
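+
+// Illustrative call sketch (assumes the caller has populated `files`, `batch`, and `info`;
+// these names are hypothetical):
+//   std::map<std::string, ov::TensorVector> tensors =
+//       GetStaticTensors(files, batch, info, /*requests_num=*/4, "resize",
+//                        /*bgr=*/false, /*is_binary_data=*/false, /*verbose=*/false);
+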
+/**
+ * @brief Copies data from a source OpenVINO Tensor to a destination Tensor.
+ *
+ * @param dst The destination Tensor where data will be copied.
+ * @param src The source Tensor from which data will be copied.
+ */
+void CopyTensorData(ov::Tensor& dst, const ov::Tensor& src);
diff --git a/python/openvino/runtime/dla_benchmark/main.cpp b/python/openvino/runtime/dla_benchmark/main.cpp
new file mode 100644
index 0000000..9d9055d
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/main.cpp
@@ -0,0 +1,1575 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Main file of the DLA benchmark. Entry point of DLA for just-in-time (JIT) and ahead-of-time (AOT)
+// execution, and any use case where DLA performs inference. This file is responsible for the end-to-end flow of DLA:
+// reading user input arguments, creating input tensors, compiling models, running inference, and
+// dumping results. The DLA benchmark is loosely based on OpenVINO's sample benchmark app.
+// For future OpenVINO uplifts, their sample app is a good place to start.
+// Ref: [openvinotoolkit/openvino › samples/cpp/benchmark_app/main.cpp]
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+#if defined(_WIN32) || defined(_WIN64)
+#include <io.h>
+#define NOMINMAX
+#include <Windows.h>
+#else
+#include <dirent.h>
+#include <unistd.h>
+#endif
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <regex>
+
+#include <samples/args_helper.hpp>
+#include <samples/common.hpp>
+#include <samples/slog.hpp>
+
+// DLA utils
+#include "dla_stl_utils.h"
+#include "dla_defines.h"
+
+// DLA benchmark
+#include "average_precision.hpp"
+#include "dla_benchmark.hpp"
+#include "dla_plugin_config.hpp"
+#include "infer_request_wrap.hpp"
+#include "inputs_filling.hpp"
+#include "progress_bar.hpp"
+#include "statistics_report.hpp"
+#include "top1_top5.hpp"
+#include "utils.hpp"
+
+using DebugNetworkData = std::map<std::string, uint64_t>;
+using LSUCounterData = std::map<std::string, uint64_t>;
+
+static const size_t progressBarDefaultTotalCount = 1000;
+
+// Get value from env variable named 'name', if it exists.
+// If not, returns provided default value.
+template <class T>
+T GetEnvOrDefault(const char* name, T default_value) {
+ char* str_val = std::getenv(name);
+ T result = default_value;
+ if (str_val != NULL) {
+ std::stringstream ss;
+ ss << str_val;
+ ss >> result;
+ }
+ return result;
+}
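+
+// Illustrative usage sketch (the default of 265 here is just an example value):
+//   int fmax_mhz = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 265);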
+
+bool ExistsTest(const std::string& name) {
+ struct stat buffer;
+ return (stat(name.c_str(), &buffer) == 0);
+}
+
+bool isFile(const std::string& path) {
+#if defined(_WIN32) || defined(_WIN64)
+ std::cout << "Windows-specific implementation for checking if something is a file" << std::endl;
+ // Windows-specific implementation
+ DWORD fileAttr = GetFileAttributesA(path.c_str());
+ if (fileAttr == INVALID_FILE_ATTRIBUTES) {
+ // The path does not exist or an error occurred.
+ return false;
+ }
+ // Check if it's not a directory.
+ return !(fileAttr & FILE_ATTRIBUTE_DIRECTORY);
+#else
+ // UNIX-specific implementation
+ struct stat buffer;
+ if (stat(path.c_str(), &buffer) == 0) {
+ return S_ISREG(buffer.st_mode);
+ }
+ return false;
+#endif
+}
+
+// This function appears in dla_aot_splitter/src/main.cpp too
+bool DirOpenTest(const std::string& name) {
+#if (!defined(_WIN32) && !defined(_WIN64))
+ // If we can open the directory then return true
+ DIR* dp = opendir(name.c_str());
+ if (dp != nullptr) {
+ closedir(dp);
+ return true;
+ }
+#endif // !_WIN32 && !_WIN64
+ struct stat sb;
+ if (stat(name.c_str(), &sb) == 0) {
+ if ((sb.st_mode & S_IFMT) != S_IFREG) {
+ slog::err << "File " << name << " cannot be opened!" << slog::endl;
+ throw std::logic_error("File cannot be opened!");
+ }
+ }
+ return true;
+}
+
+// Define a custom comparison function to sort based on ASCII names
+bool CompareOutputNodeNames(const ov::Output<const ov::Node>& node1, const ov::Output<const ov::Node>& node2) {
+ return node1.get_any_name() < node2.get_any_name();
+}
+
+// Copy arguments into a new array to split the '-i=<arg>' into
+// two arguments (i.e. '-i' and '<arg>') to work around a bug in the
+// parseInputFilesArguments function where it doesn't recognize
+// the -i=<arg> format.
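+// For example, an invocation like `dla_benchmark -i=./images -b 1` is rewritten internally to
+// `dla_benchmark -i ./images -b 1` before the arguments are handed to gflags.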
+void ParseCommandLine(int argc, char** argv) {
+ int num_args = argc;
+  // allocate enough memory in case we need to split the -i argument into two
+ char** arguments = new char*[num_args + 1];
+ for (int i = 0, j = 0; j < argc; ++i, ++j) {
+ if (strstr(argv[j], "-i=")) {
+ // number of arguments will increase by one after splitting
+ num_args++;
+ arguments[i] = new char[3];
+ strcpy(arguments[i++], "-i");
+      // copy the rest of the argument (i.e. post "-i=")
+ arguments[i] = new char[strlen(argv[j]) - 2];
+ strcpy(arguments[i], argv[j] + 3);
+ continue;
+ }
+ arguments[i] = new char[strlen(argv[j]) + 1];
+ strcpy(arguments[i], argv[j]);
+ }
+  // the parse function modifies the arguments pointer, so we need to keep
+  // a copy of the original pointer value to delete it properly
+ char** orig_arg_ptr = arguments;
+ gflags::ParseCommandLineNonHelpFlags(&num_args, &arguments, true);
+ // delete the allocated memory
+ for (int i = 0; i < num_args; ++i) {
+ delete[] orig_arg_ptr[i];
+ }
+ delete[] orig_arg_ptr;
+}
+
+bool CheckAndSetPluginsPath(const char* coredla_root) {
+ // plugins_xml_file should probably be removed in the future
+ if (!FLAGS_plugins_xml_file.empty()) {
+ FLAGS_plugins = FLAGS_plugins_xml_file;
+ slog::warn << "====================================================================" << slog::endl;
+ slog::warn << "Warning: -plugins_xml_file option is deprecated, please use -plugins." << slog::endl;
+ slog::warn << "====================================================================" << slog::endl;
+ }
+
+ const char* coredla_work = std::getenv("COREDLA_WORK");
+ std::string coredla_root_str = coredla_root;
+ if (FLAGS_plugins.empty()) {
+ if (coredla_work == nullptr) {
+ FLAGS_plugins = coredla_root_str + "/runtime/plugins.xml";
+ } else {
+ std::string coredla_work_str = coredla_work;
+ FLAGS_plugins = coredla_work_str + "/runtime/plugins.xml";
+ }
+
+ if (ExistsTest(FLAGS_plugins)) {
+ slog::info << "Using default plugins xml file - " << FLAGS_plugins << slog::endl;
+ return true;
+ }
+ }
+
+ if (ExistsTest(FLAGS_plugins) && isFile(FLAGS_plugins)) {
+ slog::info << "Using custom plugins xml file - " << FLAGS_plugins << slog::endl;
+ return true;
+ }
+ // Check if user wants a shortcut to software emulation xml file if a path does not exist
+ if (FLAGS_plugins.find("emulation") != std::string::npos) {
+ // Potential paths for the plugins_emulation.xml file
+ std::string deployed_loc_plugins = coredla_root_str + "/bin/plugins_emulation.xml";
+ std::string developer_loc_plugins = coredla_root_str + "/build/coredla/dla/bin/plugins_emulation.xml";
+
+ if (ExistsTest(deployed_loc_plugins))
+ FLAGS_plugins = deployed_loc_plugins;
+ else if (ExistsTest(developer_loc_plugins))
+ FLAGS_plugins = developer_loc_plugins;
+ } else {
+    // if the user didn't specify 'emulation' and the given path is not a valid xml file, raise an error
+ throw std::invalid_argument("Invalid argument for -plugins. Use 'emulation' or a path to custom xml file");
+ }
+
+ if (ExistsTest(FLAGS_plugins)) {
+ slog::info << "Using custom emulation xml file - " << FLAGS_plugins << slog::endl;
+ return true;
+ }
+
+ return false;
+}
+
+bool ParseAndCheckCommandLine(int argc, char* argv[], size_t& net_size) {
+ // ---------------------------Parsing and validating input arguments--------------------------------------
+ slog::info << "Parsing input parameters" << slog::endl;
+
+ // Check for any flags that are missing their preceding dashes
+ // GFlags quietly ignores any flags missing their dashes, which can cause
+ // dla_benchmark to run with settings other than what the user intended
+
+ // GFlags supports two different styles of flag:
+ // 1. --<flag>
+ // 2. -<flag>
+ // It also supports two different ways of specifying values for flags which
+ // take values:
+ // 1. --<flag>=<value>
+ // 2. --<flag> <value>
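+  // For example, `-batch_size=8`, `--batch_size 8`, and `-nireq 4` are all accepted, while
+  // `batch_size=8` (missing its dash) would otherwise be silently ignored by gflags.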
+
+ // If we are not expecting a flag, we are expecting a value for the
+ // preceding flag
+ bool expecting_flag = true;
+ // Start at 1 to skip the command itself
+ for (int i = 1; i < argc; i++) {
+ if (expecting_flag) {
+ // A flag is always denoted by the first char being '-'
+ if (argv[i][0] != '-') {
+ slog::err << "Argument " << argv[i] << " is invalid. You"
+ << " may have forgotten a preceding '-'." << slog::endl;
+ throw std::logic_error("One or more invalid arguments");
+ }
+
+ char* flag_name_start = (argv[i][1] == '-') ? &argv[i][2] : &argv[i][1];
+ std::string flag_name;
+
+ gflags::CommandLineFlagInfo flag_info;
+ if (strstr(flag_name_start, "=")) {
+ flag_name = std::string(flag_name_start, size_t(strstr(flag_name_start, "=") - flag_name_start));
+ } else {
+ flag_name = std::string(flag_name_start);
+ }
+
+ // We expect a flag in the next argv if the current flag is a bool,
+ // because bool flags do not take a value.
+ // If GetCommandLineFlagInfo returns false, we assume the current
+ // flag is a boolean because boolean flags can be specified as
+ // -no<flag>, which is equivalent to -<flag>=false, or the flag
+ // simply being omitted. However, "no<flag>" is not recognized by
+ // GetCommandLineFlagInfo.
+ // Therefore, if the name is not recognized either the flag is a
+ // boolean flag or doesn't exist. In the latter case, gflags errors
+ // when we call ParseCommandLine so we can assume here it's a bool.
+ if (!GetCommandLineFlagInfo(flag_name.c_str(), &flag_info) || strstr(argv[i], "=") || flag_info.type == "bool") {
+ expecting_flag = true;
+ } else {
+ expecting_flag = false;
+ }
+ } else {
+      // If we were expecting a value, it doesn't matter what it is:
+      // gflags will check that all values are the correct type, and
+      // dla_benchmark checks whether the values received are sane
+ expecting_flag = true;
+ }
+ }
+
+ ParseCommandLine(argc, argv);
+
+ if (FLAGS_help || FLAGS_h) {
+ ShowUsage();
+ // CoreDLA: Version 2020.3 of OpenVINO assumes that the PAC board with OPAE on it
+ // is an OpenCL/DLAv1 device. Since it is not, it then errors-out when the device
+    // does not respond as expected to the OpenCL query.
+ // showAvailableDevices();
+ std::cout << "\n";
+ return false;
+ }
+
+ if (FLAGS_hidden_help) {
+ PrintHiddenHelp();
+ return false;
+ }
+
+ if (FLAGS_cm.empty()) {
+ std::string network_file_flag;
+ if (!FLAGS_m.empty()) {
+ if (!FLAGS_network_file.empty()) {
+ throw std::invalid_argument(
+ "Both --network-file and -m are specified. Please only use one of the two arguments.");
+ }
+ network_file_flag = FLAGS_m;
+ } else if (!FLAGS_network_file.empty()) {
+ network_file_flag = FLAGS_network_file;
+ } else {
+ throw std::logic_error("Model is required but not set. Please set -m option.");
+ }
+
+ std::vector<std::string> m_paths = split(network_file_flag, MULTIGRAPH_SEP);
+ net_size = m_paths.size();
+ slog::info << "Found " << net_size << " graph" << (net_size == 1 ? "" : "s") << slog::endl;
+ for (auto& m_path : m_paths) {
+ if (!ExistsTest(m_path)) {
+ slog::err << "network file: " << m_path << " doesn't exist. Please provide a valid path with -m." << slog::endl;
+ throw std::logic_error("Model file path does not exist.");
+ }
+ }
+ } else {
+ std::vector<std::string> m_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+ net_size = m_paths.size();
+ slog::info << "Found " << net_size << " compiled graph" << (net_size == 1 ? "" : "s") << slog::endl;
+ for (auto& m_path : m_paths) {
+ if (!ExistsTest(m_path)) {
+ slog::err << "compiled model file: " << FLAGS_cm << " doesn't exist. Please provide a valid path with -cm."
+ << slog::endl;
+ throw std::logic_error("Compiled model file path does not exist.");
+ }
+ }
+ }
+
+ if (FLAGS_api != "async" && FLAGS_api != "sync") {
+ throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
+ }
+
+ if (FLAGS_niter <= 0) {
+ throw std::logic_error("-niter is a required flag and its value must be positive");
+ }
+
+ const char* coredla_root = std::getenv("COREDLA_ROOT");
+ if (coredla_root == nullptr) {
+ slog::err << "ERROR: COREDLA_ROOT environment variable is not set." << slog::endl;
+ throw std::logic_error("Please set up correct environment variables first");
+ }
+
+ if (!CheckAndSetPluginsPath(coredla_root)) {
+ slog::err << "plugins_xml file: " << FLAGS_plugins_xml_file << " doesn't exist. Please provide a valid path."
+ << slog::endl;
+ throw std::logic_error("plugins_xml file path does not exist.");
+ }
+
+ // Checks required arguments for the mAP calculation subroutine.
+ if (FLAGS_enable_object_detection_ap) {
+ if (!FLAGS_yolo_version.size() || !is_yolo_supported(FLAGS_yolo_version)) {
+ slog::err << "Please specify the version of your YOLO graph by setting the -yolo_version option to "
+ "`yolo-v3-tiny-tf` or `yolo-v3-tf` value."
+ << slog::endl;
+ throw std::logic_error("Incorrect YOLO version.");
+ }
+ }
+
+ // Checks if output directory exists and can be opened
+ if (!FLAGS_output_dir.empty()) {
+ if (!ExistsTest(FLAGS_output_dir)) {
+ slog::err << "Specified output directory: " << FLAGS_output_dir << " does not exist" << slog::endl;
+ throw std::logic_error("Output directory does not exist");
+ }
+ // Test whether the path can be opened if it's a directory
+ DirOpenTest(FLAGS_output_dir);
+ }
+
+ return true;
+}
+
+static void next_step(const std::string additional_info = "") {
+ static size_t step_id = 0;
+ static const std::map<size_t, std::string> step_names = {{1, "Parsing and validating input arguments"},
+ {2, "Loading OpenVINO Runtime"},
+ {3, "Setting device configuration"},
+ {4, "Reading the Intermediate Representation network"},
+ {5, "Resizing network to match image sizes and given batch"},
+ {6, "Configuring input of the model"},
+ {7, "Loading the model to the device"},
+ {8, "Setting optimal runtime parameters"},
+ {9, "Creating infer requests and preparing input tensors"},
+ {10, "Measuring performance"},
+ {11, "Dumping statistics report"},
+ {12, "Dumping the output values"}};
+
+ step_id++;
+ if (step_names.count(step_id) == 0)
+ THROW_IE_EXCEPTION << "Step ID " << step_id << " is out of total steps number " << step_names.size();
+
+ std::cout << "[Step " << step_id << "/" << step_names.size() << "] " << step_names.at(step_id)
+ << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl;
+}
+
+template <typename T>
+T GetMedianValue(const std::vector<T>& vec) {
+ std::vector<T> sorted_vec(vec);
+ std::sort(sorted_vec.begin(), sorted_vec.end());
+ return (sorted_vec.size() % 2 != 0)
+ ? sorted_vec[sorted_vec.size() / 2ULL]
+ : (sorted_vec[sorted_vec.size() / 2ULL] + sorted_vec[sorted_vec.size() / 2ULL - 1ULL]) /
+ static_cast<T>(2.0);
+}
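+
+// Illustrative sketch: GetMedianValue<double>({3.0, 1.0, 2.0}) returns 2.0, while
+// GetMedianValue<double>({4.0, 1.0, 2.0, 3.0}) returns (2.0 + 3.0) / 2 = 2.5.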
+
+void ReadDebugNetworkInfo(ov::Core core) {
+ if (FLAGS_debug_network) {
+ // On hardware timeout exception, fetch Debug CSR values from all modules attached to the Debug Network
+ std::vector<DebugNetworkData> debug_csr_return =
+ core.get_property("FPGA", "COREDLA_DEBUG_NETWORK_INFO").as<std::vector<DebugNetworkData>>();
+ slog::info << "Dumping Debug Network profiling counters" << slog::endl;
+ for (auto i = 0U; i < debug_csr_return.size(); i++) {
+ std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl;
+ // Print debug info for all instances
+ for (auto& instance_csr_return : debug_csr_return[i]) {
+ std::cout << instance_csr_return.first << ": " << instance_csr_return.second << std::endl;
+ }
+ }
+ }
+}
+
+void PrintLSUCounterInfo(ov::Core core) {
+ std::vector<LSUCounterData> lsu_counter_vec =
+ core.get_property("FPGA", "COREDLA_LSU_ACCESS_COUNT").as<std::vector<LSUCounterData>>();
+ slog::info << "Dumping LSU memory access counters" << slog::endl;
+ for (auto i = 0U; i < lsu_counter_vec.size(); i++) {
+ std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl;
+ for (const auto& entry : lsu_counter_vec.at(i)) {
+      std::cout << entry.first << ": " << entry.second << std::endl;
+ }
+ }
+}
+
+// Returns true if the last non-whitespace character of the csv file is a comma
+bool is_last_char_comma(FILE* file) {
+  if (file == nullptr) return false;
+
+ int i = -1;
+ std::vector<char> white_space_chars = {'\n', ' ', '\t', '\r', '\f', '\v'};
+ char last_char[1];
+ do {
+ if (std::fseek(file, i, SEEK_END) != 0) {
+      return false;
+ }
+ if (std::fread(last_char, 1, 1, file) == 0) {
+      return false;
+ }
+ i--;
+ } while (std::count(white_space_chars.begin(), white_space_chars.end(), last_char[0]) != 0);
+
+ return last_char[0] == ',';
+}
+
+bool fileExists(std::string& path) {
+ struct stat buffer;
+ return (stat(path.c_str(), &buffer) == 0);
+}
+
+void append_value_if_incomplete_to_csv(std::string path, double value) {
+ try {
+ if (!fileExists(path)) {
+ return;
+ }
+
+ FILE* data_file = fopen(path.c_str(), "rb");
+ if (data_file == nullptr) {
+ return;
+ }
+ bool is_comma = is_last_char_comma(data_file);
+ fclose(data_file);
+
+ if (is_comma) {
+ FILE* append_file = fopen(path.c_str(), "a");
+ if (append_file == nullptr) {
+ return;
+ }
+ fprintf(append_file, "%f\n", value);
+ fclose(append_file);
+ }
+ } catch (...) {
+ return;
+ }
+}
+
+/**
+ * @brief The entry point of the dla benchmark
+ */
+int main(int argc, char* argv[]) {
+ std::shared_ptr<StatisticsReport> statistics;
+ try {
+    // Declaring the CompiledModel object as a pointer to work around the segfault
+    // that occurs when destructing the object. Now that it's declared as a pointer,
+    // the compiler won't automatically call the destructor of the object at the end
+    // of this scope, and we won't delete the allocated memory either
+ std::vector<ov::CompiledModel*> compiled_models;
+ size_t net_size = 0; // parse the size of networks for arguments check
+
+ size_t return_code = 0; // universal return code, return this value after dumping out Debug info
+
+ // ----------------- 1. Parsing and validating input arguments -------------------------------------------------
+ next_step();
+
+ if (!ParseAndCheckCommandLine(argc, argv, net_size)) {
+ return 0;
+ }
+
+ bool is_model_compiled = !FLAGS_cm.empty();
+ if (is_model_compiled) {
+ slog::info << "Model is compiled" << slog::endl;
+ }
+
+ std::string arch_file_flag;
+ if (!FLAGS_arch_file.empty()) {
+ if (!FLAGS_arch.empty()) {
+ throw std::invalid_argument(
+ "Both --arch and -arch_file are specified. Please only use one of the two arguments.");
+ }
+ arch_file_flag = FLAGS_arch_file;
+ } else if (!FLAGS_arch.empty()) {
+ arch_file_flag = FLAGS_arch;
+ }
+
+ bool flag_b_default = gflags::GetCommandLineFlagInfoOrDie("b").is_default;
+ bool flag_batch_size_default = gflags::GetCommandLineFlagInfoOrDie("batch_size").is_default;
+
+ size_t batch_size_flag;
+ if (!flag_b_default) {
+ if (!flag_batch_size_default) {
+ throw std::invalid_argument(
+ "Both --batch-size and -b are specified. Please only use one of the two arguments.");
+ }
+ batch_size_flag = FLAGS_b;
+ } else {
+ batch_size_flag = FLAGS_batch_size;
+ }
+
+ if (batch_size_flag > 10000 || batch_size_flag <= 0) {
+ throw std::invalid_argument(
+ "Batch size is too big (>10000) or not a postive number (<=0). Specify the batch size within the specified "
+ "range.");
+ }
+
+ std::string network_file_flag;
+ if (!FLAGS_m.empty()) {
+ if (!FLAGS_network_file.empty()) {
+ throw std::invalid_argument(
+ "Both --network-file and -m are specified. Please only use one of the two arguments.");
+ }
+ network_file_flag = FLAGS_m;
+ } else if (!FLAGS_network_file.empty()) {
+ network_file_flag = FLAGS_network_file;
+ }
+
+ // langsu: ideally use boost to create a sub-folder for ddrfree files
+ // but ed4 toolchain doesn't have boost yet.
+ std::string output_dir;
+ std::string parameter_rom_output_dir;
+ std::string separator = dla::util::path_separator;
+ if (!FLAGS_output_dir.empty()) {
+ output_dir = FLAGS_output_dir + separator;
+ parameter_rom_output_dir = output_dir;
+ } else {
+ output_dir = "." + separator;
+ parameter_rom_output_dir = output_dir;
+ }
+
+ // The set of arguments printed is meant to be a useful summary to the
+ // user, rather than all of the arguments to dla_benchmark
+ slog::info << "Printing summary of arguments being used by dla_benchmark" << slog::endl
+ << "API (-api) ........................... " << FLAGS_api << slog::endl
+ << "Device (-d) .......................... " << FLAGS_d << slog::endl
+ << "Batch size (-b) ...................... " << batch_size_flag << slog::endl
+ << (!FLAGS_cm.empty() ? "Compiled model (-cm) ................. "
+ : "Model (-m) ........................... ")
+ << (!FLAGS_cm.empty() ? FLAGS_cm : network_file_flag) << slog::endl
+ << "Num iterations (-niter) .............. "
+ << (FLAGS_niter > 0 ? std::to_string(FLAGS_niter) : "Not specified") << slog::endl
+ << "Input images directory (-i) .......... "
+ << (!FLAGS_i.empty() ? FLAGS_i : "Not specified, will use randomly-generated images") << slog::endl
+ << "Num CPU threads (-nthreads) .......... "
+ << (FLAGS_nthreads > 0 ? std::to_string(FLAGS_nthreads) : "Not specified") << slog::endl
+ << "Architecture file (-arch_file) ....... " << arch_file_flag << slog::endl
+ << "Num inference requests (-nireq) ...... "
+ << (FLAGS_nireq > 0 ? std::to_string(FLAGS_nireq) : "Not specified") << slog::endl
+ << "Plugins file (-plugins) ..... " << FLAGS_plugins << slog::endl
+ << "Groundtruth file (-groundtruth_loc) .. "
+ << (!FLAGS_groundtruth_loc.empty() ? FLAGS_groundtruth_loc : "Not specified") << slog::endl
+ << "Reverse input image channels (-bgr) .. " << (FLAGS_bgr ? "True" : "False") << slog::endl
+ << "EA features " << (FLAGS_enable_early_access ? "enabled." : "disabled.") << slog::endl;
+
+ if (FLAGS_save_run_summary) {
+ std::vector<gflags::CommandLineFlagInfo> flags;
+ StatisticsReport::Parameters command_line_arguments;
+ gflags::GetAllFlags(&flags);
+
+ for (auto& flag : flags) {
+ if (!flag.is_default) {
+ command_line_arguments.push_back({flag.name, flag.current_value});
+ }
+ }
+
+ if (!FLAGS_pcsort.empty() &&
+ (FLAGS_pcsort != "simple_sort" && FLAGS_pcsort != "sort" && FLAGS_pcsort != "no_sort")) {
+ slog::err << "Invalid -pcsort option: " << FLAGS_pcsort << ". Please use one of sort, simple_sort, no_sort."
+ << slog::endl;
+ return 1;
+ }
+
+ statistics =
+ std::make_shared<StatisticsReport>(StatisticsReport::Config{FLAGS_save_run_summary, FLAGS_report_folder});
+ statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments);
+ }
+
+ /** This vector stores paths to the processed images **/
+ auto multi_input_files = VectorMap<std::vector<std::string>>(
+ SplitMultiInputFilesArguments(net_size), // get input directory list
+ [&](const std::vector<std::string>& input_args) mutable {
+ std::vector<std::string> files;
+ for (auto& input_arg : input_args) {
+ // Test if the path exists
+ if (!ExistsTest(input_arg)) {
+ slog::err << "Specified image path: " << input_arg << " does not exist" << slog::endl;
+ throw std::logic_error("Image path does not exist");
+ }
+ // Test whether the path can be opened if it's a directory
+ DirOpenTest(input_arg);
+ readInputFilesArguments(files, input_arg);
+ }
+ return files;
+ });
+
+ if (multi_input_files.size() == 0) {
+ // failed to read input files
+ slog::err << "Failed to read input files" << slog::endl;
+ return 1;
+ }
+
+ if (FLAGS_nstreams.empty()) {
+ slog::warn << "-nstreams default value is determined automatically for a device. " << slog::endl;
+ std::cout << "\tAlthough the automatic selection usually provides a reasonable performance, \n"
+ << "\tbut it still may be non-optimal for some cases, for more information look at README."
+ << std::endl;
+ }
+
+#ifdef DISABLE_JIT
+ if (!network_file_flag.empty()) {
+ slog::err << "Runtime compiled without support for Just-in-Time (JIT) execution!" << slog::endl
+ << "Either specify a compiled model using -cm <compiled_model.bin> "
+ << "or recompile the runtime without the -disable_jit flag." << slog::endl;
+ return 1;
+ }
+#endif
+
+ uint32_t num_batches = 1;
+
+ // ----------------- 2. Loading OpenVINO Runtime/Inference Engine
+ // -----------------------------------------------------------
+ next_step();
+
+ // Get optimal runtime parameters for device
+ std::string device_name = FLAGS_d;
+ if (is_model_compiled) {
+ auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP); // separate each AOT file path
+ for (auto& compiled_graph : compiled_graph_paths) {
+ std::filebuf obj_file_buf;
+ // There does not seem to be a way to get the device from the OpenVINO executable network
+ // Instead we manually read through the xml header in the AOT graph to get the device name (an ugly hack
+ // unfortunately)
+ obj_file_buf.open(compiled_graph.c_str(), std::ios::in | std::ios::binary);
+ std::istream obj_istream(&obj_file_buf);
+ std::string xml_header, current_device;
+ getline(obj_istream, xml_header); // retrieve xml header from AOT bin file
+ if (xml_header.find("TARGET_FALLBACK") != std::string::npos) { // uses hetero plugin
+ int start_index = xml_header.find("TARGET_FALLBACK") + 24;
+ int end_index = xml_header.find("</hetero_config>") - 3;
+ current_device =
+ "HETERO:" + xml_header.substr(start_index, end_index - start_index); // get device from xml header
+ } else {
+ current_device = "FPGA";
+ }
+ if (device_name == "") { // device flag not specified in AOT flow
+ device_name = current_device;
+ } else {
+ if (current_device != device_name) { // print error for non-matching devices
+ throw std::logic_error(
+ "The AOT file does not target the expected device. "
+ "The device specified to dla_benchmark using the -d flag must be the same as the "
+ "device specified to dla_compiler using the --fplugin flag.");
+ }
+ }
+ }
+ } else {
+ if (device_name == "") device_name = "CPU"; // default device for JIT flow is CPU
+ }
+ ov::Core core(FLAGS_plugins);
+
+ if (device_name.find("CPU") != std::string::npos) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::cpu_used.name(), true}});
+ }
+
+ if (arch_file_flag != "" && device_name.find("FPGA") != std::string::npos) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::arch_path.name(), arch_file_flag}});
+ if (!ExistsTest(arch_file_flag)) {
+ slog::err << "architecture file: " << arch_file_flag << " doesn't exist. Please provide a valid path."
+ << slog::endl;
+ throw std::logic_error("architecture file path does not exist.");
+ }
+ if (FLAGS_encryption_key != "") {
+ core.set_property("FPGA", {{DLIAPlugin::properties::encryption_key.name(), FLAGS_encryption_key}});
+ }
+ if (FLAGS_encryption_iv != "") {
+ core.set_property("FPGA", {{DLIAPlugin::properties::encryption_iv.name(), FLAGS_encryption_iv}});
+ }
+ // If emulator is used, do not perform decryption of compiled results in the import step
+ if (FLAGS_emulator_decryption) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::emulator_decryption.name(), CONFIG_VALUE(YES)}});
+ }
+ if (FLAGS_min_subgraph_layers < 1) {
+ slog::err << "-min-subgraph-layers must be >= 1" << slog::endl;
+ return 1;
+ }
+ core.set_property("FPGA", {{DLIAPlugin::properties::min_subgraph_layers.name(), FLAGS_min_subgraph_layers}});
+ }
+
+ if (device_name.find("CPU") != std::string::npos && !FLAGS_l.empty()) {
+ // CPU extensions is loaded as a shared library and passed as a pointer to base extension
+ core.add_extension(FLAGS_l);
+ slog::info << "CPU extensions is loaded " << FLAGS_l << slog::endl;
+ }
+
+ slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl;
+ slog::info << "Device info: " << core.get_versions(device_name) << slog::endl;
+
+ // ----------------- 3. Setting device configuration -----------------------------------------------------------
+ next_step();
+
+ auto devices = ParseDevices(device_name);
+ std::map<std::string, uint32_t> device_nstreams = ParseNStreamsValuePerDevice(devices, FLAGS_nstreams);
+ for (auto& pair : device_nstreams) {
+ auto key = std::string(pair.first + "_THROUGHPUT_STREAMS");
+ std::vector<std::string> supported_config_keys =
+ core.get_property(pair.first, METRIC_KEY(SUPPORTED_CONFIG_KEYS)).as<std::vector<std::string>>();
+ if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) {
+ throw std::logic_error(
+ "Device " + pair.first + " doesn't support config key '" + key + "'! " +
+ "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>");
+ }
+ }
+
+ // pc is for CPU only at the moment
+ bool perf_count = FLAGS_pc;
+ std::string perf_count_sort = FLAGS_pcsort;
+ for (auto& device : devices) {
+ if (device == "CPU") { // CPU supports few special performance-oriented keys
+ if (perf_count || !perf_count_sort.empty()) {
+ core.set_property("CPU", {{CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(YES)}});
+ }
+ // limit threading for CPU portion of inference
+ if (FLAGS_nthreads != 0)
+ core.set_property(device, {{CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads)}});
+ core.set_property(device, {{CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin}});
+ // Set CPU to optimize throughput
+ core.set_property(device, ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT));
+ // for CPU execution, more throughput-oriented execution via streams
+ if (FLAGS_api == "async") {
+ core.set_property(
+ device,
+ ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device))
+ : ov::streams::AUTO));
+ }
+ device_nstreams[device] = core.get_property(device, ov::streams::num);
+ } else if (device == ("GPU")) {
+ if (FLAGS_api == "async") {
+ core.set_property(
+ device,
+ ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device))
+ : ov::streams::AUTO));
+ }
+ device_nstreams[device] = core.get_property(device, ov::streams::num);
+ }
+ }
+
+ auto double_to_string = [](const double number) {
+ std::stringstream ss;
+ ss << std::fixed << std::setprecision(4) << number;
+ return ss.str();
+ };
+ auto get_total_ms_time = [](Time::time_point& start_time) {
+ return std::chrono::duration_cast<ns>(Time::now() - start_time).count() * 0.000001;
+ };
+
+ size_t batch_size = batch_size_flag;
+ std::vector<std::string> topology_names;
+ ov::element::Type precision = ov::element::undefined;
+    // Vector stores which model (multigraph); InputsInfo is a map of input names and their respective
+    // input information
+ std::vector<dla_benchmark::InputsInfo> input_infos;
+ if (!is_model_compiled) {
+#ifndef DISABLE_JIT
+ // We choose to ifdef out this block of code because it's more readable than
+ // pulling the block in the "else" out using ifdefs
+ // ----------------- 4. Reading the Intermediate Representation network ----------------------------------------
+ next_step();
+
+ LOG_AND_PRINT(Logger::INFO, "Loading network files\n");
+
+ auto start_time_read = Time::now();
+ // get list of graphs
+ std::vector<std::shared_ptr<ov::Model>> models =
+ VectorMap<std::shared_ptr<ov::Model>>(split(network_file_flag, MULTIGRAPH_SEP), [&](const std::string& m) {
+ std::shared_ptr<ov::Model> model = core.read_model(m);
+ // Assign rt info IMMEDIATELY when DLA benchmark reads the model.
+ // Applying transformations or reshaping may change node names.
+ // Mixed Precision is an EA only feature for 2024.2
+ if (FLAGS_enable_early_access) {
+ for (auto&& node : model->get_ops()) {
+ if (dla::util::NodeTypeUsesPE(node->get_type_name())) {
+ node->get_rt_info()[DLA_PE_PRECISION_MODE] =
+ dla::util::ParseNodeForRTInfo(node->get_friendly_name(), DLA_PE_PRECISION_MODE);
+ }
+ }
+ }
+ printInputAndOutputsInfoShort(*model);
+ return model;
+ });
+
+ auto duration_ms = double_to_string(get_total_ms_time(start_time_read));
+ slog::info << "Read network(s) took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"read network time (ms)", duration_ms}});
+
+ // ----------------- 5. Resizing network to match image sizes and given batch ----------------------------------
+ next_step();
+
+ for (size_t i = 0; i < models.size(); i++) {
+ const auto& model_inputs = std::const_pointer_cast<const ov::Model>(models[i])->inputs();
+ bool reshape = false;
+ input_infos.push_back(
+ GetInputsInfo(batch_size, model_inputs, reshape, FLAGS_bin_data, FLAGS_mean_values, FLAGS_scale_values));
+ if (reshape) {
+ dla_benchmark::PartialShapes shapes = {};
+ for (auto& item : input_infos.back()) shapes[item.first] = item.second.partial_shape;
+ slog::info << "Reshaping model to batch: " << batch_size << slog::endl;
+ models[i]->reshape(shapes);
+ }
+ topology_names.push_back(models[i]->get_friendly_name());
+ }
+
+ // ----------------- 6. Configuring input and output
+ // ----------------------------------------------------------------------
+ next_step();
+ // Set input layouts for all models and their inputs
+ size_t input_info_idx = 0;
+ for (std::shared_ptr<ov::Model> model : models) {
+ auto preproc = ov::preprocess::PrePostProcessor(model);
+ const auto& inputs = model->inputs();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ ov::preprocess::InputInfo& input_info = preproc.input(i);
+ const size_t input_rank = inputs[i].get_partial_shape().size();
+ const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(input_rank));
+ const ov::element::Type_t type = input_infos[input_info_idx].at(inputs[i].get_any_name()).type;
+ input_info.tensor().set_element_type(type).set_layout(layout);
+ }
+
+ const auto& outputs = model->outputs();
+ for (size_t i = 0; i < outputs.size(); i++) {
+ const size_t output_rank = outputs[i].get_partial_shape().size();
+ const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(output_rank));
+ preproc.output(i).tensor().set_element_type(ov::element::f32).set_layout(layout);
+ }
+ // Once the build() method is called, the pre(post)processing steps
+ // for layout and precision conversions are inserted automatically
+ model = preproc.build();
+ input_info_idx++;
+ }
+ // ----------------- 7. Loading the model to the device --------------------------------------------------------
+ next_step();
+
+ // Get the value from the command line arguments (if the command line argument wasn't
+ // used by the user the default value set in dla_benchmark.hpp will be used)
+ int folding_option = FLAGS_folding_option;
+ bool fold_preprocessing = FLAGS_fold_preprocessing;
+ bool estimate_per_layer = FLAGS_estimate_per_layer_latencies;
+ bool enable_early_access = FLAGS_enable_early_access;
+ // TODO(arooney): Remove this once LT hang is fixed.
+ bool multi_infer_req = false;
+ if (FLAGS_nireq > 1 && FLAGS_api == "async") {
+ multi_infer_req = true;
+ }
+
+ core.set_property("FPGA", {{DLIAPlugin::properties::folding_option.name(), std::to_string(folding_option)}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::fold_preprocessing.name(), fold_preprocessing}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::per_layer_estimation.name(), estimate_per_layer}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::enable_early_access.name(), enable_early_access}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::multiple_inferences.name(), multi_infer_req}});
+ core.set_property("FPGA", {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});
+
+ auto start_time = Time::now();
+ auto individual_start_time = Time::now(); // timer for each individual graph loading
+ compiled_models = VectorMap<ov::CompiledModel*>(models, [&](std::shared_ptr<ov::Model> model) {
+ // Apply Low Precision transformations to handle quantized graphs
+ // Mohamed_I: currently, this only works if the entire graph fits on the FPGA
+ // because the CPU plugin calls common_optimizations again which has some transformations
+ // that cause the graph to fail (I suspect it's the ConvolutionMultiplyFusion, but I
+ // cannot disable it from the CPU)
+
+ bool FPGA_used = device_name.find("FPGA") != std::string::npos;
+ bool CPU_used = device_name.find("CPU") != std::string::npos;
+
+ ov::AnyMap config;
+ config.emplace(DLIAPlugin::properties::cpu_used.name(), CPU_used);
+ config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
+ config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);
+
+ for (auto&& node : model->get_ops()) {
+ if (std::string("FakeQuantize") == node->get_type_name()) {
+ config.emplace(DLIAPlugin::properties::apply_low_precision_transforms.name(), true);
+ if (CPU_used && FPGA_used) {
+ std::cerr << "ERROR: Quantized graphs only supported through HETERO:FPGA or CPU." << std::endl;
+ throw std::logic_error("HETERO:FPGA,CPU plugin is not supported for quantization.");
+ }
+ }
+ }
+
+ auto compiled_model = new ov::CompiledModel();
+ *compiled_model = core.compile_model(model, device_name, config);
+ duration_ms = double_to_string(get_total_ms_time(individual_start_time));
+ individual_start_time = Time::now();
+ slog::info << "Compile model ( " << model->get_friendly_name() << " ) took " << duration_ms << " ms"
+ << slog::endl;
+ return compiled_model;
+ });
+ duration_ms = double_to_string(get_total_ms_time(start_time));
+ slog::info << "Load network(s) took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"load network time (ms)", duration_ms}});
+#endif
+ } else {
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ // ----------------- 7. Loading the model to the device --------------------------------------------------------
+ next_step();
+ auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+ compiled_models = vectorMapWithIndex<ov::CompiledModel*>(
+ split(FLAGS_cm, MULTIGRAPH_SEP), // get a list of compiled graphs
+ [&](const std::string& compiled_graph_path, size_t index) {
+ std::stringstream generated_name;
+ generated_name << "Graph_" << index;
+ slog::info << "Importing model from " << compiled_graph_paths[index] << " to " << device_name << " as "
+ << generated_name.str() << slog::endl;
+ auto start_time = Time::now();
+ std::ifstream model_stream(compiled_graph_paths[index].c_str(), std::ios_base::in | std::ios_base::binary);
+ if (!model_stream.is_open()) {
+ throw std::runtime_error("Cannot open compiled model file: " + compiled_graph_paths[index]);
+ }
+ auto compiled_model = new ov::CompiledModel();
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});
+ // Import specific configs
+ ov::AnyMap config;
+ config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
+ config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);
+ *compiled_model = core.import_model(model_stream, device_name, config);
+ topology_names.push_back(generated_name.str());
+ model_stream.close();
+ printInputAndOutputsInfoShort(*compiled_model);
+ auto duration_ms = double_to_string(get_total_ms_time(start_time));
+ slog::info << "Import model took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"import model time (ms)", duration_ms}});
+ if (batch_size == 0) {
+ batch_size = 1;
+ }
+ const auto& inputs = compiled_model->inputs();
+ for (const auto& item : inputs) {
+ const auto& shape = item.get_shape();
+ if (shape[0] != batch_size) {
+ slog::err << "Batch size of the compiled model is " << shape[0] << " and batch size provided is "
+ << batch_size << slog::endl;
+ std::cout << "Set the same batch size = " << shape[0] << " when running the app" << std::endl;
+ std::cout << "Or recompile model with batch size = " << batch_size << std::endl;
+ exit(5);
+ }
+ }
+ bool reshape_required = false;
+ input_infos.push_back(GetInputsInfo(batch_size,
+ compiled_model->inputs(),
+ reshape_required,
+ FLAGS_bin_data,
+ FLAGS_mean_values,
+ FLAGS_scale_values));
+ return compiled_model;
+ });
+ }
+ // ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
+ next_step();
+
+ // Number of requests
+ uint32_t nireq = FLAGS_nireq;
+#if defined(__arm__) || defined(__aarch64__)
+  // In OpenVINO 2022.3 Arm plugin, when an AOT graph is compiled on CPU and dla_benchmark has -nireq > 1
+ // the program will be killed. We force nireq = 1 for HETERO:CPU graph only.
+ // Note: -d CPU doesn't need to be checked for AOT because dlac does not support -fplugin CPU.
+ if (device_name == "HETERO:CPU" && nireq > 1) {
+ slog::warn << "-nireq > 1 is not supported for HETERO:CPU graph. Forcing -nireq = 1" << slog::endl;
+ nireq = 1;
+ }
+
+#endif
+
+ if (nireq == 0) {
+ if (FLAGS_api == "sync") {
+ nireq = 1;
+ } else {
+ try {
+ nireq = 0;
+ for (auto& compiled_model : compiled_models) {
+ auto req = compiled_model->get_property(ov::optimal_number_of_infer_requests);
+ if (nireq == 0 || nireq > req) nireq = req;
+ }
+ } catch (const std::exception& ex) {
+ throw ov::Exception("Every device used with the dla_benchmark should support " +
+ std::string(ov::optimal_number_of_infer_requests.name()) +
+ " Failed to query the metric for the " + device_name + " with error: " + ex.what());
+ }
+ }
+ }
+#ifdef MAX_NUM_INFERENCE_REQUEST
+ if (nireq > MAX_NUM_INFERENCE_REQUEST) {
+ slog::warn << "-nireq > "<< MAX_NUM_INFERENCE_REQUEST << " is not supported for the underlying device. Forcing -nireq = 1" << slog::endl;
+ nireq = 1;
+ }
+#endif
+
+ // Iteration limit
+ uint32_t niter = FLAGS_niter;
+ if (niter > 0) {
+ // Round up niter to a multiple of nireq
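+      // e.g. (illustrative) with -niter 10 and nireq 4, niter is rounded up to 12 so that
+      // every request executes the same number of iterations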
+ niter = ((niter + nireq - 1) / nireq) * nireq;
+      // We previously checked that FLAGS_niter > 0, so it is okay to cast to uint.
+ if (static_cast<uint32_t>(FLAGS_niter) != niter) {
+ slog::warn << "Number of iterations was aligned by request number from " << FLAGS_niter << " to " << niter
+ << " using number of requests " << nireq << slog::endl;
+ }
+      num_batches = niter;
+    }
+
+ // Graph-request limit on device
+ if (device_name.find("FPGA") != std::string::npos) {
+ int ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
+ int numOutstandingInferRequest = nireq * net_size / ip_num_instances;
+ int maxOutstandingInferRequest = core.get_property("FPGA", "COREDLA_DMA_CSR_DESCRIPTOR_QUEUE_SIZE").as<int>();
+ if (maxOutstandingInferRequest > 0 && numOutstandingInferRequest > maxOutstandingInferRequest) {
+ slog::err << "Possible number of outstanding inference requests per instance (" << numOutstandingInferRequest
+ << ") "
+ << "exceeds the CSR descriptor queue limit (" << maxOutstandingInferRequest << ")" << slog::endl;
+ return 1;
+ }
+ }
+
+ if (statistics) {
+ for (auto& topology_name : topology_names) {
+ statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+ {
+ {"topology", topology_name},
+ {"target device", device_name},
+ {"API", FLAGS_api},
+ {"precision", std::string(precision.get_type_name())},
+ {"batch size", std::to_string(batch_size)},
+ {"number of iterations", std::to_string(niter)},
+ {"number of parallel infer requests", std::to_string(nireq)},
+ });
+ }
+ for (auto& nstreams : device_nstreams) {
+ std::stringstream ss;
+ ss << "number of " << nstreams.first << " streams";
+ statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+ {
+ {ss.str(), std::to_string(nstreams.second)},
+ });
+ }
+ }
+
+ // ----------------- 9. Creating infer requests and filling input blobs ----------------------------------------
+ next_step();
+
+ // Data structure hierarchy
+ // Outermost vec: which model it corresponds to (multigraph)
+ // Map: input/output name and its corresponding TensorVector
+    // TensorVector: an alias for std::vector<ov::Tensor> where each vector element corresponds to one batch
+ std::vector<std::map<std::string, ov::TensorVector>> input_data_tensors;
+ std::vector<std::map<std::string, ov::TensorVector>> output_tensors(compiled_models.size());
+
+ std::vector<std::unique_ptr<InferRequestsQueue>> infer_request_queues;
+ const std::string resize_type = FLAGS_resize_type.empty() ? "resize" : FLAGS_resize_type;
+ for (size_t net_idx = 0; net_idx < compiled_models.size(); net_idx++) {
+ // Handle the case that use same inputs for all networks
+ const auto& inputFiles =
+ net_idx >= multi_input_files.size() ? multi_input_files.back() : multi_input_files[net_idx];
+ input_data_tensors.push_back(GetStaticTensors(inputFiles.empty() ? std::vector<std::string>{} : inputFiles,
+ batch_size,
+ input_infos[net_idx],
+ num_batches,
+ resize_type,
+ FLAGS_bgr,
+ FLAGS_bin_data,
+ FLAGS_verbose));
+ // Use unique_ptr to create InferRequestsQueue objects and avoid copying mutex and cv
+ infer_request_queues.push_back(
+ std::move(std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(*(compiled_models[net_idx]), nireq))));
+ }
+
+ // ----------------- 10. Measuring performance ------------------------------------------------------------------
+ size_t progress_bar_total_count = progressBarDefaultTotalCount;
+
+ std::stringstream ss;
+ ss << "Start inference " << FLAGS_api << "ronously";
+ if (FLAGS_api == "async") {
+ if (!ss.str().empty()) {
+ ss << ", ";
+ }
+ ss << infer_request_queues.size() * infer_request_queues.at(0)->requests.size() << " inference requests";
+ std::stringstream device_ss;
+ for (auto& nstreams : device_nstreams) {
+ if (!device_ss.str().empty()) {
+ device_ss << ", ";
+ }
+ device_ss << nstreams.second << " streams for " << nstreams.first;
+ }
+ if (!device_ss.str().empty()) {
+ ss << " using " << device_ss.str();
+ }
+ }
+ ss << ", limits: " << niter << " iterations with each graph, " << compiled_models.size() << " graph(s)";
+ progress_bar_total_count = niter;
+ next_step(ss.str());
+
+ /** Start inference & calculate performance **/
+    /** to align the number of iterations to guarantee that the last infer requests are executed in the same conditions **/
+ ProgressBar progress_bar(progress_bar_total_count, FLAGS_stream_output, FLAGS_progress);
+ std::vector<size_t> iterations(compiled_models.size(), 0);
+ try {
+ while ((niter != 0LL && iterations.back() < niter) || (FLAGS_api == "async" && iterations.back() % nireq != 0)) {
+        // set up all infer requests and prep all i/o blobs
+ for (size_t net_id = 0; net_id < compiled_models.size(); net_id++) {
+ for (size_t iireq = 0; iireq < nireq; iireq++) {
+ auto infer_request = infer_request_queues.at(net_id)->get_idle_request();
+ if (!infer_request) {
+ THROW_IE_EXCEPTION << "No idle Infer Requests!";
+ }
+
+ if (niter != 0LL) {
+ const auto& outputs = compiled_models[net_id]->outputs();
+ for (const auto& output : outputs) {
+ const std::string& name = output.get_any_name();
+ output_tensors.at(net_id)[name].emplace_back(output.get_element_type(), output.get_shape());
+ infer_request->set_tensor(output, output_tensors.at(net_id).at(name).at(iterations.at(net_id)));
+ }
+ const auto& inputs = compiled_models[net_id]->inputs();
+ for (auto& input : inputs) {
+ const std::string& name = input.get_any_name();
+ const auto& data = input_data_tensors.at(net_id).at(name)[iterations.at(net_id)];
+ infer_request->set_tensor(input, data);
+ }
+ }
+
+ // Execute one request/batch
+ if (FLAGS_api == "sync") {
+ infer_request->infer();
+ } else {
+            // As the inference request is currently idle, the wait() adds no additional overhead (and should return
+            // immediately). The primary reason for calling the method is exception checking/re-throwing. The callback
+            // that governs the actual execution can handle errors as well, but as it uses just error codes it has no
+            // details like the what() method of std::exception. So, rechecking for any exceptions here.
+ infer_request->wait();
+ infer_request->start_async();
+ }
+ iterations.at(net_id)++;
+ if (net_id == compiled_models.size() - 1) {
+ progress_bar.addProgress(1);
+ }
+ }
+ }
+ }
+
+    // wait for the latest inference executions
+ for (auto& infer_request_queue : infer_request_queues) {
+ infer_request_queue->wait_all();
+ }
+ } catch (const std::exception& ex) {
+ slog::err << "Inference failed:" << slog::endl;
+ slog::err << ex.what() << slog::endl;
+ ReadDebugNetworkInfo(core);
+ PrintLSUCounterInfo(core);
+ // Instead of setting return_code = 1 and continuing, exit immediately.
+ // High risk of segfaulting / weird behavior when inference fails.
+ return 1;
+ }
+
+ size_t iteration = iterations.back();
+
+ std::vector<double> all_latencies;
+ auto start_time = infer_request_queues.at(0)->get_start_time();
+ auto end_time = infer_request_queues.at(0)->get_end_time();
+ for (auto& infer_request_queue : infer_request_queues) {
+ auto& latencies = infer_request_queue->get_latencies();
+ all_latencies.insert(all_latencies.end(), latencies.begin(), latencies.end());
+ start_time = std::min(start_time, infer_request_queue->get_start_time());
+ end_time = std::max(end_time, infer_request_queue->get_end_time());
+ }
+ double latency = GetMedianValue<double>(all_latencies);
+ double total_duration = std::chrono::duration_cast<ns>(end_time - start_time).count() * 0.000001;
+ double total_fps = (FLAGS_api == "sync")
+ ? compiled_models.size() * batch_size * 1000.0 / latency
+ : compiled_models.size() * batch_size * 1000.0 * iteration / total_duration;
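+    // e.g. (illustrative) in async mode: 2 graphs, batch 4, 1000 iterations finished in 5000 ms of
+    // wall time gives 2 * 4 * 1000.0 * 1000 / 5000 = 1600 FPS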
+
+ int ip_num_instances = 0;
+ double ip_duration = 0.0;
+ double ip_fps = 0.0;
+ double ip_fps_per_fmax = 0.0;
+ double estimated_ipFps = 0.0;
+ double estimated_ipFpsPerFmax = 0.0;
+ double fmax_core = -1.0;
+ double estimated_ipFps_assumed_fmax = 0.0;
+ if (device_name.find("FPGA") != std::string::npos) {
+ ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
+ // even if hardware has 2 instances, only 1 instance actually gets used if only 1 inference is performed
+ size_t ip_num_instances_used = std::min((size_t)ip_num_instances, iteration);
+ ip_duration = core.get_property("FPGA", "IP_ACTIVE_TIME").as<double>();
+    if (ip_duration != 0.0) {
+      ip_fps = (FLAGS_api == "sync")
+                   ? compiled_models.size() * batch_size * 1000.0 / latency / ip_num_instances_used
+                   : compiled_models.size() * batch_size * 1000.0 * iteration / ip_duration / ip_num_instances_used;
+ fmax_core = core.get_property("FPGA", "COREDLA_CLOCK_FREQUENCY").as<double>();
+ if (fmax_core > 0.0) {
+ ip_fps_per_fmax = ip_fps / fmax_core;
+ } else {
+ slog::warn << "Warning: could not estimate clk_dla frequency on the FPGA" << slog::endl;
+ }
+ }
+
+ if (FLAGS_perf_est && (device_name.find("FPGA") != std::string::npos)) {
+ if (is_model_compiled) {
+ // Ahead of Time Flow: getting the imported, precalculated performance estimate
+ estimated_ipFps = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST").as<double>();
+ if (estimated_ipFps < 0)
+ slog::warn << "Missing performance estimation from at least one of the compiled graphs" << slog::endl;
+ estimated_ipFps_assumed_fmax = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST_ASSUMED_FMAX").as<double>();
+ } else {
+#ifndef DISABLE_JIT
+ // Just In Time Flow: running the performance estimate
+ if (fmax_core > 0.0) {
+#if defined(_WIN32) || defined(_WIN64)
+ _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str());
+ _putenv_s("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str());
+#else
+ setenv("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str(), true);
+ setenv("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str(), true);
+#endif
+ estimated_ipFps_assumed_fmax = fmax_core;
+ } else {
+// If fmax_core is not set, we fall back to the estimated fmax values for AGX7 and A10.
+// This branch is defensive programming for a condition that should not happen.
+#ifdef DE10_AGILEX
+ estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 500); // AGX7 fMAX estimate
+#else
+ estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 265); // A10 fMAX estimate
+#endif
+ slog::warn
+              << "Warning: could not estimate clk_dla frequency on the FPGA, setting the fmax to a default value."
+ << slog::endl;
+#if defined(_WIN32) || defined(_WIN64)
+ _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str());
+ _putenv_s("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str());
+#else
+ setenv("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true);
+ setenv("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true);
+#endif
+ }
+ estimated_ipFps = core.get_property("FPGA", "PLUGIN_PERFORMANCE_EST").as<double>();
+#endif
+ }
+ estimated_ipFpsPerFmax = estimated_ipFps / estimated_ipFps_assumed_fmax;
+ }
+ }
+
+ if (statistics) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"total execution time (ms)", double_to_string(total_duration)},
+ {"IP active time (ms)", double_to_string(ip_duration)},
+ {"total number of iterations", std::to_string(iteration)},
+ });
+ if (device_name.find("MULTI") == std::string::npos) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"latency (ms)", double_to_string(latency)},
+ });
+ }
+ statistics->addParameters(
+ StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"throughput", double_to_string(total_fps)}, {"IP throughput", double_to_string(ip_fps)}});
+ }
+
+ progress_bar.finish();
+
+ // ----------------- 11. Dumping statistics report -------------------------------------------------------------
+ next_step();
+
+ if (perf_count || !perf_count_sort.empty()) {
+ std::vector<std::vector<ov::ProfilingInfo>> perfCounts;
+ for (size_t ireq = 0; ireq < nireq; ireq++) {
+ auto reqPerfCounts = infer_request_queues.at(0)->requests[ireq]->get_performance_counts();
+ perfCounts.push_back(reqPerfCounts);
+ }
+ if (statistics) {
+ if (perf_count_sort == "sort") {
+ statistics->printPerfCountersSort(perfCounts, "sort");
+ } else if (perf_count_sort == "simple_sort") {
+ statistics->printPerfCountersSort(perfCounts, "simple_sort");
+ } else {
+ statistics->printPerfCountersSort(perfCounts, "no_sort");
+ }
+ }
+ }
+
+  // dla_benchmark originally also implemented more detailed performance statistics via InferRequest's
+  // getPerformanceCounts function. That support was removed. If we want to re-implement it, the latest
+  // version of OpenVINO's benchmark_app or our git history would be a good starting point.
+ if (statistics) {
+ statistics->dump();
+ }
+
+ std::cout << "count: " << iteration << " iterations" << std::endl;
+ std::cout << "system duration: " << double_to_string(total_duration) << " ms" << std::endl;
+ if (ip_duration != 0.0) std::cout << "IP duration: " << double_to_string(ip_duration) << " ms" << std::endl;
+ if (device_name.find("MULTI") == std::string::npos)
+ std::cout << "latency: " << double_to_string(latency) << " ms" << std::endl;
+ std::cout << "system throughput: " << double_to_string(total_fps) << " FPS" << std::endl;
+ if (ip_num_instances != 0) std::cout << "number of hardware instances: " << ip_num_instances << std::endl;
+ if (compiled_models.size() != 0)
+ std::cout << "number of network instances: " << compiled_models.size() << std::endl;
+ if (ip_fps != 0.0) std::cout << "IP throughput per instance: " << double_to_string(ip_fps) << " FPS" << std::endl;
+ if (ip_fps_per_fmax != 0.0)
+ std::cout << "IP throughput per fmax per instance: " << double_to_string(ip_fps_per_fmax) << " FPS/MHz"
+ << std::endl;
+ if (fmax_core > 0.0) std::cout << "IP clock frequency: " << double_to_string(fmax_core) << " MHz" << std::endl;
+ if (estimated_ipFps != 0.0)
+ std::cout << "estimated IP throughput per instance: " << double_to_string(estimated_ipFps) << " FPS ("
+ << (int)estimated_ipFps_assumed_fmax << " MHz assumed)" << std::endl;
+ if (estimated_ipFpsPerFmax != 0.0)
+ std::cout << "estimated IP throughput per fmax per instance: " << double_to_string(estimated_ipFpsPerFmax)
+ << " FPS/MHz" << std::endl;
+
+ // ----------------- 12. Dumping output values -------------------------------------------------------------
+ next_step();
+
+ if (FLAGS_dump_output) {
+ for (size_t i = 0; i < compiled_models.size(); i++) {
+ std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
+ // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files
+ std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
+ const auto& output_tensors_map = output_tensors[i];
+      // Flags tracking whether the output tensor (and its layout info) can be dumped to a text file.
+      // They start as true and may be cleared during dumping, e.g. for an unsupported layout or an oversized tensor.
+ bool can_dump_txt = true;
+ bool can_dump_layout_info_in_txt = true;
+ // dump output tensor as bin, which can be loaded using Python Numpy
+ std::regex pattern("\\{batch\\}");
+ std::string results_bin_file_name = output_dir + "result_{batch}.bin";
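+      // e.g. the {batch} placeholder is replaced with the batch index below, producing result_0.bin, result_1.bin, ...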
+ // dump output tensor as text
+ // backward compatibility support for old regtests that used only one graph
+ std::string results_txt_file_name = output_dir + "result.txt";
+ std::string results_boundaries_file_name = output_dir + "result_tensor_boundaries.txt";
+ // dump inference arguments and metadata as JSON
+ std::string results_meta_file_name = output_dir + "result_meta.json";
+
+ if (compiled_models.size() > 1) {
+ results_bin_file_name = output_dir + topology_names[i] + "_result_{batch}.bin";
+ results_txt_file_name = output_dir + topology_names[i] + "_result.txt";
+ results_boundaries_file_name = output_dir + topology_names[i] + "_result_tensor_boundaries.txt";
+ results_meta_file_name = output_dir + topology_names[i] + "_result_meta.json";
+ }
+
+ slog::info << "Dumping result of " << topology_names[i]
+ << " to " << results_txt_file_name << slog::endl;
+ slog::info << "Dumping per-batch result (raw output) of " << topology_names[i]
+ << " to " << results_bin_file_name << slog::endl;
+ slog::info << "Dumping inference meta data of " << topology_names[i]
+ << " to " << results_meta_file_name << slog::endl;
+
+ std::ofstream result_txt_file(results_txt_file_name);
+ std::ofstream results_boundaries(results_boundaries_file_name);
+ std::ofstream result_meta_file(results_meta_file_name);
+
+ dla_benchmark::InferenceMetaData result_metadata;
+ result_metadata.input_files = multi_input_files.at(i); // all input files in -i
+ result_metadata.groundtruth_loc = FLAGS_groundtruth_loc;
+ result_metadata.batch_size = FLAGS_batch_size;
+ result_metadata.niter = niter;
+ result_metadata.nireq = nireq;
+ result_metadata.model_input_info = input_infos[i];
+ dla_benchmark::OutputsInfoVec model_output_info;
+
+ uint32_t current_lines = 1;
+ size_t max_allowed_megabytes_to_dump = FLAGS_max_output_file_size;
+
+ for (uint32_t batch = 0; batch < num_batches; batch++) {
+ std::string per_batch_results_bin_file_name = std::regex_replace(results_bin_file_name,
+ pattern,
+ std::to_string(batch));
+ std::ofstream per_batch_results_bin_file(per_batch_results_bin_file_name, std::ios::binary);
+
+ for (const auto& item : output_info) {
+ auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
+ unsigned int output_size = tensor.get_size() / batch_size;
+
+ const ov::Layout& layout = ov::layout::get_layout(item);
+ const auto& shape = tensor.get_shape();
+ const std::string& name = item.get_any_name();
+ size_t total_bytes_to_dump = tensor.get_size() * niter * sizeof(float);
+
+ if (can_dump_txt) {
+ // if we cannot dump as a text file, we set can_dump_txt flag to false and write the one-time message
+ if (total_bytes_to_dump > max_allowed_megabytes_to_dump * BYTE_TO_MEGABYTE) {
+ can_dump_txt = false;
+              std::string msg = "Output tensor (" + std::to_string(total_bytes_to_dump / BYTE_TO_MEGABYTE) +
+                                " MB) "
+                                "is too large to dump. Change the environment variable MAX_DUMP_OUTPUT_TXT (default " +
+                                std::to_string(FLAGS_max_output_file_size) + " MB) to allow dumping larger tensors";
+ slog::warn << msg << slog::endl;
+ result_txt_file << msg;
+ } else {
+ if (can_dump_layout_info_in_txt && shape.size() != 2 && shape.size() != 4 && shape.size() != 5) {
+ can_dump_layout_info_in_txt = false;
+                slog::warn << "Output data tensor has a rank other than 2, 4 or 5. Layout info will not be dumped in "
+                           << "result.txt." << slog::endl;
+ }
+ // Otherwise, dump text and write to the result_tensor_boundaries.txt with additional information
+ // about the result.txt file
+ results_boundaries << name << ": Line " << current_lines << " to "
+ << "line " << current_lines + output_size - 1 << std::endl;
+ results_boundaries << name << " output layout: " << layout.to_string() << std::endl;
+ results_boundaries << name << " output dimension:";
+ for (unsigned int dim = 0; dim < shape.size(); dim++) {
+ results_boundaries << " " << shape[dim];
+ }
+ results_boundaries << std::endl;
+ current_lines = current_lines + output_size;
+ DumpResultTxtFile(tensor, item, output_size, result_txt_file);
+ }
+ }
+ DumpResultBinFile(tensor, per_batch_results_bin_file);
+
+ if (batch == 0) {
+ // all batches should have the same output info
+ dla_benchmark::OutputInfo output_info;
+ output_info.name = name;
+ output_info.shape = shape;
+ model_output_info.push_back(output_info);
+ }
+ }
+ per_batch_results_bin_file.close();
+ }
+
+ result_metadata.model_output_info = model_output_info;
+ DumpResultMetaJSONFile(result_metadata, result_meta_file);
+ result_txt_file.close();
+ results_boundaries.close();
+ result_meta_file.close();
+ }
+ const std::string throughput_file_name = output_dir + "throughput_report.txt";
+ std::ofstream throughput_file;
+ throughput_file.open(throughput_file_name);
+ throughput_file << "Throughput : " << total_fps << " fps" << std::endl;
+ throughput_file << "Batch Size : " << batch_size << std::endl;
+ throughput_file << "Graph number : " << compiled_models.size() << std::endl;
+ throughput_file << "Num Batches : " << num_batches << std::endl;
+ throughput_file.close();
+
+ // Append throughput to dataset
+ // Check both gz and non gz versions
+ std::string dataset_gz_file_name = "data.csv.gz";
+ append_value_if_incomplete_to_csv(dataset_gz_file_name, ip_fps);
+ std::string dataset_file_name = "data.csv";
+ append_value_if_incomplete_to_csv(dataset_file_name, ip_fps);
+ }
+
+ // Calculate top 1, top 5 results
+ if (FLAGS_groundtruth_loc != "") {
+ auto groundtruth_files = split(FLAGS_groundtruth_loc, MULTIGRAPH_SEP);
+ for (size_t i = 0; i < compiled_models.size(); i++) {
+ // This flag `FLAGS_enable_object_detection_ap` enables accuracy checking subroutine that
+ // gives the mAP and COCO AP scores. These scores are two of the main detection evaluation
+ // metrics used in the Common Objects in Context contest, https://cocodataset.org/#detection-eval.
+
+ std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
+ // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files
+ std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
+ // Run the default top-1, top-5 evaluation routine if AP scores are not required.
+ if (!FLAGS_enable_object_detection_ap) {
+ if (groundtruth_files.size() <= i) {
+ slog::warn << "Missing ground truth file for " << topology_names[i] << "! SKIPPED" << slog::endl;
+ continue; // Print warnings for all missing ground truth graphs;
+ }
+ slog::info << "Comparing ground truth file " << groundtruth_files[i] << " with network " << topology_names[i]
+ << slog::endl;
+ // captures the results in higher precision for accuracy analysis
+ std::vector<float> results;
+ const auto& output_tensors_map = output_tensors[i];
+ for (uint32_t batch = 0; batch < num_batches; batch++) {
+ for (unsigned int img = 0; img < batch_size; img++) {
+ for (const auto& item : output_info) {
+ auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
+ auto tensor_data = tensor.data<float>();
+ unsigned int output_size = tensor.get_size() / batch_size;
+ size_t offset = img * output_size;
+ for (unsigned int j = 0; j < output_size; j++) {
+ results.push_back(tensor_data[j + offset]);
+ }
+ }
+ }
+ }
+ bool passed = TopResultsAnalyser::get_top_results(groundtruth_files[i], results, batch_size * num_batches);
+ if (passed) {
+ slog::info << "Get top results for \"" << topology_names[i] << "\" graph passed" << slog::endl;
+ } else {
+ // return 4 indicates that the accuracy of the result was below the threshold
+ return_code = 4;
+ }
+ } else {
+ // Runs the accuracy checking routine if AP scores are required.
+ set_runtime(FLAGS_yolo_version, FLAGS_niter, batch_size_flag, FLAGS_i, FLAGS_groundtruth_loc);
+ std::pair<double, double> res =
+ validate_yolo_wrapper(output_tensors[i], output_info, multi_input_files.at(0));
+ std::cout << std::endl;
+ slog::info << "Batch metrics results:" << slog::endl;
+ std::cout << "Detection - mAP@0.5: " << std::setprecision(6) << res.first * 100 << "%" << std::endl;
+ std::cout << "Detection - mAP@0.5:0.95: " << std::setprecision(6) << res.second * 100 << "%" << std::endl;
+ }
+ }
+ }
+ // Output Debug Network Info if COREDLA_TEST_DEBUG_NETWORK is set
+ ReadDebugNetworkInfo(core);
+ if (FLAGS_report_lsu_counters) {
+ PrintLSUCounterInfo(core);
+ }
+ if (return_code) return return_code;
+ } catch (const std::exception& ex) {
+ slog::err << ex.what() << slog::endl;
+
+ if (statistics) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"Error during dla_benchmark: ", ex.what()},
+ });
+ statistics->dump();
+ }
+
+ return 3;
+ }
+
+ return 0;
+ // Bypass long function lint check
+ // NOLINTNEXTLINE(readability/fn_size)
+}
diff --git a/python/openvino/runtime/dla_benchmark/progress_bar.hpp b/python/openvino/runtime/dla_benchmark/progress_bar.hpp
new file mode 100644
index 0000000..cb4459a
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/progress_bar.hpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+
+#include <samples/console_progress.hpp>
+
+/// @brief Responsible for progress bar handling within the dla_benchmark
+class ProgressBar {
+ public:
+ explicit ProgressBar(size_t totalNum, bool streamOutput = false, bool progressEnabled = false) {
+ _bar.reset(new ConsoleProgress(totalNum, streamOutput));
+ _streamOutput = streamOutput;
+ _isFinished = true;
+ _progressEnabled = progressEnabled;
+ }
+
+ void addProgress(size_t num) {
+ _isFinished = false;
+ if (_progressEnabled) {
+ _bar->addProgress(num);
+ }
+ }
+
+ void finish(size_t num = 0) {
+ if (num > 0) {
+ addProgress(num);
+ }
+ _isFinished = true;
+ _bar->finish();
+ if (_progressEnabled) {
+ std::cout << std::endl;
+ }
+ }
+
+ void newBar(size_t totalNum) {
+ if (_isFinished) {
+ _bar.reset(new ConsoleProgress(totalNum, _streamOutput));
+ } else {
+ throw std::logic_error("Cannot create a new bar. Current bar is still in progress");
+ }
+ }
+
+ private:
+ std::unique_ptr<ConsoleProgress> _bar;
+ bool _streamOutput;
+ bool _isFinished;
+ bool _progressEnabled;
+};
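+
+// Illustrative usage (hypothetical values):
+//   ProgressBar progress_bar(100, /*streamOutput=*/false, /*progressEnabled=*/true);
+//   progress_bar.addProgress(1);  // typically called once per completed iteration
+//   progress_bar.finish();        // prints the final state and a trailing newline when progress is enabled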
diff --git a/python/openvino/runtime/dla_benchmark/shared_tensor_allocator.hpp b/python/openvino/runtime/dla_benchmark/shared_tensor_allocator.hpp
new file mode 100644
index 0000000..f97c798
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/shared_tensor_allocator.hpp
@@ -0,0 +1,55 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <algorithm>
+#include "openvino/runtime/allocator.hpp"
+
+// Modified from SharedTensorAllocator in [openvinotoolkit/openvino ›
+// samples/cpp/benchmark_app/shared_tensor_allocator.hpp]
+class SharedTensorAllocator : public ov::AllocatorImpl {
+ public:
+ SharedTensorAllocator(size_t sizeBytes) : size(sizeBytes) { data = new char[size]; }
+
+ // Copy Constructor
+ SharedTensorAllocator(const SharedTensorAllocator& other) : size(other.size) {
+ data = new char[size];
+ std::copy(other.data, other.data + size, data);
+ }
+
+ // Copy Assignment Operator
+ SharedTensorAllocator& operator=(const SharedTensorAllocator& other) {
+ if (this != &other) {
+ size = other.size;
+ delete[] data;
+ data = new char[size];
+ std::copy(other.data, other.data + size, data);
+ }
+ return *this;
+ }
+
+ ~SharedTensorAllocator() { delete[] data; }
+
+ void* allocate(const size_t bytes, const size_t) override {
+ return bytes <= this->size ? (void*)data : nullptr;
+ }
+
+ void deallocate(void* handle, const size_t bytes, const size_t) override {
+ if (handle == data) {
+ delete[] data;
+ data = nullptr;
+ }
+ }
+
+ bool is_equal(const AllocatorImpl& other) const override {
+ auto other_tensor_allocator = dynamic_cast<const SharedTensorAllocator*>(&other);
+ return other_tensor_allocator != nullptr && other_tensor_allocator == this;
+ }
+
+ char* get_buffer() { return data; }
+
+ private:
+ char* data;
+ size_t size;
+};
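+
+// Illustrative usage sketch (hypothetical; mirrors how the benchmark_app sample this class was modified from
+// wraps the allocator so the same buffer can be written directly through get_buffer()):
+//   auto allocator = std::make_shared<SharedTensorAllocator>(byte_size);
+//   // ... wrap `allocator` in an ov::Allocator/ov::Tensor, then fill allocator->get_buffer() in place.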
diff --git a/python/openvino/runtime/dla_benchmark/statistics_report.cpp b/python/openvino/runtime/dla_benchmark/statistics_report.cpp
new file mode 100644
index 0000000..ce80a2e
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/statistics_report.cpp
@@ -0,0 +1,149 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: The file implements functions to dump inference performance statistics
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "statistics_report.hpp"
+
+static const char* STATUS_NAMES[] = {"NOT_RUN", "OPTIMIZED_OUT", "EXECUTED"};
+
+void StatisticsReport::addParameters(const Category& category, const Parameters& parameters) {
+ if (_parameters.count(category) == 0)
+ _parameters[category] = parameters;
+ else
+ _parameters[category].insert(_parameters[category].end(), parameters.begin(), parameters.end());
+}
+
+void StatisticsReport::dump() {
+ CsvDumper dumper(true, _config.report_folder + _separator + "dla_benchmark_run_summary.csv");
+
+ auto dump_parameters = [&dumper](const Parameters& parameters) {
+ for (auto& parameter : parameters) {
+ dumper << parameter.first << parameter.second;
+ dumper.endLine();
+ }
+ };
+ if (_parameters.count(Category::COMMAND_LINE_PARAMETERS)) {
+ dumper << "Command line parameters";
+ dumper.endLine();
+
+ dump_parameters(_parameters.at(Category::COMMAND_LINE_PARAMETERS));
+ dumper.endLine();
+ }
+
+ if (_parameters.count(Category::RUNTIME_CONFIG)) {
+ dumper << "Configuration setup";
+ dumper.endLine();
+
+ dump_parameters(_parameters.at(Category::RUNTIME_CONFIG));
+ dumper.endLine();
+ }
+
+ if (_parameters.count(Category::EXECUTION_RESULTS)) {
+ dumper << "Execution results";
+ dumper.endLine();
+
+ dump_parameters(_parameters.at(Category::EXECUTION_RESULTS));
+ dumper.endLine();
+ }
+
+ slog::info << "Run summary is saved to " << dumper.getFilename() << slog::endl;
+}
+
+void StatisticsReport::printPerfCountersSort(const std::vector<PerformanceCounters>& perfCounts, std::string sortFlag) {
+ for (size_t ni = 0; ni < perfCounts.size(); ni++) {
+ const auto& perf_counts = perfCounts[ni];
+ double total_time(0);
+ double total_time_cpu(0);
+ std::cout << "Performance counts sorted for " << ni << "-th infer request" << std::endl;
+ for (auto&& pi : perf_counts) {
+ total_time += pi.real_time.count();
+ total_time_cpu += pi.cpu_time.count();
+ }
+ auto total_real_time_proportion = 0.0;
+ std::vector<std::vector<std::string>> total_detail_data;
+ for (auto&& pi : perf_counts) {
+ auto node_name = pi.node_name;
+ std::string layer_status_str =
+ ((int)pi.status < (int)(sizeof(STATUS_NAMES) / sizeof(STATUS_NAMES[0])) ? STATUS_NAMES[(int)pi.status]
+ : "INVALID_STATUS");
+
+ auto layer_type = pi.node_type;
+ auto real_time = pi.real_time.count();
+ auto cpu_time = pi.cpu_time.count();
+ auto real_proportion = real_time / total_time;
+ auto execType = pi.exec_type;
+ std::vector<std::string> tmp_data{node_name,
+ layer_status_str,
+ std::string(layer_type),
+ std::to_string(real_time),
+ std::to_string(cpu_time),
+ std::to_string(real_proportion),
+ std::string(execType)};
+ total_detail_data.push_back(tmp_data);
+ total_real_time_proportion += real_proportion;
+ }
+    // sorted by real_time
+ if (sortFlag == "sort") {
+ std::sort(total_detail_data.begin(), total_detail_data.end(), [](const auto& a, const auto& b) {
+ return std::stod(a[3]) > std::stod(b[3]);
+ });
+    } else if (sortFlag == "no_sort") {
+      // no_sort: keep the original execution order
+ } else if (sortFlag == "simple_sort") {
+ std::sort(total_detail_data.begin(), total_detail_data.end(), [](const auto& a, const auto& b) {
+ return std::stod(a[3]) > std::stod(b[3]);
+ });
+ total_detail_data.erase(
+ std::remove_if(
+ total_detail_data.begin(), total_detail_data.end(), [](const auto& a) { return a[1] == "NOT_RUN"; }),
+ total_detail_data.end());
+ }
+ printDetailResult(total_detail_data);
+ // Save the current state of std::cout. This is to avoid coverity error.
+ std::ios_base::fmtflags f(std::cout.flags());
+
+    std::cout << "Total time: " << total_time / 1000 << " milliseconds" << std::endl;
+    std::cout << "Total CPU time: " << total_time_cpu / 1000 << " milliseconds" << std::endl;
+ std::cout << "Total proportion: " << std::fixed << std::setprecision(2) << round(total_real_time_proportion * 100)
+ << " % \n"
+ << std::endl;
+
+ // Restore the original state
+ std::cout.flags(f);
+ }
+}
+
+void StatisticsReport::printDetailResult(std::vector<std::vector<std::string>> result_list) {
+ const int max_layer_name_len = 50;
+ for (auto&& tmp_result : result_list) {
+ std::string node_name = tmp_result[0];
+ std::string node_name_truncated = node_name.substr(0, max_layer_name_len - 4);
+ if (node_name.length() >= max_layer_name_len) {
+ node_name_truncated += "...";
+ }
+ std::string layerStatus = tmp_result[1];
+ std::string layerType = tmp_result[2];
+ float real_time = std::stof(tmp_result[3]);
+ float cpu_time = std::stof(tmp_result[4]);
+ float proportion = std::stof(tmp_result[5]);
+ std::string execType = tmp_result[6];
+
+ std::printf(
+ "node: %-50s LayerStatus: %-15s LayerType: %-30s RealTime: %-20.3f CPUTime: %-20.3f Proportion: %-30.3f "
+ "ExecType: %-20s\n",
+ node_name_truncated.c_str(),
+ layerStatus.c_str(),
+ layerType.substr(0, max_layer_name_len).c_str(),
+ real_time / 1000.0, // ms
+ cpu_time / 1000.0, // ms
+ proportion * 100,
+ std::string(execType).substr(0, max_layer_name_len).c_str());
+ }
+}
diff --git a/python/openvino/runtime/dla_benchmark/statistics_report.hpp b/python/openvino/runtime/dla_benchmark/statistics_report.hpp
new file mode 100644
index 0000000..8032630
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/statistics_report.hpp
@@ -0,0 +1,83 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: The file defines functions to dump inference performance statistics
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+#include <samples/common.hpp>
+#include <samples/csv_dumper.hpp>
+#include <samples/slog.hpp>
+#include "utils.hpp"
+#include "dla_defines.h"
+
+// @brief statistics reports types
+static constexpr char noCntReport[] = "no_counters";
+static constexpr char averageCntReport[] = "average_counters";
+static constexpr char detailedCntReport[] = "detailed_counters";
+
+/// @brief Responsible for collecting of statistics and dumping to .csv file
+class StatisticsReport {
+ public:
+ typedef std::vector<ov::ProfilingInfo> PerformanceCounters;
+ typedef std::vector<std::pair<std::string, std::string>> Parameters;
+
+ struct Config {
+ bool save_report;
+ std::string report_folder;
+ };
+
+ enum class Category {
+ COMMAND_LINE_PARAMETERS,
+ RUNTIME_CONFIG,
+ EXECUTION_RESULTS,
+ };
+
+ explicit StatisticsReport(Config config) : _config(std::move(config)) {
+ _separator = dla::util::path_separator;
+ if (_config.report_folder.empty()) _separator = "";
+ }
+
+ void addParameters(const Category &category, const Parameters &parameters);
+
+ void dump();
+
+ /// print the performance counters for neural net layers executed on the CPU.
+ /// @param perfCounts vector of map of layer name and InferenceEngineProfileInfo.
+ /// @param sortFlag One of "sort", "no_sort", "simple_sort".
+ /// "sort": sort by execution RealTime. Default value.
+ /// "no_sort": no sort.
+ /// "simple_sort": sort by execution RealTime after removing nodes with "NOT_RUN"
+ /// status.
+ void printPerfCountersSort(const std::vector<PerformanceCounters> &perfCounts, std::string sortFlag = "sort");
+
+ /// Helper function used by printPerfCountersSort that prints a row of performance count info.
+ /// prints the following info for a layer from left to right:
+ /// 0. nodeName: name of the layer
+ /// 1. LayerStatus: NOT_RUN, OPTIMIZED_OUT, or EXECUTED
+ /// 2. LayerType: type of layer, such as Convolution.
+ /// 3. RealTime (ms): The absolute time that the layer ran (in total), including CPU processing time + any potential
+ /// wait time.
+ /// 4. CPUTime (ms): The net host cpu time that the layer ran, i.e. CPU processing time.
+ /// 5. Proportion: RealTime of the node / RealTime in total
+ /// 6. ExecType: An execution type of unit. e.g., jit_avx2_FP32 (executed using just-in-time (JIT) compilation with
+ /// AVX2 instructions for FP32 data)
+ /// @param result_list vector of per-node info, where each per-node info is a vector of formatted string.
+ void printDetailResult(std::vector<std::vector<std::string>> result_list);
+
+ private:
+ // configuration of current benchmark execution
+ const Config _config;
+
+ // parameters
+ std::map<Category, Parameters> _parameters;
+
+ // csv separator
+ std::string _separator;
+};
diff --git a/python/openvino/runtime/dla_benchmark/top1_top5.hpp b/python/openvino/runtime/dla_benchmark/top1_top5.hpp
new file mode 100644
index 0000000..4f27bb2
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/top1_top5.hpp
@@ -0,0 +1,222 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: This file defines and implements functions to calculate top1 and top5 scores.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+class TopResultsAnalyser {
+ public:
+ static bool get_top_results(const std::string groundtruth_loc, const std::string results_loc, uint32_t batchSize) {
+    // This overload loads the output results from a file.
+    // The dla_benchmark currently uses the get_top_results(string, vector<float>, uint) overload instead.
+    // This function is kept because it can be used to assess accuracy after the run has finished,
+    // although it is largely identical to the other version of get_top_results().
+ const std::string accuracy_results_loc = "accuracy_report.txt";
+ std::ofstream accuracy_file(accuracy_results_loc);
+
+ if (!accuracy_file.is_open()) {
+ throw std::invalid_argument("Unable to open accuracy file.");
+ }
+
+ std::ifstream groundtruth_file(groundtruth_loc);
+ int groundtruth_lineno = 0;
+
+ if (!groundtruth_file.is_open()) {
+ throw std::invalid_argument("Unable to open groundtruth file.");
+ }
+
+ std::ifstream results_file(results_loc);
+
+ if (!results_file.is_open()) {
+ throw std::invalid_argument("Unable to open result file.");
+ }
+
+ std::string results_line;
+ std::vector<float> results;
+ while (std::getline(results_file, results_line)) {
+ const float result = std::stof(results_line);
+ results.push_back(result);
+ }
+
+ if (results.size() % batchSize != 0) {
+ std::cout << "Results size = " << results.size() << " Batch size = " << batchSize << std::endl;
+ throw std::invalid_argument("Results size is not a multiple of batch size");
+ }
+
+ typedef std::pair<uint64_t, float> CatProbPair;
+ const uint64_t img_output_size = results.size() / batchSize;
+ uint32_t top1_correct_guesses = 0;
+ uint32_t top5_correct_guesses = 0;
+ const auto top_n = fmin(5, img_output_size);
+ for (uint32_t img = 0; img < batchSize; img++) {
+ accuracy_file << "image " << img << " top 5:" << std::endl;
+
+ const auto start_addr = img_output_size * img;
+ std::vector<CatProbPair> top5;
+ for (int i = 0; i < top_n; i++) {
+ top5.push_back(std::make_pair(i, results[start_addr + i]));
+ }
+
+ for (uint64_t i = 5; i < img_output_size; i++) {
+ const auto e = results[start_addr + i];
+ auto min_ele = &top5.at(0);
+ for (size_t j = 1; j < top5.size(); j++) {
+ if (top5.at(j).second < min_ele->second) {
+ min_ele = &top5.at(j);
+ }
+ }
+ if (e > min_ele->second) {
+ *min_ele = std::make_pair(i, e);
+ }
+ }
+
+ // sort descending
+ std::sort(
+ top5.begin(), top5.end(), [](const CatProbPair& a, const CatProbPair& b) { return a.second > b.second; });
+ for (const auto& pair : top5) {
+ accuracy_file << pair.first << " : " << pair.second << std::endl;
+ }
+ std::string line;
+ std::getline(groundtruth_file, line);
+ ++groundtruth_lineno;
+ uint64_t truth;
+ try {
+ truth = std::stoi(line);
+ } catch (const std::invalid_argument& ia) {
+ THROW_IE_EXCEPTION << "Unable to parse line " << groundtruth_lineno << " "
+ << "of the ground truth file " << groundtruth_loc;
+ }
+ accuracy_file << truth << " : truth" << std::endl;
+ top1_correct_guesses += (top5.at(0).first == truth);
+
+ uint64_t i = 1;
+ for (const auto& guess : top5) {
+ if (guess.first == truth && i < img_output_size) {
+ top5_correct_guesses += 1;
+ break;
+ }
+ i += 1;
+ }
+ }
+
+ const auto top_n_string = [&](std::ostream& stream, const double correct_guesses, const uint32_t N) {
+ stream << "top" << N << " accuracy: " << (correct_guesses * 100.0) / (batchSize) << " %" << std::endl;
+ };
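+    // Illustrative example (hypothetical numbers): 7 correct top-1 guesses over a batch of 10 images
+    // is reported as "top1 accuracy: 70 %".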
+
+ accuracy_file << "====================" << std::endl;
+
+ top_n_string(accuracy_file, top1_correct_guesses, 1);
+ top_n_string(std::cout, top1_correct_guesses, 1);
+ if (2 < img_output_size && img_output_size < 6) {
+ top_n_string(accuracy_file, top5_correct_guesses, img_output_size - 1);
+ top_n_string(std::cout, top5_correct_guesses, img_output_size - 1);
+ } else if (6 <= img_output_size) {
+ top_n_string(accuracy_file, top5_correct_guesses, 5);
+ top_n_string(std::cout, top5_correct_guesses, 5);
+ }
+ return true;
+ }
+
+ static bool get_top_results(const std::string groundtruth_loc, std::vector<float> results, uint32_t batchSize) {
+ // This function takes the output results directly from runtime in a vector
+ // The dla benchmark currently uses this version of get_top_results
+ const std::string accuracy_results_loc = "accuracy_report.txt";
+ std::ofstream accuracy_file(accuracy_results_loc);
+
+ if (!accuracy_file.is_open()) {
+ throw std::invalid_argument("Unable to open accuracy file.");
+ }
+
+ std::ifstream groundtruth_file(groundtruth_loc);
+ int groundtruth_lineno = 0;
+
+ if (!groundtruth_file.is_open()) {
+ throw std::invalid_argument("Unable to open groundtruth file.");
+ }
+
+ if (results.size() % batchSize != 0) {
+ std::cout << "Results size = " << results.size() << " Batch size = " << batchSize << std::endl;
+ throw std::invalid_argument("Results size is not a multiple of batch size");
+ }
+
+ typedef std::pair<int, float> CatProbPair;
+ const int img_output_size = results.size() / batchSize;
+ uint32_t top1_correct_guesses = 0;
+ uint32_t top5_correct_guesses = 0;
+ const auto top_n = fmin(5, img_output_size);
+ for (uint32_t img = 0; img < batchSize; img++) {
+ accuracy_file << "image " << img << " top 5:" << std::endl;
+
+ const auto start_addr = img_output_size * img;
+ std::vector<CatProbPair> top5;
+ for (int i = 0; i < top_n; i++) {
+ top5.push_back(std::make_pair(i, results[start_addr + i]));
+ }
+
+ for (int i = 5; i < img_output_size; i++) {
+ const auto e = results[start_addr + i];
+ auto min_ele = &top5.at(0);
+ for (size_t j = 1; j < top5.size(); j++) {
+ if (top5.at(j).second < min_ele->second) {
+ min_ele = &top5.at(j);
+ }
+ }
+ if (e > min_ele->second) {
+ *min_ele = std::make_pair(i, e);
+ }
+ }
+
+ // sort descending
+ std::sort(
+ top5.begin(), top5.end(), [](const CatProbPair& a, const CatProbPair& b) { return a.second > b.second; });
+ for (const auto& pair : top5) {
+ accuracy_file << pair.first << " : " << pair.second << std::endl;
+ }
+ std::string line;
+ std::getline(groundtruth_file, line);
+ ++groundtruth_lineno;
+ int truth;
+ try {
+ truth = std::stoi(line);
+ } catch (const std::invalid_argument& ia) {
+ THROW_IE_EXCEPTION << "Unable to parse line " << groundtruth_lineno << " "
+ << "of the ground truth file " << groundtruth_loc;
+ }
+ accuracy_file << truth << " : truth" << std::endl;
+ top1_correct_guesses += top5.at(0).first == truth;
+
+ int i = 1;
+ for (const auto& guess : top5) {
+ if (guess.first == truth && i < img_output_size) {
+ top5_correct_guesses += 1;
+ break;
+ }
+ i += 1;
+ }
+ }
+
+ const auto top_n_string = [&](std::ostream& stream, const double correct_guesses, const uint32_t N) {
+ stream << "top" << N << " accuracy: " << (correct_guesses * 100.0) / (batchSize) << " %" << std::endl;
+ };
+
+ accuracy_file << "====================" << std::endl;
+
+ top_n_string(accuracy_file, top1_correct_guesses, 1);
+ top_n_string(std::cout, top1_correct_guesses, 1);
+ if (2 < img_output_size && img_output_size < 6) {
+ top_n_string(accuracy_file, top5_correct_guesses, img_output_size - 1);
+ top_n_string(std::cout, top5_correct_guesses, img_output_size - 1);
+ } else if (6 <= img_output_size) {
+ top_n_string(accuracy_file, top5_correct_guesses, 5);
+ top_n_string(std::cout, top5_correct_guesses, 5);
+ }
+
+ return true;
+ }
+};
diff --git a/python/openvino/runtime/dla_benchmark/utils.cpp b/python/openvino/runtime/dla_benchmark/utils.cpp
new file mode 100644
index 0000000..066d234
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/utils.cpp
@@ -0,0 +1,689 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Utility functions handling command line arguments and network input info for DLA's runtime.
+//              Loosely based on OpenVINO's benchmark_app/utils.cpp
+//              [openvinotoolkit/openvino › samples/cpp/benchmark_app/utils.cpp]
+//              Future OpenVINO uplifts should refer to the file listed above.
+
+#include <format_reader_ptr.h>
+#include <gflags/gflags.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+#include <functional>
+#include <samples/common.hpp>
+#include <samples/slog.hpp>
+
+#include "dla_stl_utils.h"
+#include "utils.hpp"
+
+/**
+ * @brief Namespace dla_benchmark contains utility functions for working with network inputs.
+ */
+namespace dla_benchmark {
+
+/**
+ * @brief Checks if the input layout represents an image.
+ *
+ * This function determines whether the layout is compatible with image data based on the
+ * layout string and the number of channels.
+ *
+ * @return True if the layout is for an image, False otherwise.
+ */
+bool InputInfo::IsImage() const {
+ if ((layout != "NCHW" && layout != "NHWC")) return false;
+ return (GetChannels() == 1 || GetChannels() == 3);
+}
+
+/**
+ * @brief Checks if the input layout represents image information.
+ *
+ * This function checks if the layout corresponds to image information.
+ *
+ * @return True if the layout is for image information, False otherwise.
+ */
+bool InputInfo::IsImageInfo() const {
+ if (layout != "NC") return false;
+ return (GetChannels() >= 2);
+}
+
+/**
+ * @brief Checks if the input layout represents video data.
+ *
+ * This function determines whether the layout is compatible with video data based on the
+ * layout string and the number of channels.
+ *
+ * @return True if the layout is for video data, False otherwise.
+ */
+bool InputInfo::IsVideo() const {
+ if (layout != "NCDHW" && layout != "NDHWC") return false;
+ return (GetChannels() == 3);
+}
+
+/**
+ * @brief Gets the width dimension of the data shape based on the layout.
+ *
+ * @return The width dimension of the data shape.
+ */
+size_t InputInfo::GetWidth() const { return data_shape.at(ov::layout::width_idx(layout)); }
+
+/**
+ * @brief Gets the height dimension of the data shape based on the layout.
+ *
+ * @return The height dimension of the data shape.
+ */
+size_t InputInfo::GetHeight() const { return data_shape.at(ov::layout::height_idx(layout)); }
+
+/**
+ * @brief Gets the number of channels based on the layout.
+ *
+ * @return The number of channels.
+ */
+size_t InputInfo::GetChannels() const { return data_shape.at(ov::layout::channels_idx(layout)); }
+
+/**
+ * @brief Gets the batch size based on the layout.
+ *
+ * @return The batch size.
+ */
+size_t InputInfo::GetBatch() const { return data_shape.at(ov::layout::batch_idx(layout)); }
+
+/**
+ * @brief Gets the depth dimension of the data shape based on the layout.
+ *
+ * @return The depth dimension of the data shape.
+ */
+size_t InputInfo::GetDepth() const { return data_shape.at(ov::layout::depth_idx(layout)); }
+
+} // namespace dla_benchmark
+
+/**
+ * @brief Parses number of streams for each device from a string argument.
+ *
+ * @param devices vector of supported DLA devices, ie FPGA, CPU
+ * @param values_string string arg of the format: <device1>:<value1>,<device2>:<value2>
+ * @return A map of device : number of streams
+ */
+std::map<std::string, uint32_t> ParseNStreamsValuePerDevice(const std::vector<std::string>& devices,
+ const std::string& values_string) {
+ auto values_string_upper = values_string;
+ std::map<std::string, uint32_t> result;
+ auto device_value_strings = split(values_string_upper, ',');
+ for (auto& device_value_string : device_value_strings) {
+ auto device_value_vec = split(device_value_string, ':');
+ if (device_value_vec.size() == 2) {
+ auto device_name = device_value_vec.at(0);
+ auto nstreams = device_value_vec.at(1);
+ auto it = std::find(devices.begin(), devices.end(), device_name);
+ if (it != devices.end()) {
+ result[device_name] = std::stoi(nstreams);
+ } else {
+ throw std::logic_error("Can't set nstreams value " + std::string(nstreams) + " for device '" + device_name +
+ "'! Incorrect device name!");
+ }
+ } else if (device_value_vec.size() == 1) {
+ uint32_t value = std::stoi(device_value_vec.at(0));
+ for (auto& device : devices) {
+ result[device] = value;
+ }
+ } else if (device_value_vec.size() != 0) {
+ throw std::runtime_error("Unknown string format: " + values_string);
+ }
+ }
+ return result;
+}
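+
+// Illustrative example (hypothetical values):
+//   ParseNStreamsValuePerDevice({"FPGA", "CPU"}, "FPGA:4,CPU:2") -> {{"FPGA", 4}, {"CPU", 2}}
+//   ParseNStreamsValuePerDevice({"FPGA", "CPU"}, "2")            -> 2 streams for every listed device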
+
+/**
+ * @brief Parses CLI flag args -mean_values or -scale_values. Helper to GetInputsInfo()
+ *
+ * Parsing example: -mean_values data[255,255,255] is stored with "data" as the key and a vector of 3 floats as the value
+ *
+ * @param arg raw string from CLI in the form of the example above
+ * @param inputs_info struct used to check that the input name exists in the graph
+ * @returns a map of input name and its respective mean/scale value vector
+ */
+std::map<std::string, std::vector<float>> ParseScaleOrMeanValues(const std::string& arg,
+ const dla_benchmark::InputsInfo& inputs_info) {
+ std::map<std::string, std::vector<float>> return_value;
+ // Create a copy of the input string for processing
+ std::string search_string = arg;
+ // Find the first '[' character in the string
+ auto start_pos = search_string.find_first_of('[');
+
+ while (start_pos != std::string::npos) {
+ // Find the matching ']' character
+ auto end_pos = search_string.find_first_of(']');
+ if (end_pos == std::string::npos) break;
+ // Extract the input name and value string between '[' and ']'
+ const std::string input_name = search_string.substr(0, start_pos);
+ const std::string input_value_string = search_string.substr(start_pos + 1, end_pos - start_pos - 1);
+ // Split the input value string into a vector of floats using a custom function SplitFloat
+ std::vector<float> input_value = SplitFloat(input_value_string, ',');
+ if (!input_name.empty()) {
+ // If the input name is not empty and exists in the inputs_info map, store the value
+ if (inputs_info.count(input_name)) {
+ return_value[input_name] = input_value;
+ } else {
+        // Ignore wrong input names but give a warning
+ std::string network_input_names = "";
+ for (auto it = inputs_info.begin(); it != inputs_info.end(); ++it) {
+ network_input_names += it->first;
+ if (std::next(it) != inputs_info.end()) {
+ network_input_names += ", ";
+ }
+ }
+ slog::warn << "Scale values or mean values are applied to '" << input_name << "' but '" << input_name
+ << "' does not exist in network inputs. The available network inputs are: " << network_input_names
+ << slog::endl;
+ }
+ } else {
+ // If the input name is empty, apply the value to all image inputs in inputs_info
+ for (auto& item : inputs_info) {
+ if (item.second.IsImage()) return_value[item.first] = input_value;
+ }
+ // Clear the search string and exit the loop
+ search_string.clear();
+ break;
+ }
+ // Remove processed substring from the search string
+ search_string = search_string.substr(end_pos + 1);
+ // If the string is empty or doesn't start with a comma, exit the loop
+ if (search_string.empty() || search_string.front() != ',') {
+ break;
+ }
+ // Remove the leading comma and search for the next '[' character
+ search_string = search_string.substr(1);
+ start_pos = search_string.find_first_of('[');
+ }
+ // If there are remaining characters in the search string, it's an error
+ if (!search_string.empty()) {
+ throw std::logic_error("Can't parse input parameter string: " + arg);
+ }
+
+ return return_value;
+}
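+
+// Illustrative example (hypothetical input name "data"):
+//   ParseScaleOrMeanValues("data[127.5,127.5,127.5]", inputs_info) -> {"data": {127.5, 127.5, 127.5}}
+//   An empty name, e.g. "[255]", applies the values to every image input in inputs_info.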
+
+/**
+ * @brief Splits command-line input arguments containing multiple image file paths
+ * into separate vectors based on a specified separator.
+ * Modified from parseInputFilesArguments() in [openvinotoolkit/openvino ›
+ * inference-engine/samples/common/utils/src/args_helper.cpp]
+ *
+ * @param net_size The number of networks (multigraph functionality).
+ * @return A vector of vectors, where each inner vector contains image file paths
+ * corresponding to a specific network graph.
+ */
+std::vector<std::vector<std::string>> SplitMultiInputFilesArguments(size_t net_size) {
+ std::vector<std::vector<std::string>> paths;
+ std::vector<std::string> args = gflags::GetArgvs();
+ const auto is_image_arg = [](const std::string& s) { return s == "-i" || s == "--images"; };
+ const auto is_arg = [](const std::string& s) { return s.front() == '-'; };
+ const auto img_start = std::find_if(begin(args), end(args), is_image_arg); // looking for all `-i` or `--images` args
+ if (img_start == end(args)) {
+ // By default: if no -i argument is specified, then we should generate random
+ // input image data. The fillBlobs() function will do that later when it sees
+ // an empty vector for its current network.
+ paths.push_back(std::vector<std::string>());
+ return paths;
+ }
+ const auto img_begin = std::next(img_start);
+ const auto img_end = std::find_if(img_begin, end(args), is_arg);
+ for (auto img = img_begin; img != img_end; ++img) {
+    auto multiFiles = split(*img, MULTIGRAPH_SEP);  // split this -i argument into one entry per graph
+
+ if (multiFiles.size() != 1 && multiFiles.size() != net_size) {
+      slog::err << "Size of input argument " << multiFiles.size() << " does not match graph count " << net_size
+                << " : " << *img << slog::endl;
+ paths.clear();
+ break;
+ }
+ for (size_t i = 0; i < multiFiles.size(); i++)
+ slog::info << "Reading " << multiFiles[i] << " for graph index " << i << slog::endl;
+ while (paths.size() < multiFiles.size()) paths.push_back(std::vector<std::string>());
+
+ for (size_t i = 0; i < multiFiles.size(); i++) {
+ paths[i].push_back(multiFiles[i]);
+ }
+ }
+ return paths;
+}
+
+/**
+ * @brief Returns the stem of a file path.
+ *
+ * The stem is the base name of the file without its extension. This function
+ * takes a file path as input and extracts the stem, which is the part of the
+ * file name before the last period ('.') character.
+ *
+ * @param path The input file path.
+ * @return The stem of the file, excluding the extension.
+ */
+std::string GetStem(std::string path) {
+ auto last_index = path.rfind('/');
+
+ if (std::string::npos != last_index) {
+ path.erase(0, last_index + 1);
+ }
+
+ last_index = path.rfind('.');
+ if (std::string::npos != last_index) {
+ path.erase(last_index);
+ }
+
+ return path;
+}
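+// Illustrative example: GetStem("/path/to/cat.0.jpg") returns "cat.0".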
+
+/**
+ * @brief Splits a string into substrings using a specified delimiter.
+ *
+ * @param s The input string to be split.
+ * @param delim The delimiter character used to separate the substrings.
+ * @return A vector of strings containing the substrings from the input string.
+ */
+std::vector<std::string> split(const std::string& s, char delim) {
+ std::vector<std::string> result;
+ std::stringstream ss(s);
+ std::string item;
+
+ while (getline(ss, item, delim)) {
+ result.push_back(item);
+ }
+ return result;
+}
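+// Illustrative example: split("FPGA,CPU", ',') returns {"FPGA", "CPU"}.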
+
+/**
+ * @brief Splits a string of floats into floats using a specified delimiter.
+ *
+ * @param s The input string to be split.
+ * @param delim The delimiter character used to separate the floats.
+ * @return A vector of floats containing the floats from the input string.
+ */
+std::vector<float> SplitFloat(const std::string& s, char delim) {
+ std::vector<float> result;
+ std::stringstream ss(s);
+ std::string item;
+
+ while (getline(ss, item, delim)) {
+ result.push_back(std::stof(item));
+ }
+ return result;
+}
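+// Illustrative example: SplitFloat("255,255,255", ',') returns {255.0f, 255.0f, 255.0f}.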
+
+/**
+ * @brief Parses a list of devices from a string
+ *
+ * @param device_string The input string to be split. The delimiter is ':'
+ * @return A vector of strings containing the devices
+ */
+std::vector<std::string> ParseDevices(const std::string& device_string) {
+ std::string comma_separated_devices = device_string;
+ if (comma_separated_devices.find(":") != std::string::npos) {
+ comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1);
+ }
+ auto devices = split(comma_separated_devices, ',');
+ for (auto& device : devices) device = device.substr(0, device.find_first_of(".("));
+ return devices;
+}
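+// Illustrative example: ParseDevices("HETERO:FPGA,CPU") returns {"FPGA", "CPU"};
+// a device suffix such as "CPU.0" or "FPGA(2)" is trimmed at the first '.' or '('.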
+
+/**
+ * @brief Gets information about a network's inputs.
+ *
+ * Reads all input nodes from a network, determines tensor layout, shapes, precision, etc.
+ * Saves into dla::benchmark::InputsInfo which maps each input info struct to an input name.
+ *
+ * @param batch_size Network batch size from the user via the batch size flag
+ * @param input_info Vector of input nodes. Obtained from ov::Model.inputs() or ov::CompiledModel.inputs()
+ * @param reshape_required boolean flag indicating that the model needs to be reshaped according to the batch size flag
+ * @param is_binary_data User flag indicating that the data is binary data and not image data
+ * @param mean_string CLI arg specifying image mean value. Example: input[255,255,255]. (Optional)
+ * @param scale_string CLI arg specifying image scale value. Example: input[255,255,255]. (Optional)
+ * @return dla::benchmark::InputsInfo which is a map of input names and its respective input information
+ */
+dla_benchmark::InputsInfo GetInputsInfo(const size_t batch_size,
+ const std::vector<ov::Output<const ov::Node>>& input_info,
+ bool& reshape_required,
+ const bool is_binary_data,
+ const std::string& mean_string = "",
+ const std::string& scale_string = "") {
+ reshape_required = false;
+ dla_benchmark::InputsInfo info_map;
+
+ bool is_there_at_least_one_batch_dim = false;
+ for (auto& item : input_info) {
+ dla_benchmark::InputInfo info;
+ const std::string& name = item.get_any_name();
+
+ // Layout
+ info.layout = dynamic_cast<const ov::op::v0::Parameter&>(*item.get_node()).get_layout();
+
+ // Calculating default layout values if needed
+    if (info.layout.empty()) {
+      const size_t rank = item.get_partial_shape().size();
+      const std::string newLayout = dla::util::getTensorLayout(rank);
+      if (newLayout != "") {
+        info.layout = ov::Layout(newLayout);
+        slog::warn << name << ": layout is not set explicitly through the model optimizer, so it is defaulted to "
+                   << newLayout << ". It is recommended to explicitly set the layout via the model optimizer."
+                   << slog::endl;
+ }
+ }
+
+ // Partial Shape
+ info.partial_shape = item.get_partial_shape();
+ info.data_shape = info.partial_shape.get_shape();
+
+ // DLA only supports static shapes
+ if (info.partial_shape.is_dynamic()) {
+ throw std::runtime_error(
+ "DLA only supports static shapes. Check your model and make sure all shapes are defined (No dims of -1).");
+ }
+
+ // Precision
+ // Edwinzha: setting input data to u8 for image data instead of the defined precision in .xml
+ // leads to accuracy loss that didn't exist prior to API 2.0. Should investigate or remove this condition.
+ // info.IsImage() && !is_binary_data ? ov::element::u8 : item.get_element_type();
+ info.type = item.get_element_type();
+
+ // Update shape with batch if needed (only in static shape case)
+ // Update blob shape only not affecting network shape to trigger dynamic batch size case
+ if (batch_size != 0) {
+ if (ov::layout::has_batch(info.layout)) {
+ std::size_t batch_index = ov::layout::batch_idx(info.layout);
+ if (info.data_shape.at(batch_index) != batch_size) {
+ info.partial_shape[batch_index] = batch_size;
+ info.data_shape[batch_index] = batch_size;
+ reshape_required = true;
+ is_there_at_least_one_batch_dim = true;
+ }
+ } else {
+ slog::warn << "Input '" << name
+ << "' doesn't have batch dimension in layout. -b option will be ignored for this input."
+ << slog::endl;
+ }
+ }
+ info_map[name] = info;
+ }
+
+ if (batch_size > 1 && !is_there_at_least_one_batch_dim) {
+ throw std::runtime_error(
+        "-b option is provided in command line, but there are no inputs with a batch (B) "
+        "dimension in the input layout, so the batch size cannot be set. "
+        "You may specify the layout explicitly using the -layout option.");
+ }
+
+ // Update scale and mean
+ std::map<std::string, std::vector<float>> scale_map = ParseScaleOrMeanValues(scale_string, info_map);
+ std::map<std::string, std::vector<float>> mean_map = ParseScaleOrMeanValues(mean_string, info_map);
+
+ for (auto& item : info_map) {
+ dla_benchmark::InputInfo& info = item.second;
+ if (info.IsImage()) {
+ if (info.GetChannels() == 3) { // Image is RGB or BGR
+ info.scale_values.assign({1, 1, 1});
+ info.mean_values.assign({0, 0, 0});
+ } else if (info.GetChannels() == 1) { // Image is greyscale
+ info.scale_values.assign({1});
+ info.mean_values.assign({0});
+ } else {
+ std::string err =
+ "Input is image but is not of 3 channels (RGB, BGR) or 1 channel (Greyscale). Cannot assign mean and/or "
+ "scale values";
+ throw std::logic_error(err);
+ }
+ if (scale_map.count(item.first)) {
+ info.scale_values = scale_map.at(item.first);
+ }
+ if (mean_map.count(item.first)) {
+ info.mean_values = mean_map.at(item.first);
+ }
+ }
+ }
+ return info_map;
+}
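+
+// Illustrative example (hypothetical model): for an input with layout NCHW and shape [1, 3, 224, 224],
+// calling GetInputsInfo with batch_size = 8 updates data_shape to [8, 3, 224, 224] and sets
+// reshape_required to true so the caller can reshape the model before compilation.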
+
+/**
+ * @brief Gets information about a network's inputs.
+ *
+ * Reads all input nodes from a network, determines tensor layout, shapes, precision, etc.
+ * Saves into dla::benchmark::InputsInfo which maps each input info struct to an input name.
+ * Used in AOT flow where reshaping is not required (Handled by compiler)
+ *
+ * @param batch_size Network batch size from the user via the batch size flag
+ * @param input_info Vector of input nodes. Obtained from ov::Model.inputs() or ov::CompiledModel.inputs()
+ * @param is_binary_data User flag indicating that the data is binary data and not image data
+ * @return dla::benchmark::InputsInfo which is a map of input names and its respective input information
+ */
+dla_benchmark::InputsInfo GetInputsInfo(const size_t batch_size,
+ const std::vector<ov::Output<const ov::Node>>& input_info,
+ const bool is_binary_data) {
+ bool reshape_required = false;
+ return GetInputsInfo(batch_size, input_info, reshape_required, is_binary_data);
+}
+
+/**
+ * @brief Extracts the file extension from a given file name.
+ *
+ * @param name The file name from which to extract the extension.
+ * @return The file extension as a string, or an empty string if no extension is found.
+ */
+std::string GetExtension(const std::string& name) {
+ auto extension_position = name.rfind('.', name.size());
+ return extension_position == std::string::npos ? "" : name.substr(extension_position + 1, name.size() - 1);
+}
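+// Illustrative example: GetExtension("input.bmp") returns "bmp"; GetExtension("README") returns "".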
+
+/**
+ * @brief Filters a list of file paths by specified file extensions (case insensitive).
+ *
+ * @param file_paths A vector of file paths to be filtered.
+ * @param extensions A vector of file extensions to filter by.
+ * @return A vector of filtered file paths that match the specified extensions.
+ */
+std::vector<std::string> FilterFilesByExtensions(const std::vector<std::string>& file_paths,
+ const std::vector<std::string>& extensions) {
+ std::vector<std::string> filtered;
+ for (auto& file_path : file_paths) {
+ auto extension = GetExtension(file_path);
+ std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower);
+ if (std::find(extensions.begin(), extensions.end(), extension) != extensions.end()) {
+ filtered.push_back(file_path);
+ }
+ }
+ return filtered;
+}
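+// Illustrative example: FilterFilesByExtensions({"img1.JPG", "labels.txt"}, {"jpg", "bmp"})
+// returns {"img1.JPG"}; the extension comparison is case-insensitive.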
+
+/**
+ * @brief Dumps output tensor into result.txt. Mainly used for regtesting, only runs with -dump_output flag
+ *
+ * @param output_tensor Output tensor to dump
+ * @param output_node Output node corresponding to the output tensor to dump
+ * @param output_size Size of the output tensor
+ * @param result_file ofstream object corresponding to result.txt
+ */
+void DumpResultTxtFile(const ov::Tensor& output_tensor,
+ const ov::Output<const ov::Node>& output_node,
+ const unsigned int output_size,
+ std::ofstream& result_file) {
+ size_t C = 1;
+ size_t H = 1;
+ size_t W = 1;
+ size_t D = 1;
+
+ // allow dumping the data as txt for all layouts, but not dumping layout if it's unknown
+ bool unknown_layout = false;
+
+ const ov::Layout& layout = ov::layout::get_layout(output_node);
+ const ov::Shape& shape = output_tensor.get_shape();
+ const std::string& name = output_node.get_any_name();
+ const size_t num_dims = shape.size();
+ const size_t tensor_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
+ if (num_dims == 2) {
+ C = shape[1];
+ } else if (num_dims == 4) {
+ C = shape[1];
+ H = shape[2];
+ W = shape[3];
+ } else if (num_dims == 5) {
+ C = shape[1];
+ D = shape[2];
+ H = shape[3];
+ W = shape[4];
+ } else {
+ unknown_layout = true;
+ }
+
+ const auto* data = output_tensor.data<float>();
+ if (data == nullptr) {
+ throw std::runtime_error("Unable to dump result tensors because tensor data is NULL");
+ }
+ if (!result_file.is_open()) {
+ // Fix coverity, this should always be open from dla_benchmark/main.cpp
+ throw std::runtime_error("Unable to dump result tensors due to result ofstream not being open!");
+ }
+ // Save the original formatting flags for coverity
+ std::ios_base::fmtflags original_flags = result_file.flags();
+
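+  // Each flat index is decomposed into per-dimension coordinates for the dumped annotation.
+  // Illustrative example (hypothetical shape): for an NCHW tensor of shape [1, 3, 224, 224],
+  // n = idx / (3*224*224), c = (idx / (224*224)) % 3, h = (idx / 224) % 224, w = idx % 224.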
+ for (size_t idx = 0; idx < tensor_size; ++idx) {
+    // Explicitly set precision for coverity
+ result_file << std::fixed << std::setprecision(6) << data[idx] << std::defaultfloat;
+ if (!unknown_layout) {
+ size_t n = idx / (C * D * H * W);
+ size_t c = (idx / (D * H * W)) % C;
+ size_t d = (idx / (H * W)) % D;
+ size_t h = (idx / W) % H;
+ size_t w = idx % W;
+      result_file << " # Layout: " << layout.to_string() << "; ";
+ result_file << "Index: " << n << " " << c;
+ if (num_dims == 4) {
+ result_file << " " << h << " " << w;
+ }
+ if (num_dims == 5) {
+ result_file << " " << d << " " << h << " " << w;
+ }
+ } else {
+ result_file << " # Index: " << idx;
+ }
+
+ if (idx == 0) {
+ result_file << " start of " << name;
+ } else if (idx == output_size - 1) {
+ result_file << " end of " << name << ", see result_tensor_boundaries.txt for details";
+ }
+ result_file << std::endl;
+ }
+  // Restore original formatting flags
+ result_file.flags(original_flags);
+}
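+// Worked example of the index decomposition above (values are illustrative only): for a
+// 4D output of shape [N=1, C=2, H=2, W=2] in NCHW, flat index idx = 5 decomposes to
+//   n = 5 / (2*2*2) = 0, c = (5 / (2*2)) % 2 = 1, h = (5 / 2) % 2 = 0, w = 5 % 2 = 1,
+// so that line in result.txt is annotated with "Index: 0 1 0 1".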
+
+/**
+ * @brief Dumps the output tensor as raw binary into result.bin.
+ * Can be useful in postprocessing of the result tensor using Python numpy,
+ * or when the tensor layout is not supported by DLA.
+ *
+ * @param output_tensor Output tensor to dump
+ * @param result_file ofstream object corresponding to result.bin
+ */
+void DumpResultBinFile(const ov::Tensor& output_tensor,
+ std::ofstream& result_file) {
+ const ov::Shape& shape = output_tensor.get_shape();
+  size_t total_size = std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());
+ const auto* data = output_tensor.data<float>();
+ if (data == nullptr) {
+ throw std::runtime_error("Unable to dump result tensors because tensor data is NULL");
+ }
+  // Tensor data is contiguous, so the whole buffer can be written in one call
+  result_file.write(reinterpret_cast<const char*>(data), total_size * sizeof(float));
+}
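+// Minimal sketch of reading result.bin back for postprocessing (not part of the benchmark;
+// assumes the element count is known, e.g. from the shapes recorded in result_meta.json):
+//   std::ifstream in("result.bin", std::ios::binary);
+//   std::vector<float> values(total_size);
+//   in.read(reinterpret_cast<char*>(values.data()), total_size * sizeof(float));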
+
+/**
+ * @brief Dumps inference metadata as a JSON file into result_meta.json
+ * Useful for postprocessing and reviewing inference arguments
+ *
+ * @param metadata Metadata to dump
+ * @param result_file ofstream object corresponding to result_meta.json
+ */
+void DumpResultMetaJSONFile(const dla_benchmark::InferenceMetaData& metadata,
+ std::ofstream& result_file) {
+ result_file << "{\n";
+ // batch size
+ result_file << "\t\"batch_size\": " << metadata.batch_size << ",\n";
+
+ // niter
+ result_file << "\t\"niter\": " << metadata.niter << ",\n";
+
+ // nireq
+ result_file << "\t\"nireq\": " << metadata.nireq << ",\n";
+
+ // groundtruth loc
+ result_file << "\t\"groundtruth_loc\": \"" << metadata.groundtruth_loc << "\",\n";
+
+ // input info: model_input_info
+ result_file << "\t\"input_info\": [\n";
+ long unsigned int idx = 0;
+ for (const auto &name_input_pair : metadata.model_input_info) {
+ // to collect scale_values and mean_values
+ std::ostringstream oss_scale_vals, oss_mean_vals;
+ unsigned int scale_values_size = name_input_pair.second.scale_values.size();
+ if (scale_values_size != name_input_pair.second.mean_values.size()) {
+ throw std::logic_error("scale_values and mean_values should always have the same size");
+ }
+    oss_scale_vals << "[";
+    oss_mean_vals << "[";
+    for (long unsigned int i = 0; i < scale_values_size; i++) {
+      if (i > 0) {
+        oss_scale_vals << ",";
+        oss_mean_vals << ",";
+      }
+      oss_scale_vals << name_input_pair.second.scale_values[i];
+      oss_mean_vals << name_input_pair.second.mean_values[i];
+    }
+    // Close the brackets outside the loop so empty scale/mean vectors still serialize as "[]"
+    oss_scale_vals << "]";
+    oss_mean_vals << "]";
+ result_file << "\t\t{\"name\": \"" << name_input_pair.first << "\", \"shape\": \""
+ << name_input_pair.second.data_shape.to_string() << "\", \"scale_values\": \""
+ << oss_scale_vals.str() << "\", \"mean_values\": \""
+ << oss_mean_vals.str() << "\", \"layout\": \""
+ << name_input_pair.second.layout.to_string() << "\"}";
+ if (idx == metadata.model_input_info.size() - 1) {
+ result_file << "\n";
+ } else {
+ result_file << ",\n";
+ }
+ idx += 1;
+ }
+ result_file << "\t],\n";
+
+  // output info: model_output_info preserves the order of multi-tensor outputs
+ result_file << "\t\"output_info\": [\n";
+ for (long unsigned int i = 0; i < metadata.model_output_info.size(); i++) {
+ dla_benchmark::OutputInfo info = metadata.model_output_info[i];
+ result_file << "\t\t{\"name\": \"" << info.name << "\", \"shape\": \"" << info.shape.to_string() << "\"}";
+ if (i == metadata.model_output_info.size() - 1) {
+ result_file << "\n";
+ } else {
+ result_file << ",\n";
+ }
+ }
+ result_file << "\t],\n";
+
+ // input files
+ result_file << "\t\"input_files\": [\n";
+ for (long unsigned int i = 0; i < metadata.input_files.size(); i++) {
+ std::string input_file = metadata.input_files[i];
+ result_file << "\t\t\"" << input_file << "\"";
+ if (i == metadata.input_files.size() - 1) {
+ result_file << "\n";
+ } else {
+ result_file << ",\n";
+ }
+ }
+ result_file << "\t]\n";
+
+ result_file << "}\n";
+}
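+// Shape of the emitted JSON, with placeholder values (illustrative only):
+//   {
+//     "batch_size": 1, "niter": 1, "nireq": 1, "groundtruth_loc": "...",
+//     "input_info": [{"name": "...", "shape": "...", "scale_values": "[...]",
+//                     "mean_values": "[...]", "layout": "..."}],
+//     "output_info": [{"name": "...", "shape": "..."}],
+//     "input_files": ["..."]
+//   }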
diff --git a/python/openvino/runtime/dla_benchmark/utils.hpp b/python/openvino/runtime/dla_benchmark/utils.hpp
new file mode 100644
index 0000000..5ca7834
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/utils.hpp
@@ -0,0 +1,249 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Utility functions handling command line arguments and network input info for DLA's runtime.
+//              Loosely based on OpenVINO's benchmark_app/utils.hpp
+//              [openvinotoolkit/openvino › samples/cpp/benchmark_app/utils.hpp]
+//              Future OpenVINO uplifts should refer to the file listed above.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "dla_runtime_log.h"
+
+#define MULTIGRAPH_SEP ',' /* separator used on the command line when multigraph is activated */
+
+// Constants
+constexpr size_t BYTE_TO_MEGABYTE = 1024 * 1024;
+constexpr size_t MAX_COUT_WITHOUT_VERBOSE = 20; // How many couts can be printed w/o VERBOSE=1
+
+typedef std::chrono::high_resolution_clock Time;
+typedef std::chrono::nanoseconds ns;
+
+#ifdef USE_OPENCV
+// This is the full list of image extensions supported by the opencv reader:
+// "bmp", "dib", "jpeg", "jpg", "jpe",
+// "jp2", "png", "pbm", "pgm", "ppm",
+// "sr", "ras", "tiff", "tif"
+// However, only the extensions in the list below have been
+// tested using the synthetic graphs infrastructure.
+// For jpeg, jpg, and jpe, only very-high-quality images
+// with subsampling disabled were tested.
+// TODO(meldafra): Check why the remaining extensions are not passing and fix them
+static const std::vector<std::string> supported_image_extensions = {
+ "bmp", "png", "pbm", "pgm", "ppm", "jpeg", "jpg", "jpe"};
+
+#else
+static const std::vector<std::string> supported_image_extensions = {"bmp"};
+#endif
+static const std::vector<std::string> supported_binary_extensions = {"bin"};
+static const std::vector<std::string> supported_video_extensions = {"mp4", "gif"};
+
+/**
+ * @brief Namespace dla_benchmark contains utility functions for working with network inputs.
+ */
+namespace dla_benchmark {
+struct InputInfo {
+ ov::element::Type type;
+ ov::PartialShape partial_shape;
+ ov::Shape data_shape;
+ ov::Layout layout;
+ std::vector<float> scale_values;
+ std::vector<float> mean_values;
+ bool IsImage() const;
+ bool IsImageInfo() const;
+ bool IsVideo() const;
+ size_t GetWidth() const;
+ size_t GetHeight() const;
+ size_t GetChannels() const;
+ size_t GetBatch() const;
+ size_t GetDepth() const;
+};
+
+struct OutputInfo {
+ std::string name;
+ ov::Shape shape;
+};
+
+using InputsInfo = std::map<std::string, InputInfo>;
+using OutputsInfoVec = std::vector<OutputInfo>;
+using PartialShapes = std::map<std::string, ngraph::PartialShape>;
+
+struct InferenceMetaData {
+  std::vector<std::string> input_files;  // Input files used for inferencing
+ std::string groundtruth_loc; // The directory that contains the groundtruth files
+ unsigned int batch_size; // The batch size used in the inference
+ unsigned int niter; // The number of iterations set by -niter in dla_benchmark
+ unsigned int nireq; // The number of inference requests set by -nireq in dla_benchmark
+ dla_benchmark::InputsInfo model_input_info; // the metadata of the model input
+ dla_benchmark::OutputsInfoVec model_output_info; // the metadata of the model output
+};
+} // namespace dla_benchmark
+
+/**
+ * @brief Parses number of streams for each device from a string argument.
+ *
+ * @param devices Vector of supported DLA devices, e.g., FPGA, CPU
+ * @param values_string string arg of the format: <device1>:<value1>,<device2>:<value2>
+ * @return A map of device : number of streams
+ */
+std::map<std::string, uint32_t> ParseNStreamsValuePerDevice(const std::vector<std::string>& devices,
+ const std::string& values_string);
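+// Illustrative example (hypothetical values): with devices {"FPGA", "CPU"} and the string
+// "FPGA:2,CPU:4", this is expected to return {{"FPGA", 2}, {"CPU", 4}}.
+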
+/**
+ * @brief Splits a string into substrings using a specified delimiter.
+ *
+ * @param s The input string to be split.
+ * @param delim The delimiter character used to separate the substrings.
+ * @return A vector of strings containing the substrings from the input string.
+ */
+std::vector<std::string> split(const std::string& s, char delim);
+
+/**
+ * @brief Splits a string of floats into floats using a specified delimiter.
+ *
+ * @param s The input string to be split.
+ * @param delim The delimiter character used to separate the floats.
+ * @return A vector of floats containing the floats from the input string.
+ */
+std::vector<float> SplitFloat(const std::string& s, char delim);
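+// Illustrative examples (hypothetical values): split("FPGA,CPU", ',') yields {"FPGA", "CPU"};
+// SplitFloat("127.5,127.5,127.5", ',') yields {127.5f, 127.5f, 127.5f}.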
+
+// To enable multigraph operations on all CNNNetworks, the inputs vector is taken as mutable
+template <typename T, typename S, typename Functor>
+inline std::vector<T> VectorMap(std::vector<S>& inputs, Functor fn) {
+ std::vector<T> results;
+ for (auto& input : inputs) results.push_back(fn(input));
+ return results;
+}
+
+// Overload that supports temporary objects and const inputs
+template <typename T, typename S, typename Functor>
+inline std::vector<T> VectorMap(const std::vector<S>& inputs, Functor fn) {
+ std::vector<T> results;
+ for (auto& input : inputs) results.push_back(fn(input));
+ return results;
+}
+
+template <typename T, typename S, typename Functor>
+inline std::vector<T> vectorMapWithIndex(const std::vector<S>& inputs, Functor fn) {
+ std::vector<T> results;
+ uint32_t index = 0;
+ for (auto& input : inputs) results.push_back(fn(input, index++));
+ return results;
+}
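+// Illustrative usage sketch (hypothetical): mapping compiled models to their input counts,
+//   std::vector<size_t> n_inputs = VectorMap<size_t>(
+//       models, [](const ov::CompiledModel& m) { return m.inputs().size(); });
+// Only the result type T is spelled out; S and Functor are deduced from the arguments.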
+
+/**
+ * @brief Splits command-line input arguments containing multiple image file paths
+ * into separate vectors based on a specified separator.
+ * Modified from parseInputFilesArguments() in [openvinotoolkit/openvino ›
+ * inference-engine/samples/common/utils/src/args_helper.cpp]
+ *
+ * @param net_size The number of networks (multigraph functionality).
+ * @return A vector of vectors, where each inner vector contains image file paths
+ * corresponding to a specific network graph.
+ */
+std::vector<std::vector<std::string>> SplitMultiInputFilesArguments(size_t net_size);
+
+/**
+ * @brief Returns the stem of a file path.
+ *
+ * The stem is the base name of the file without its extension. This function
+ * takes a file path as input and extracts the stem, which is the part of the
+ * file name before the last period ('.') character.
+ *
+ * @param path The input file path.
+ * @return The stem of the file, excluding the extension.
+ */
+std::string GetStem(std::string path);
+
+/**
+ * @brief Extracts the file extension from a given file name.
+ *
+ * @param name The file name from which to extract the extension.
+ * @return The file extension as a string, or an empty string if no extension is found.
+ */
+std::string GetExtension(const std::string& name);
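+// Illustrative examples (hypothetical paths): GetStem("/data/images/cat.jpg") is expected to
+// return "cat", while GetExtension("cat.jpg") returns "jpg" and GetExtension("cat") returns "".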
+
+/**
+ * @brief Parses a list of devices from a string
+ *
+ * @param device_string The input string to be split. The delimiter is ':'
+ * @return A vector of strings containing the devices
+ */
+std::vector<std::string> ParseDevices(const std::string& device_string);
+
+/**
+ * @brief Gets information about a network's inputs.
+ *
+ * Reads all input nodes from a network, determines tensor layout, shapes, precision, etc.
+ * Saves into dla_benchmark::InputsInfo, which maps each input name to its input info struct.
+ *
+ * @param batch_size Network batch size from the user via the batch size flag
+ * @param input_info Vector of input nodes. Obtained from ov::Model.inputs() or ov::CompiledModel.inputs()
+ * @param reshape_required boolean flag indicating that the model needs to be reshaped according to the batch size flag
+ * @param is_binary_data User flag indicating that the data is binary data and not image data
+ * @param mean_string CLI arg specifying image mean value. Example: input[255,255,255]. (Optional)
+ * @param scale_string CLI arg specifying image scale value. Example: input[255,255,255]. (Optional)
+ * @return dla_benchmark::InputsInfo, which maps each input name to its respective input information
+ */
+dla_benchmark::InputsInfo GetInputsInfo(const size_t batch_size,
+ const std::vector<ov::Output<const ov::Node>>& input_info,
+ bool& reshape_required,
+ const bool is_binary_data,
+ const std::string& mean_string,
+ const std::string& scale_string);
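+// Illustrative call (hypothetical input name "data"): passing mean_string =
+// "data[123.675,116.28,103.53]" and scale_string = "data[58.395,57.12,57.375]" fills
+// InputInfo::mean_values and InputInfo::scale_values for the "data" input.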
+
+/**
+ * @brief Gets information about a network's inputs.
+ *
+ * Reads all input nodes from a network, determines tensor layout, shapes, precision, etc.
+ * Saves into dla_benchmark::InputsInfo, which maps each input name to its input info struct.
+ * Used in the AOT flow where reshaping is not required (handled by the compiler).
+ *
+ * @param batch_size Network batch size from the user via the batch size flag
+ * @param input_info Vector of input nodes. Obtained from ov::Model.inputs() or ov::CompiledModel.inputs()
+ * @param is_binary_data User flag indicating that the data is binary data and not image data
+ * @return dla_benchmark::InputsInfo, which maps each input name to its respective input information
+ */
+dla_benchmark::InputsInfo GetInputsInfo(const size_t batch_size,
+ const std::vector<ov::Output<const ov::Node>>& input_info,
+                                        const bool is_binary_data);
+
+/**
+ * @brief Filters a list of file paths by specified file extensions (case insensitive).
+ *
+ * @param file_paths A vector of file paths to be filtered.
+ * @param extensions A vector of file extensions to filter by.
+ * @return A vector of filtered file paths that match the specified extensions.
+ */
+std::vector<std::string> FilterFilesByExtensions(const std::vector<std::string>& file_paths,
+ const std::vector<std::string>& extensions);
+
+// Helper function to dump result.txt with tensor indices
+void DumpResultTxtFile(const ov::Tensor& output_tensor,
+ const ov::Output<const ov::Node>& output_node,
+ const unsigned int output_size,
+ std::ofstream& result_file);
+
+// Helper function to dump the output tensor as binaries in result.bin
+void DumpResultBinFile(const ov::Tensor& output_tensor,
+ std::ofstream& result_file);
+
+// Helper function to dump the inference metadata into result_meta.json
+void DumpResultMetaJSONFile(const dla_benchmark::InferenceMetaData& metadata,
+ std::ofstream& result_file);
+
+/**
+ * @brief Gets the appropriate DLA-supported tensor layout from a node.
+ *
+ * @param node Node to determine the tensor layout. Obtained from ov::Model.inputs()/outputs()
+ * or ov::CompiledModel.inputs()/outputs()
+ * @param allow_partial_defined Whether to allow a partially defined layout. When set to true, DLA tolerates
+ *              dumping custom layouts, e.g., when the shape has rank 3. The layout will then have ? in
+ *              all dimensions, e.g., [???].
+ *              This param should ONLY be used when dumping graph outputs with irregular layouts.
+ * @return OpenVINO's tensor layout object.
+ */
+ov::Layout GetTensorLayout(const ov::Output<ov::Node>& node, const bool allow_partial_defined = false);